def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets.

    `params`, `example_decoder`, `anchor_labeler`, `preprocessor` and
    `_normalize_image` are free variables resolved from the surrounding scope.

    Args:
        value: the serialized example handed to `example_decoder.decode`.

    Returns:
        A tuple `(image, cls_targets, box_targets, num_positives, source_id,
        image_scale)`:
          * image: float32 image, normalized, optionally flipped, resized and
            then padded to `params['image_size']` x `params['image_size']`
            with the content anchored at offset (0, 0).
          * cls_targets, box_targets, num_positives: the outputs of
            `anchor_labeler.label_anchors(boxes, classes)`.
          * source_id: the example's source id converted to a float32 number.
          * image_scale: original image height divided by resized height.
    """
    with tf.name_scope('parser'):
        data = example_decoder.decode(value)
        source_id = data['source_id']
        image = data['image']
        boxes = data['groundtruth_boxes']
        classes = data['groundtruth_classes']
        # Classes become a float32 column vector of shape [num_boxes, 1].
        classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])

        # The image normalization is identical to Cloud TPU ResNet-50.
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image = _normalize_image(image)

        if params['input_rand_hflip']:
            image, boxes = preprocessor.random_horizontal_flip(
                image, boxes=boxes)

        # Remember the pre-resize shape so the scale factor can be reported.
        image_original_shape = tf.shape(image)
        image, _ = preprocessor.resize_to_range(
            image,
            min_dimension=params['image_size'],
            max_dimension=params['image_size'])
        # tf.cast replaces the deprecated tf.to_float; the result is the same.
        image_scale = (
            tf.cast(image_original_shape[0], dtype=tf.float32) /
            tf.cast(tf.shape(image)[0], dtype=tf.float32))
        image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
            image, boxes, keypoints=None)

        image = tf.image.pad_to_bounding_box(
            image, 0, 0, params['image_size'], params['image_size'])
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(boxes, classes)

        source_id = tf.string_to_number(source_id, out_type=tf.float32)
        row = (image, cls_targets, box_targets, num_positives, source_id,
               image_scale)
        return row
def _preprocess(example):
    """Decode one serialized example and build fixed-size model inputs.

    `is_training`, `params`, `example_decoder`, `preprocessor` and
    `pad_to_fixed_size` are resolved from the surrounding scope.

    Args:
        example: a string tensor holding serialized tf example proto.

    Returns:
        A tuple `(image, boxes, labels, image_shape)` where `image` has been
        resized to `params["image_size"]` and rescaled with `(x - 0.5) * 2`,
        `boxes` / `labels` are padded with -1 to `params["max_instance"]`
        rows, and `image_shape` stacks the decoded `[height, width]`.
    """
    decoded = example_decoder.decode(example)
    image = decoded["image"]
    boxes = decoded["groundtruth_boxes"]
    labels = decoded["groundtruth_classes"]
    width = decoded["image/width"]
    height = decoded["image/height"]

    if not is_training:
        # Evaluation: only the dtype conversion, no augmentation.
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
    else:
        # Training: colour jitter on the decoded image, then dtype
        # conversion, then the geometric augmentations that also move boxes.
        image = preprocessor.random_adjust_brightness(image)
        image = preprocessor.random_adjust_saturation(image)
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image, boxes = preprocessor.random_horizontal_flip(image, boxes)
        image, boxes, labels = preprocessor.random_crop_image(
            image, boxes, labels)

    # Resize to the configured input size, then shift/scale pixel values.
    image = tf.image.resize_images(image, size=params["image_size"])
    image = (image - 0.5) * 2

    # Pad the variable-length annotations to a fixed number of instances.
    max_instance = params["max_instance"]
    boxes = pad_to_fixed_size(boxes, [max_instance, 4], -1)
    labels = pad_to_fixed_size(labels, [params["max_instance"], 1], -1)

    image_shape = tf.stack([height, width], axis=0)
    return (image, boxes, labels, image_shape)
def random_horizontal_flip(self):
    """Randomly flip the stored image, boxes and (when present) masks.

    Updates `self._image` and `self._boxes` in place from the first two
    entries of the preprocessor result; `self._masks` is only replaced when
    it was not None to begin with.
    """
    flipped = preprocessor.random_horizontal_flip(
        self._image, boxes=self._boxes, masks=self._masks)
    self._image = flipped[0]
    self._boxes = flipped[1]
    if self._masks is None:
        # No masks were supplied, so there is no third entry to read back.
        return
    self._masks = flipped[2]
def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets.

    `params`, `example_decoder`, `anchor_labeler`, `preprocessor` and
    `_normalize_image` are free variables resolved from the surrounding scope.

    Args:
        value: the serialized example handed to `example_decoder.decode`.

    Returns:
        A tuple `(image, cls_targets, box_targets, num_positives, source_id,
        image_scale)`:
          * image: normalized image, optionally flipped, resized and then
            padded to `params['image_size']` x `params['image_size']` with
            the content anchored at offset (0, 0); cast to bfloat16 when
            `params['use_bfloat16']` is set, float32 otherwise.
          * cls_targets, box_targets, num_positives: the outputs of
            `anchor_labeler.label_anchors(boxes, classes)`.
          * source_id: the example's source id converted to a float32 number.
          * image_scale: original image height divided by resized height.
    """
    with tf.name_scope('parser'):
        data = example_decoder.decode(value)
        # NOTE: source_id is converted with tf.string_to_number below, so it
        # is expected to be a numeric string. For datasets such as xView,
        # whose ids look like "122.tif", the extension has to be stripped
        # before that conversion.
        source_id = data['source_id']
        image = data['image']
        boxes = data['groundtruth_boxes']
        classes = data['groundtruth_classes']
        # Classes become a float32 column vector of shape [num_boxes, 1].
        classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])

        # Handle crowd annotations. As crowd annotations are not large
        # instances, the model ignores them in training.
        if params['skip_crowd']:
            indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
            classes = tf.gather_nd(classes, indices)
            boxes = tf.gather_nd(boxes, indices)

        # The image normalization is identical to Cloud TPU ResNet-50.
        image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        image = _normalize_image(image)

        if params['input_rand_hflip']:
            image, boxes = preprocessor.random_horizontal_flip(
                image, boxes=boxes)

        # Remember the pre-resize shape so the scale factor can be reported.
        image_original_shape = tf.shape(image)
        image, _ = preprocessor.resize_to_range(
            image,
            min_dimension=params['image_size'],
            max_dimension=params['image_size'])
        # tf.cast replaces the deprecated tf.to_float; the result is the same.
        image_scale = (
            tf.cast(image_original_shape[0], dtype=tf.float32) /
            tf.cast(tf.shape(image)[0], dtype=tf.float32))
        image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
            image, boxes, keypoints=None)

        image = tf.image.pad_to_bounding_box(
            image, 0, 0, params['image_size'], params['image_size'])
        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(boxes, classes)

        source_id = tf.string_to_number(source_id, out_type=tf.float32)

        if params['use_bfloat16']:
            image = tf.cast(image, dtype=tf.bfloat16)

        row = (image, cls_targets, box_targets, num_positives, source_id,
               image_scale)
        return row
def random_horizontal_flip(image, boxes=None, masks=None):
    """Randomly flip the image, boxes, and masks horizontally.

    Thin wrapper that forwards its arguments positionally to
    `preprocessor.random_horizontal_flip` and returns that call's result
    unchanged.

    Args:
      image: a tensor of shape [height, width, 3] representing the image.
      boxes: (Optional) a tensor of shape [num_boxes, 4] representing the box
        corners in normalized coordinates.
      masks: (Optional) a tensor of shape [num_masks, height, width]
        representing the object masks. Note that the size of the mask is the
        same as the image.

    Returns:
      image: the processed image tensor after being randomly flipped.
      boxes: None or the processed box tensor after being randomly flipped.
      masks: None or the processed mask tensor after being randomly flipped.
      NOTE(review): the exact tuple layout is whatever the underlying
      preprocessor returns; confirm how omitted inputs are represented.
    """
    flipped = preprocessor.random_horizontal_flip(image, boxes, masks)
    return flipped
def random_horizontal_flip(self):
    """Randomly flip the stored image and its bounding boxes.

    Replaces `self._image` and `self._boxes` with the pair returned by
    `preprocessor.random_horizontal_flip`.
    """
    flipped = preprocessor.random_horizontal_flip(
        self._image, boxes=self._boxes)
    self._image, self._boxes = flipped
def random_horizontal_flip(self):
    """Randomly flip the stored image and its segmentation label.

    The label is handed to the preprocessor through its `masks` argument, so
    a leading axis is added before the call and removed again afterwards.
    """
    # Add a leading axis so the label can be passed as `masks`.
    self._label = tf.expand_dims(self._label, axis=0)
    flipped = preprocessor.random_horizontal_flip(
        self._image, masks=self._label)
    self._image, self._label = flipped
    # Drop the leading axis that was added above.
    self._label = self._label[0, :, :]
def _parse_example(data):
    """Turn one decoded example into model inputs.

    `self`, `params` and the helper functions are resolved from the
    surrounding scope.

    Args:
        data: mapping with at least 'source_id', 'image',
            'groundtruth_boxes' and 'groundtruth_classes'.

    Returns:
        When `self._is_training` is true: `(image, labels)` where `labels`
        holds the encoded boxes/classes and the number of matched boxes
        (plus 'decoded_boxes' when `params['visualize_dataloader']` is set).
        Otherwise: a dict with the resized image, boxes and classes
        trimmed/padded to `ssd_constants.MAX_NUM_EVAL_BOXES` rows, the
        numeric source id, and the raw image shape.
    """
    with tf.name_scope('augmentation'):
        example_id = data['source_id']
        image = tf.image.convert_image_dtype(data['image'], dtype=tf.float32)
        original_shape = tf.shape(image)
        boxes = data['groundtruth_boxes']
        classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

        # Only 80 of the 90 COCO classes are used.
        class_lookup = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
        classes = tf.gather(class_lookup, classes)
        classes = tf.cast(classes, dtype=tf.float32)

        if self._is_training:
            image, boxes, classes = ssd_crop(image, boxes, classes)
            # random_horizontal_flip() is hard coded to flip with 50% chance.
            mlperf_log.ssd_print(
                key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
            image, boxes = preprocessor.random_horizontal_flip(
                image=image, boxes=boxes)

            # TODO(shibow): Investigate the parameters for color jitter.
            image = color_jitter(
                image,
                brightness=0.125,
                contrast=0.5,
                saturation=0.5,
                hue=0.05)
            image = normalize_image(image)

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            encoded = encode_labels(boxes, classes)
            encoded_classes, encoded_boxes, num_matched_boxes = encoded

            # TODO(taylorrobie): Check that this cast is valid.
            encoded_classes = tf.cast(encoded_classes, tf.int32)

            labels = {
                ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                ssd_constants.BOXES: encoded_boxes,
                ssd_constants.CLASSES: encoded_classes,
            }

            # This is for dataloader visualization; actual model doesn't use
            # this.
            if params['visualize_dataloader']:
                box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                    scale_factors=ssd_constants.BOX_CODER_SCALES)
                rel_codes = tf.squeeze(encoded_boxes)
                anchors = box_list.BoxList(
                    tf.convert_to_tensor(DefaultBoxes()('ltrb')))
                decoded = box_coder.decode(
                    rel_codes=rel_codes, anchors=anchors).get()
                decoded_boxes = tf.expand_dims(decoded, axis=0)
                labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

            return image, labels

        # Evaluation path: resize only, no augmentation.
        mlperf_log.ssd_print(
            key=mlperf_log.INPUT_SIZE, value=ssd_constants.IMAGE_SIZE)
        # Resize through a temporary leading axis, then strip it again.
        batched = image[tf.newaxis, :, :, :]
        resized = tf.image.resize_images(
            batched,
            size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
        image = resized[0, :, :, :]

        image = normalize_image(image)

        if params['use_bfloat16']:
            image = tf.cast(image, dtype=tf.bfloat16)

        def trim_and_pad(inp_tensor, dim_1):
            """Limit the number of boxes, and pad if necessary."""
            limit = ssd_constants.MAX_NUM_EVAL_BOXES
            inp_tensor = inp_tensor[:limit]
            num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(
                inp_tensor)[0]
            inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
            return tf.reshape(
                inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])

        boxes = trim_and_pad(boxes, 4)
        classes = trim_and_pad(classes, 1)

        return {
            ssd_constants.IMAGE: image,
            ssd_constants.BOXES: boxes,
            ssd_constants.CLASSES: classes,
            ssd_constants.SOURCE_ID: tf.string_to_number(
                example_id, tf.int32),
            ssd_constants.RAW_SHAPE: original_shape,
        }
def _parse_example(data):
    """Turn one decoded example into model inputs.

    `self`, `params` and the helper functions are resolved from the
    surrounding scope.

    Args:
        data: mapping with at least 'source_id', 'image',
            'groundtruth_boxes' and 'groundtruth_classes' (and
            `ssd_constants.IS_PADDED` when padded eval samples are in use).

    Returns:
        When `self._is_training` is true: `(image, labels)` where `labels`
        holds the encoded boxes/classes and the number of matched boxes
        (plus 'decoded_boxes' when `params['visualize_dataloader']` is set).
        Otherwise: a dict with the resized image, boxes and classes
        trimmed/padded to `ssd_constants.MAX_NUM_EVAL_BOXES` rows, the
        numeric source id, the raw image shape, and, once `self._count`
        exceeds `params['eval_samples']`, the IS_PADDED flag.
    """
    with tf.name_scope('augmentation'):
        example_id = data['source_id']
        image = data['image']  # dtype uint8
        original_shape = tf.shape(image)
        boxes = data['groundtruth_boxes']
        classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

        # Only 80 of the 90 COCO classes are used.
        class_lookup = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
        classes = tf.gather(class_lookup, classes)
        classes = tf.cast(classes, dtype=tf.float32)

        if self._is_training:
            image, boxes, classes = ssd_crop(image, boxes, classes)
            # ssd_crop resizes and returns image of dtype float32 and does
            # not change its range (i.e., value in between 0--255). Divide by
            # 255. converts it to [0, 1] range. Not doing this before
            # cropping to avoid dtype cast (which incurs additional memory
            # copy).
            image /= 255.0

            # random_horizontal_flip() is hard coded to flip with 50% chance.
            image, boxes = preprocessor.random_horizontal_flip(
                image=image, boxes=boxes)

            # TODO(shibow): Investigate the parameters for color jitter.
            image = color_jitter(
                image,
                brightness=0.125,
                contrast=0.5,
                saturation=0.5,
                hue=0.05)

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            encoded = encode_labels(boxes, classes)
            encoded_classes, encoded_boxes, num_matched_boxes = encoded

            # TODO(taylorrobie): Check that this cast is valid.
            encoded_classes = tf.cast(encoded_classes, tf.int32)

            labels = {
                ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                ssd_constants.BOXES: encoded_boxes,
                ssd_constants.CLASSES: tf.squeeze(encoded_classes, axis=1),
            }

            # This is for dataloader visualization; actual model doesn't use
            # this.
            if params['visualize_dataloader']:
                box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                    scale_factors=ssd_constants.BOX_CODER_SCALES)
                rel_codes = tf.squeeze(encoded_boxes)
                anchors = box_list.BoxList(
                    tf.convert_to_tensor(DefaultBoxes()('ltrb')))
                decoded = box_coder.decode(
                    rel_codes=rel_codes, anchors=anchors).get()
                decoded_boxes = tf.expand_dims(decoded, axis=0)
                labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

            return image, labels
        else:
            target_size = (ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE)
            image = tf.image.resize_images(image, size=target_size)
            # resize_image returns image of dtype float32 and does not change
            # its range. Divide by 255 to convert image to [0, 1] range.
            image /= 255.

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            def trim_and_pad(inp_tensor, dim_1):
                """Limit the number of boxes, and pad if necessary."""
                limit = ssd_constants.MAX_NUM_EVAL_BOXES
                inp_tensor = inp_tensor[:limit]
                num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(
                    inp_tensor)[0]
                inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
                return tf.reshape(
                    inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])

            boxes = trim_and_pad(boxes, 4)
            classes = trim_and_pad(classes, 1)

            sample = {
                ssd_constants.IMAGE: image,
                ssd_constants.BOXES: boxes,
                ssd_constants.CLASSES: classes,
                ssd_constants.SOURCE_ID: tf.string_to_number(
                    example_id, tf.int32),
                ssd_constants.RAW_SHAPE: original_shape,
            }

            # Forward the padding marker only once the sample counter has
            # passed the configured number of eval samples.
            if not self._is_training and self._count > params[
                    'eval_samples']:
                sample[ssd_constants.IS_PADDED] = data[
                    ssd_constants.IS_PADDED]
            return sample
def random_horizontal_flip(self):
    """Randomly flip the stored image, bounding boxes and label together.

    The label is handed to the preprocessor through its `masks` argument, so
    a leading axis is added before the call and removed again afterwards.
    """
    # Add a leading axis so the label can be passed as `masks`.
    self._label = tf.expand_dims(self._label, axis=0)
    flipped = preprocessor.random_horizontal_flip(
        self._image, boxes=self._boxes, masks=self._label)
    self._image, self._boxes, self._label = flipped
    # Drop the leading axis that was added above.
    self._label = self._label[0, :, :]