def extract_images_and_targets(read_data):
  """Extract images and targets from the input dict."""
  image = read_data[fields.InputDataFields.image]
  key = ''
  if fields.InputDataFields.source_id in read_data:
    key = read_data[fields.InputDataFields.source_id]
  location_gt = read_data[fields.InputDataFields.groundtruth_boxes]
  classes_gt = tf.cast(read_data[fields.InputDataFields.groundtruth_classes],
                       tf.int32)
  classes_gt -= label_id_offset

  if merge_multiple_label_boxes and use_multiclass_scores:
    raise ValueError(
        'Using both merge_multiple_label_boxes and use_multiclass_scores is '
        'not supported')
  if merge_multiple_label_boxes:
    location_gt, classes_gt, _ = util_ops.merge_boxes_with_multiple_labels(
        location_gt, classes_gt, num_classes)
  elif use_multiclass_scores:
    classes_gt = tf.cast(read_data[fields.InputDataFields.multiclass_scores],
                         tf.float32)
  else:
    classes_gt = util_ops.padded_one_hot_encoding(
        indices=classes_gt, depth=num_classes, left_pad=0)
  masks_gt = read_data.get(fields.InputDataFields.groundtruth_instance_masks)
  keypoints_gt = read_data.get(fields.InputDataFields.groundtruth_keypoints)
  if (merge_multiple_label_boxes and
      (masks_gt is not None or keypoints_gt is not None)):
    raise NotImplementedError('Multi-label support is only for boxes.')
  weights_gt = read_data.get(fields.InputDataFields.groundtruth_weights)
  return (image, key, location_gt, classes_gt, masks_gt, keypoints_gt,
          weights_gt)
def extract_images_and_targets(read_data):
  """Extract images and targets from the input dict."""
  image = read_data[fields.InputDataFields.image]
  key = ''
  if fields.InputDataFields.source_id in read_data:
    key = read_data[fields.InputDataFields.source_id]
  location_gt = read_data[fields.InputDataFields.groundtruth_boxes]
  classes_gt = tf.cast(
      read_data[fields.InputDataFields.groundtruth_classes], tf.int32)
  classes_gt -= label_id_offset
  if merge_multiple_label_boxes:
    location_gt, classes_gt, _ = util_ops.merge_boxes_with_multiple_labels(
        location_gt, classes_gt, num_classes)
  else:
    classes_gt = util_ops.padded_one_hot_encoding(
        indices=classes_gt, depth=num_classes, left_pad=0)
  masks_gt = read_data.get(fields.InputDataFields.groundtruth_instance_masks)
  keypoints_gt = read_data.get(fields.InputDataFields.groundtruth_keypoints)
  if (merge_multiple_label_boxes and
      (masks_gt is not None or keypoints_gt is not None)):
    raise NotImplementedError('Multi-label support is only for boxes.')
  return image, key, location_gt, classes_gt, masks_gt, keypoints_gt
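# A minimal sketch contrasting the two label-encoding paths taken above.
# The concrete values mirror the testMergeBoxesWithMultipleLabels case
# further down; the imports assume the TF Object Detection API layout.
import tensorflow as tf
from object_detection.utils import ops as util_ops

boxes = tf.constant([[0.25, 0.25, 0.75, 0.75],
                     [0.0, 0.0, 0.5, 0.75],
                     [0.25, 0.25, 0.75, 0.75]], dtype=tf.float32)
class_indices = tf.constant([0, 4, 2], dtype=tf.int32)

# Path 1: identical boxes are collapsed and their labels combined k-hot.
merged_boxes, merged_classes, _ = util_ops.merge_boxes_with_multiple_labels(
    boxes, class_indices, 5)
# merged_classes -> [[1, 0, 1, 0, 0], [0, 0, 0, 0, 1]] (row order may vary).

# Path 2: plain one-hot encoding, one row per box, no merging.
one_hot = util_ops.padded_one_hot_encoding(
    indices=class_indices, depth=5, left_pad=0)
# one_hot -> [[1, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 1, 0, 0]]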
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False):
  # if fields.InputDataFields.groundtruth_boxes in tensor_dict:
  #   tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
  #       tensor_dict)
  if fields.InputDataFields.image_additional_channels in tensor_dict:
    channels = tensor_dict[fields.InputDataFields.image_additional_channels]
    tensor_dict[fields.InputDataFields.image] = tf.concat(
        [tensor_dict[fields.InputDataFields.image], channels], axis=2)

  # Apply data augmentation ops.
  if data_augmentation_fn is not None:
    tensor_dict = data_augmentation_fn(tensor_dict)

  # Apply model preprocessing ops and resize instance masks.
  query = tensor_dict['query']
  preprocessed_resized_image, true_image_shape = resize_image(
      query, new_height=FLAGS.im_size, new_width=FLAGS.im_size)
  tensor_dict['query'] = preprocessed_resized_image
  tensor_dict['query_shape'] = true_image_shape

  ref = tensor_dict['ref']
  preprocessed_resized_image, true_image_shape = resize_image(
      ref, new_height=FLAGS.im_size, new_width=FLAGS.im_size)
  tensor_dict['ref'] = preprocessed_resized_image
  tensor_dict[fields.InputDataFields.true_image_shape] = true_image_shape

  if retain_original_image:
    tensor_dict[fields.InputDataFields.original_image] = (
        tf.image.convert_image_dtype(
            tensor_dict['ref'][0] / 2 + 0.5, tf.uint8))

  # Transform groundtruth classes to one hot encodings.
  zero_indexed_groundtruth_classes = tensor_dict[
      fields.InputDataFields.groundtruth_classes]
  tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot(
      zero_indexed_groundtruth_classes, num_classes)

  if merge_multiple_boxes:
    merged_boxes, merged_classes, _ = util_ops.merge_boxes_with_multiple_labels(
        tensor_dict[fields.InputDataFields.groundtruth_boxes],
        zero_indexed_groundtruth_classes, num_classes)
    merged_classes = tf.cast(merged_classes, tf.float32)
    tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes
    tensor_dict[fields.InputDataFields.groundtruth_classes] = merged_classes

  return tensor_dict
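# `resize_image` and `FLAGS.im_size` are not defined in this snippet. A
# minimal sketch of what such a helper could look like in TF1, assuming it
# only needs to resize a 3-D image and report the resulting true shape (the
# original helper may well differ):
import tensorflow as tf

def resize_image(image, new_height, new_width):
  """Hypothetical resizer matching the call above: resizes a
  [height, width, channels] image and returns it with its true shape."""
  resized = tf.image.resize_images(image, [new_height, new_width])
  true_shape = tf.stack([new_height, new_width, tf.shape(image)[2]])
  return resized, true_shape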
def testMergeBoxesWithEmptyInputs(self):
  # Empty inputs are a well-formed [0, 4] boxes tensor and an empty int32
  # class vector.
  boxes = tf.zeros([0, 4], dtype=tf.float32)
  class_indices = tf.constant([], dtype=tf.int32)
  num_classes = 5
  merged_boxes, merged_classes, merged_box_indices = (
      ops.merge_boxes_with_multiple_labels(boxes, class_indices, num_classes))
  with self.test_session() as sess:
    np_merged_boxes, np_merged_classes, np_merged_box_indices = sess.run(
        [merged_boxes, merged_classes, merged_box_indices])
    self.assertAllEqual(np_merged_boxes.shape, [0, 4])
    self.assertAllEqual(np_merged_classes.shape, [0, 5])
    self.assertAllEqual(np_merged_box_indices.shape, [0])
def extract_images_and_targets(read_data):
  """Extract images and targets from the input dict."""
  suffix = 0
  images = []
  keys = []
  locations = []
  classes = []
  masks = []
  keypoints = []
  while fields.InputDataFields.image + str(suffix) in read_data:
    image = read_data[fields.InputDataFields.image + str(suffix)]
    key = ''
    # Check for the suffixed source_id key, since that is the key read below.
    if fields.InputDataFields.source_id + str(suffix) in read_data:
      key = read_data[fields.InputDataFields.source_id + str(suffix)]
    location_gt = read_data[
        fields.InputDataFields.groundtruth_boxes + str(suffix)]
    classes_gt = tf.cast(
        read_data[fields.InputDataFields.groundtruth_classes + str(suffix)],
        tf.int32)
    classes_gt -= label_id_offset
    masks_gt = read_data.get(
        fields.InputDataFields.groundtruth_instance_masks + str(suffix))
    keypoints_gt = read_data.get(
        fields.InputDataFields.groundtruth_keypoints + str(suffix))
    if merge_multiple_label_boxes:
      location_gt, classes_gt, _ = util_ops.merge_boxes_with_multiple_labels(
          location_gt, classes_gt, num_classes)
    else:
      classes_gt = util_ops.padded_one_hot_encoding(
          indices=classes_gt, depth=num_classes, left_pad=0)
    # Batch read input data and groundtruth. Images, locations and classes
    # should by default have the same number of items.
    images.append(image)
    keys.append(key)
    locations.append(location_gt)
    classes.append(classes_gt)
    masks.append(masks_gt)
    keypoints.append(keypoints_gt)
    suffix += 1
  return (images, keys, locations, classes, masks, keypoints)
def extract_images_and_targets(read_data):
  """Extract images and targets from the input dict."""
  image = read_data[fields.InputDataFields.image]
  key = ''
  if fields.InputDataFields.source_id in read_data:
    key = read_data[fields.InputDataFields.source_id]
  location_gt = read_data[fields.InputDataFields.groundtruth_boxes]
  classes_gt = tf.cast(read_data[fields.InputDataFields.groundtruth_classes],
                       tf.int32)
  classes_gt -= label_id_offset
  # Image-level classes have no background class, so their ids also start
  # from 1 and need the same offset.
  classes_in_image_level_gt = tf.cast(
      read_data[fields.InputDataFields.groundtruth_image_classes], tf.int32)
  classes_in_image_level_gt -= label_id_offset
  # Audio.
  audio = read_data[fields.InputDataFields.audio]
  if merge_multiple_label_boxes:
    location_gt, classes_gt, _ = util_ops.merge_boxes_with_multiple_labels(
        location_gt, classes_gt, num_classes)
  else:
    classes_gt = util_ops.padded_one_hot_encoding(
        indices=classes_gt, depth=num_classes, left_pad=0)
  # Multi-label classification, so we need a k-hot encoding.
  classes_in_image_level_gt = util_ops.padded_one_hot_encoding(
      indices=classes_in_image_level_gt,
      depth=num_classes_in_image_level,
      left_pad=0)
  classes_in_image_level_gt = tf.reduce_sum(classes_in_image_level_gt, 0)
  masks_gt = read_data.get(fields.InputDataFields.groundtruth_instance_masks)
  keypoints_gt = read_data.get(fields.InputDataFields.groundtruth_keypoints)
  if (merge_multiple_label_boxes and
      (masks_gt is not None or keypoints_gt is not None)):
    raise NotImplementedError('Multi-label support is only for boxes.')
  return (image, audio, key, location_gt, classes_gt,
          classes_in_image_level_gt, masks_gt, keypoints_gt)
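# A worked example of the k-hot image-level encoding above, with
# illustrative values: each label is one-hot encoded and the rows are
# summed into a single k-hot vector.
import tensorflow as tf
from object_detection.utils import ops as util_ops

indices = tf.constant([0, 2], dtype=tf.int32)  # image-level labels
one_hot = util_ops.padded_one_hot_encoding(indices=indices, depth=4,
                                           left_pad=0)
# one_hot -> [[1, 0, 0, 0], [0, 0, 1, 0]]
k_hot = tf.reduce_sum(one_hot, 0)
# k_hot -> [1, 0, 1, 0]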
def extract_images_and_targets(read_data):
  """Extract images and targets from the input dict."""
  image = read_data[fields.InputDataFields.image]
  key = ''
  if fields.InputDataFields.source_id in read_data:
    key = read_data[fields.InputDataFields.source_id]
  location_gt = read_data[fields.InputDataFields.groundtruth_boxes]
  if cfg.HAS_RPN:
    rpn_boxes = read_data['rpn_boxes']
  else:
    rpn_boxes = location_gt
  # Dummy per-proposal class labels: ones_like gives an [N, 4] tensor of
  # ones, sliced down to [N, 2].
  rpn_class = tf.ones_like(rpn_boxes, dtype=tf.int32)
  rpn_class = rpn_class[:, :2]
  classes_gt = tf.cast(
      read_data[fields.InputDataFields.groundtruth_classes], tf.int32)
  classes_gt -= label_id_offset
  if merge_multiple_label_boxes and use_multiclass_scores:
    raise ValueError(
        'Using both merge_multiple_label_boxes and use_multiclass_scores is '
        'not supported')
  if merge_multiple_label_boxes:
    location_gt, classes_gt, _ = util_ops.merge_boxes_with_multiple_labels(
        location_gt, classes_gt, num_classes)
    classes_gt = tf.cast(classes_gt, tf.float32)
  elif use_multiclass_scores:
    classes_gt = tf.cast(
        read_data[fields.InputDataFields.multiclass_scores], tf.float32)
  else:
    classes_gt = util_ops.padded_one_hot_encoding(
        indices=classes_gt, depth=num_classes, left_pad=0)
  masks_gt = read_data.get(fields.InputDataFields.groundtruth_instance_masks)
  keypoints_gt = read_data.get(fields.InputDataFields.groundtruth_keypoints)
  if (merge_multiple_label_boxes and
      (masks_gt is not None or keypoints_gt is not None)):
    raise NotImplementedError('Multi-label support is only for boxes.')
  weights_gt = read_data.get(fields.InputDataFields.groundtruth_weights)
  return (image, key, location_gt, classes_gt, masks_gt, keypoints_gt,
          weights_gt, rpn_boxes, rpn_class)
def extract_images_and_targets(read_data):
  """Extract images and targets from the input dict."""
  image = read_data[fields.InputDataFields.image]
  area = read_data[fields.InputDataFields.groundtruth_area]
  key = ''
  if fields.InputDataFields.source_id in read_data:
    key = read_data[fields.InputDataFields.source_id]
  location_gt = read_data[fields.InputDataFields.groundtruth_boxes]
  classes_gt = tf.cast(
      read_data[fields.InputDataFields.groundtruth_classes], tf.int32)
  classes_gt -= label_id_offset
  if merge_multiple_label_boxes and use_multiclass_scores:
    raise ValueError(
        'Using both merge_multiple_label_boxes and use_multiclass_scores is '
        'not supported')
  if merge_multiple_label_boxes:
    location_gt, classes_gt, _ = util_ops.merge_boxes_with_multiple_labels(
        location_gt, classes_gt, num_classes)
  elif use_multiclass_scores:
    classes_gt = tf.cast(
        read_data[fields.InputDataFields.multiclass_scores], tf.float32)
  else:
    classes_gt = util_ops.padded_one_hot_encoding(
        indices=classes_gt, depth=num_classes, left_pad=0)
  masks_gt = read_data.get(fields.InputDataFields.groundtruth_instance_masks)
  keypoints_gt = read_data.get(fields.InputDataFields.groundtruth_keypoints)
  if (merge_multiple_label_boxes and
      (masks_gt is not None or keypoints_gt is not None)):
    raise NotImplementedError('Multi-label support is only for boxes.')
  weights_gt = read_data.get(fields.InputDataFields.groundtruth_weights)
  return (image, area, key, location_gt, classes_gt, masks_gt, keypoints_gt,
          weights_gt)
def testMergeBoxesWithMultipleLabels(self):
  boxes = tf.constant(
      [[0.25, 0.25, 0.75, 0.75], [0.0, 0.0, 0.5, 0.75],
       [0.25, 0.25, 0.75, 0.75]],
      dtype=tf.float32)
  class_indices = tf.constant([0, 4, 2], dtype=tf.int32)
  num_classes = 5
  merged_boxes, merged_classes, merged_box_indices = (
      ops.merge_boxes_with_multiple_labels(boxes, class_indices, num_classes))
  expected_merged_boxes = np.array(
      [[0.25, 0.25, 0.75, 0.75], [0.0, 0.0, 0.5, 0.75]], dtype=np.float32)
  expected_merged_classes = np.array(
      [[1, 0, 1, 0, 0], [0, 0, 0, 0, 1]], dtype=np.int32)
  expected_merged_box_indices = np.array([0, 1], dtype=np.int32)
  with self.test_session() as sess:
    np_merged_boxes, np_merged_classes, np_merged_box_indices = sess.run(
        [merged_boxes, merged_classes, merged_box_indices])
    # The merge op does not guarantee output order; flip the expectations
    # if the boxes come back reversed. Note the box-indices array is 1-D.
    if np_merged_classes[0, 0] != 1:
      expected_merged_boxes = expected_merged_boxes[::-1, :]
      expected_merged_classes = expected_merged_classes[::-1, :]
      expected_merged_box_indices = expected_merged_box_indices[::-1]
    self.assertAllClose(np_merged_boxes, expected_merged_boxes)
    self.assertAllClose(np_merged_classes, expected_merged_classes)
    self.assertAllClose(np_merged_box_indices, expected_merged_box_indices)
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False):
  """A single function that is responsible for all input data transformations.

  Data transformation functions are applied in the following order.
  1. If key fields.InputDataFields.image_additional_channels is present in
     tensor_dict, the additional channels will be merged into
     fields.InputDataFields.image.
  2. data_augmentation_fn (optional): applied on tensor_dict.
  3. model_preprocess_fn: applied only on image tensor in tensor_dict.
  4. image_resizer_fn: applied on original image and instance mask tensor in
     tensor_dict.
  5. one_hot_encoding: applied to classes tensor in tensor_dict.
  6. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
     same they can be merged into a single box with an associated k-hot class
     label.

  Args:
    tensor_dict: dictionary containing input tensors keyed by
      fields.InputDataFields.
    model_preprocess_fn: model's preprocess function to apply on image tensor.
      This function must take in a 4-D float tensor and return a 4-D
      preprocessed float tensor and a tensor containing the true image shape.
    image_resizer_fn: image resizer function to apply on groundtruth instance
      masks. This function must take a 3-D float tensor of an image and a 3-D
      tensor of instance masks and return a resized version of these along
      with the true shapes.
    num_classes: number of max classes to one-hot (or k-hot) encode the class
      labels.
    data_augmentation_fn: (optional) data augmentation function to apply on
      input `tensor_dict`.
    merge_multiple_boxes: (optional) whether to merge multiple groundtruth
      boxes and classes for a given image if the boxes are exactly the same.
    retain_original_image: (optional) whether to retain original image in the
      output dictionary.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors
    obtained after applying all the transformations.
  """
  if fields.InputDataFields.groundtruth_boxes in tensor_dict:
    tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
        tensor_dict)
  if fields.InputDataFields.image_additional_channels in tensor_dict:
    channels = tensor_dict[fields.InputDataFields.image_additional_channels]
    tensor_dict[fields.InputDataFields.image] = tf.concat(
        [tensor_dict[fields.InputDataFields.image], channels], axis=2)
  if retain_original_image:
    tensor_dict[fields.InputDataFields.original_image] = tf.cast(
        tensor_dict[fields.InputDataFields.image], tf.uint8)

  # Apply data augmentation ops.
  if data_augmentation_fn is not None:
    tensor_dict = data_augmentation_fn(tensor_dict)

  # Apply model preprocessing ops and resize instance masks.
  image = tensor_dict[fields.InputDataFields.image]
  preprocessed_resized_image, true_image_shape = model_preprocess_fn(
      tf.expand_dims(tf.to_float(image), axis=0))
  tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      preprocessed_resized_image, axis=0)
  tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
      true_image_shape, axis=0)
  if fields.InputDataFields.groundtruth_instance_masks in tensor_dict:
    masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
    _, resized_masks, _ = image_resizer_fn(image, masks)
    tensor_dict[
        fields.InputDataFields.groundtruth_instance_masks] = resized_masks

  # Transform groundtruth classes to one hot encodings.
  label_offset = 1
  zero_indexed_groundtruth_classes = tensor_dict[
      fields.InputDataFields.groundtruth_classes] - label_offset
  tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot(
      zero_indexed_groundtruth_classes, num_classes)

  if merge_multiple_boxes:
    merged_boxes, merged_classes, _ = util_ops.merge_boxes_with_multiple_labels(
        tensor_dict[fields.InputDataFields.groundtruth_boxes],
        zero_indexed_groundtruth_classes, num_classes)
    merged_classes = tf.cast(merged_classes, tf.float32)
    tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes
    tensor_dict[fields.InputDataFields.groundtruth_classes] = merged_classes

  return tensor_dict
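# A concrete illustration of the label-offset and one-hot step above
# (values chosen for illustration): groundtruth class ids arrive 1-indexed,
# with the offset reserving 0 for the background class.
import tensorflow as tf

groundtruth_classes = tf.constant([1, 3], dtype=tf.int32)  # 1-indexed
label_offset = 1
zero_indexed = groundtruth_classes - label_offset  # -> [0, 2]
one_hot = tf.one_hot(zero_indexed, depth=3)
# one_hot -> [[1., 0., 0.],
#             [0., 0., 1.]]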
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False):
  """A single function that is responsible for all input data transformations.

  Data transformation functions are applied in the following order.
  1. data_augmentation_fn (optional): applied on tensor_dict.
  2. model_preprocess_fn: applied only on image tensor in tensor_dict.
  3. image_resizer_fn: applied only on instance mask tensor in tensor_dict.
  4. one_hot_encoding: applied to classes tensor in tensor_dict.
  5. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
     same they can be merged into a single box with an associated k-hot class
     label.

  Args:
    tensor_dict: dictionary containing input tensors keyed by
      fields.InputDataFields.
    model_preprocess_fn: model's preprocess function to apply on image tensor.
      This function must take in a 4-D float tensor and return a 4-D
      preprocessed float tensor and a tensor containing the true image shape.
    image_resizer_fn: image resizer function to apply on groundtruth instance
      masks. This function must take a 4-D float tensor of an image and a 4-D
      tensor of instance masks and return resized versions of these along
      with the true shapes.
    num_classes: number of max classes to one-hot (or k-hot) encode the class
      labels.
    data_augmentation_fn: (optional) data augmentation function to apply on
      input `tensor_dict`.
    merge_multiple_boxes: (optional) whether to merge multiple groundtruth
      boxes and classes for a given image if the boxes are exactly the same.
    retain_original_image: (optional) whether to retain original image in the
      output dictionary.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors
    obtained after applying all the transformations.
  """
  if retain_original_image:
    tensor_dict[fields.InputDataFields.original_image] = tensor_dict[
        fields.InputDataFields.image]

  # Apply data augmentation ops.
  if data_augmentation_fn is not None:
    tensor_dict = data_augmentation_fn(tensor_dict)

  # Apply model preprocessing ops and resize instance masks.
  image = tf.expand_dims(
      tf.to_float(tensor_dict[fields.InputDataFields.image]), axis=0)
  preprocessed_resized_image, true_image_shape = model_preprocess_fn(image)
  tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      preprocessed_resized_image, axis=0)
  tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
      true_image_shape, axis=0)
  if fields.InputDataFields.groundtruth_instance_masks in tensor_dict:
    masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
    _, resized_masks, _ = image_resizer_fn(image, masks)
    tensor_dict[
        fields.InputDataFields.groundtruth_instance_masks] = resized_masks

  # Transform groundtruth classes to one hot encodings.
  label_offset = 1
  zero_indexed_groundtruth_classes = tensor_dict[
      fields.InputDataFields.groundtruth_classes] - label_offset
  tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot(
      zero_indexed_groundtruth_classes, num_classes)

  if merge_multiple_boxes:
    merged_boxes, merged_classes, _ = util_ops.merge_boxes_with_multiple_labels(
        tensor_dict[fields.InputDataFields.groundtruth_boxes],
        zero_indexed_groundtruth_classes, num_classes)
    tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes
    tensor_dict[fields.InputDataFields.groundtruth_classes] = merged_classes

  return tensor_dict
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False,
                         use_multiclass_scores=False):
  """A single function that is responsible for all input data transformations.

  Data transformation functions are applied in the following order.
  1. If key fields.InputDataFields.image_additional_channels is present in
     tensor_dict, the additional channels will be merged into
     fields.InputDataFields.image.
  2. data_augmentation_fn (optional): applied on tensor_dict.
  3. model_preprocess_fn: applied only on image tensor in tensor_dict.
  4. image_resizer_fn: applied on original image and instance mask tensor in
     tensor_dict.
  5. one_hot_encoding: applied to classes tensor in tensor_dict.
  6. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
     same they can be merged into a single box with an associated k-hot class
     label.

  Args:
    tensor_dict: dictionary containing input tensors keyed by
      fields.InputDataFields.
    model_preprocess_fn: model's preprocess function to apply on image tensor.
      This function must take in a 4-D float tensor and return a 4-D
      preprocessed float tensor and a tensor containing the true image shape.
    image_resizer_fn: image resizer function to apply on groundtruth instance
      masks. This function must take a 3-D float tensor of an image and a 3-D
      tensor of instance masks and return a resized version of these along
      with the true shapes.
    num_classes: number of max classes to one-hot (or k-hot) encode the class
      labels.
    data_augmentation_fn: (optional) data augmentation function to apply on
      input `tensor_dict`.
    merge_multiple_boxes: (optional) whether to merge multiple groundtruth
      boxes and classes for a given image if the boxes are exactly the same.
    retain_original_image: (optional) whether to retain original image in the
      output dictionary.
    use_multiclass_scores: whether to use multiclass scores as class targets
      instead of one-hot encoding of `groundtruth_classes`.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors
    obtained after applying all the transformations.
  """
  # Reshape flattened multiclass scores tensor into a 2D tensor of shape
  # [num_boxes, num_classes].
  if fields.InputDataFields.multiclass_scores in tensor_dict:
    tensor_dict[fields.InputDataFields.multiclass_scores] = tf.reshape(
        tensor_dict[fields.InputDataFields.multiclass_scores], [
            tf.shape(tensor_dict[fields.InputDataFields.groundtruth_boxes])[0],
            num_classes
        ])
  if fields.InputDataFields.groundtruth_boxes in tensor_dict:
    tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
        tensor_dict)
    tensor_dict = util_ops.filter_unrecognized_classes(tensor_dict)

  if retain_original_image:
    tensor_dict[fields.InputDataFields.original_image] = tf.cast(
        image_resizer_fn(tensor_dict[fields.InputDataFields.image])[0],
        tf.uint8)

  if fields.InputDataFields.image_additional_channels in tensor_dict:
    channels = tensor_dict[fields.InputDataFields.image_additional_channels]
    tensor_dict[fields.InputDataFields.image] = tf.concat(
        [tensor_dict[fields.InputDataFields.image], channels], axis=2)

  # # Create gt_boxes_masks
  # height, width, _ = tf.unstack(
  #     tf.shape(tensor_dict[fields.InputDataFields.image]))
  # # image_template = tf.squeeze(
  # #     tensor_dict[fields.InputDataFields.groundtruth_bel_O], axis=2)
  # # image_template = tensor_dict[fields.InputDataFields.groundtruth_bel_O]
  # label_boxes_list = tensor_dict[fields.InputDataFields.groundtruth_boxes]
  # boxes_mask = boxes2mask(label_boxes_list)
  # tensor_dict[fields.InputDataFields.groundtruth_boxes_mask] = (
  #     tf.stop_gradient(boxes_mask))

  # # Create detection masks
  # det_mask = tf.squeeze(
  #     tensor_dict[fields.InputDataFields.groundtruth_bel_O], axis=2)
  # zeros = tf.zeros_like(det_mask)
  # ones = tf.ones_like(det_mask)
  # tensor_dict[fields.InputDataFields.groundtruth_boxes_mask] = (
  #     tf.stop_gradient(tf.where(det_mask > 0, ones, zeros)))

  # Apply data augmentation ops.
  # TODO: first try without data augmentation.
  if data_augmentation_fn is not None:
    tensor_dict = data_augmentation_fn(tensor_dict)

  # Apply model preprocessing ops and resize instance masks.
  image = tensor_dict[fields.InputDataFields.image]
  preprocessed_resized_image, true_image_shape = model_preprocess_fn(
      tf.expand_dims(tf.cast(image, dtype=tf.float32), axis=0))
  tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      preprocessed_resized_image, axis=0)
  tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
      true_image_shape, axis=0)

  groundtruth_bel_F = tensor_dict[fields.InputDataFields.groundtruth_bel_F]
  groundtruth_bel_O = tensor_dict[fields.InputDataFields.groundtruth_bel_O]
  groundtruth_z_max_detections = tensor_dict[
      fields.InputDataFields.groundtruth_z_max_detections]
  groundtruth_z_min_observations = tensor_dict[
      fields.InputDataFields.groundtruth_z_min_observations]
  groundtruth_bel_U = tensor_dict[fields.InputDataFields.groundtruth_bel_U]
  groundtruth_z_min_detections = tensor_dict[
      fields.InputDataFields.groundtruth_z_min_detections]
  groundtruth_detections_drivingCorridor = tensor_dict[
      fields.InputDataFields.groundtruth_detections_drivingCorridor]
  groundtruth_intensity = tensor_dict[
      fields.InputDataFields.groundtruth_intensity]

  groundtruth_bel_F = tf.expand_dims(
      tf.squeeze(groundtruth_bel_F, axis=2), axis=0)
  _, resized_groundtruth_bel_F, _ = image_resizer_fn(image, groundtruth_bel_F)
  groundtruth_bel_O = tf.expand_dims(
      tf.squeeze(groundtruth_bel_O, axis=2), axis=0)
  _, resized_groundtruth_bel_O, _ = image_resizer_fn(image, groundtruth_bel_O)
  groundtruth_z_max_detections = tf.expand_dims(
      tf.squeeze(groundtruth_z_max_detections, axis=2), axis=0)
  _, resized_groundtruth_z_max_detections, _ = image_resizer_fn(
      image, groundtruth_z_max_detections)
  groundtruth_z_min_observations = tf.expand_dims(
      tf.squeeze(groundtruth_z_min_observations, axis=2), axis=0)
  _, resized_groundtruth_z_min_observations, _ = image_resizer_fn(
      image, groundtruth_z_min_observations)
  groundtruth_bel_U = tf.expand_dims(
      tf.squeeze(groundtruth_bel_U, axis=2), axis=0)
  _, resized_groundtruth_bel_U, _ = image_resizer_fn(image, groundtruth_bel_U)
  groundtruth_z_min_detections = tf.expand_dims(
      tf.squeeze(groundtruth_z_min_detections, axis=2), axis=0)
  _, resized_groundtruth_z_min_detections, _ = image_resizer_fn(
      image, groundtruth_z_min_detections)
  groundtruth_detections_drivingCorridor = tf.expand_dims(
      tf.squeeze(groundtruth_detections_drivingCorridor, axis=2), axis=0)
  _, resized_groundtruth_detections_drivingCorridor, _ = image_resizer_fn(
      image, groundtruth_detections_drivingCorridor)
  groundtruth_intensity = tf.expand_dims(
      tf.squeeze(groundtruth_intensity, axis=2), axis=0)
  _, resized_groundtruth_intensity, _ = image_resizer_fn(
      image, groundtruth_intensity)

  tensor_dict[fields.InputDataFields.groundtruth_bel_F] = tf.expand_dims(
      tf.squeeze(resized_groundtruth_bel_F, axis=0), axis=2)
  tensor_dict[fields.InputDataFields.groundtruth_bel_O] = tf.expand_dims(
      tf.squeeze(resized_groundtruth_bel_O, axis=0), axis=2)
  tensor_dict[
      fields.InputDataFields.groundtruth_z_min_observations] = tf.expand_dims(
          tf.squeeze(resized_groundtruth_z_min_observations, axis=0), axis=2)
  tensor_dict[
      fields.InputDataFields.groundtruth_z_max_detections] = tf.expand_dims(
          tf.squeeze(resized_groundtruth_z_max_detections, axis=0), axis=2)
  tensor_dict[fields.InputDataFields.groundtruth_bel_U] = tf.expand_dims(
      tf.squeeze(resized_groundtruth_bel_U, axis=0), axis=2)
  tensor_dict[
      fields.InputDataFields
      .groundtruth_detections_drivingCorridor] = tf.expand_dims(
          tf.squeeze(resized_groundtruth_detections_drivingCorridor, axis=0),
          axis=2)
  tensor_dict[
      fields.InputDataFields.groundtruth_z_min_detections] = tf.expand_dims(
          tf.squeeze(resized_groundtruth_z_min_detections, axis=0), axis=2)
  tensor_dict[fields.InputDataFields.groundtruth_intensity] = tf.expand_dims(
      tf.squeeze(resized_groundtruth_intensity, axis=0), axis=2)

  # Transform groundtruth classes to one hot encodings.
  label_offset = 1
  zero_indexed_groundtruth_classes = tensor_dict[
      fields.InputDataFields.groundtruth_classes] - label_offset
  tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot(
      zero_indexed_groundtruth_classes, num_classes)
  if use_multiclass_scores:
    tensor_dict[fields.InputDataFields.groundtruth_classes] = tensor_dict[
        fields.InputDataFields.multiclass_scores]
  tensor_dict.pop(fields.InputDataFields.multiclass_scores, None)

  if fields.InputDataFields.groundtruth_confidences in tensor_dict:
    groundtruth_confidences = tensor_dict[
        fields.InputDataFields.groundtruth_confidences]
    # Map the confidences to the one-hot encoding of classes.
    tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        tf.reshape(groundtruth_confidences, [-1, 1]) *
        tensor_dict[fields.InputDataFields.groundtruth_classes])
  else:
    groundtruth_confidences = tf.ones_like(
        zero_indexed_groundtruth_classes, dtype=tf.float32)
    tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        tensor_dict[fields.InputDataFields.groundtruth_classes])

  if merge_multiple_boxes:
    merged_boxes, merged_classes, merged_confidences, _ = (
        util_ops.merge_boxes_with_multiple_labels(
            tensor_dict[fields.InputDataFields.groundtruth_boxes],
            zero_indexed_groundtruth_classes,
            groundtruth_confidences,
            num_classes))
    merged_classes = tf.cast(merged_classes, tf.float32)
    tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes
    tensor_dict[fields.InputDataFields.groundtruth_classes] = merged_classes
    tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        merged_confidences)

  if fields.InputDataFields.groundtruth_boxes in tensor_dict:
    tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
        tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]

  # The following concatenation was tried for groundtruth_bel_F (and
  # likewise groundtruth_bel_O) and fails with:
  # "ValueError: Can't concatenate scalars (use tf.stack instead) for
  # 'concat_10' (op: 'ConcatV2') with input shapes: [], [], []."
  # if fields.InputDataFields.groundtruth_bel_F in tensor_dict:
  #   channels = tensor_dict[fields.InputDataFields.groundtruth_bel_F]
  #   tensor_dict[fields.InputDataFields.groundtruth_bel_F] = tf.concat(
  #       [tensor_dict[fields.InputDataFields.groundtruth_bel_F], channels],
  #       axis=2)

  return tensor_dict
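# The eight squeeze/resize/expand round-trips above are identical except for
# the tensor involved. A small helper along these lines could factor out the
# repetition (a sketch, not part of the original code):
def _resize_map_like_mask(image, single_channel_map, image_resizer_fn):
  """Resizes a [height, width, 1] map via the mask branch of
  image_resizer_fn and restores the trailing channel dimension."""
  # [H, W, 1] -> [1, H, W]: the resizer treats it as a single instance mask.
  as_mask = tf.expand_dims(tf.squeeze(single_channel_map, axis=2), axis=0)
  _, resized, _ = image_resizer_fn(image, as_mask)
  # [1, H', W'] -> [H', W', 1].
  return tf.expand_dims(tf.squeeze(resized, axis=0), axis=2)

# e.g. tensor_dict[fields.InputDataFields.groundtruth_bel_F] = (
#     _resize_map_like_mask(image, groundtruth_bel_F, image_resizer_fn))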
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False,
                         use_multiclass_scores=False,
                         use_bfloat16=False,
                         retain_original_image_additional_channels=False):
  """A single function that is responsible for all input data transformations.

  Data transformation functions are applied in the following order.
  1. If key fields.InputDataFields.image_additional_channels is present in
     tensor_dict, the additional channels will be merged into
     fields.InputDataFields.image.
  2. data_augmentation_fn (optional): applied on tensor_dict.
  3. model_preprocess_fn: applied only on image tensor in tensor_dict.
  4. image_resizer_fn: applied on original image and instance mask tensor in
     tensor_dict.
  5. one_hot_encoding: applied to classes tensor in tensor_dict.
  6. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
     same they can be merged into a single box with an associated k-hot class
     label.

  Args:
    tensor_dict: dictionary containing input tensors keyed by
      fields.InputDataFields.
    model_preprocess_fn: model's preprocess function to apply on image tensor.
      This function must take in a 4-D float tensor and return a 4-D
      preprocessed float tensor and a tensor containing the true image shape.
    image_resizer_fn: image resizer function to apply on groundtruth instance
      masks. This function must take a 3-D float tensor of an image and a 3-D
      tensor of instance masks and return a resized version of these along
      with the true shapes.
    num_classes: number of max classes to one-hot (or k-hot) encode the class
      labels.
    data_augmentation_fn: (optional) data augmentation function to apply on
      input `tensor_dict`.
    merge_multiple_boxes: (optional) whether to merge multiple groundtruth
      boxes and classes for a given image if the boxes are exactly the same.
    retain_original_image: (optional) whether to retain original image in the
      output dictionary.
    use_multiclass_scores: whether to use multiclass scores as class targets
      instead of one-hot encoding of `groundtruth_classes`. When this is True
      and multiclass_scores is empty, one-hot encoding of
      `groundtruth_classes` is used as a fallback.
    use_bfloat16: (optional) a bool, whether to use bfloat16 in training.
    retain_original_image_additional_channels: (optional) whether to retain
      original image additional channels in the output dictionary.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors
    obtained after applying all the transformations.
  """
  out_tensor_dict = tensor_dict.copy()
  if fields.InputDataFields.multiclass_scores in out_tensor_dict:
    out_tensor_dict[
        fields.InputDataFields
        .multiclass_scores] = _multiclass_scores_or_one_hot_labels(
            out_tensor_dict[fields.InputDataFields.multiclass_scores],
            out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
            out_tensor_dict[fields.InputDataFields.groundtruth_classes],
            num_classes)

  if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
    out_tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
        out_tensor_dict)
    out_tensor_dict = util_ops.filter_unrecognized_classes(out_tensor_dict)

  if retain_original_image:
    out_tensor_dict[fields.InputDataFields.original_image] = tf.cast(
        image_resizer_fn(out_tensor_dict[fields.InputDataFields.image],
                         None)[0], tf.uint8)

  if fields.InputDataFields.image_additional_channels in out_tensor_dict:
    channels = out_tensor_dict[
        fields.InputDataFields.image_additional_channels]
    out_tensor_dict[fields.InputDataFields.image] = tf.concat(
        [out_tensor_dict[fields.InputDataFields.image], channels], axis=2)
    if retain_original_image_additional_channels:
      out_tensor_dict[
          fields.InputDataFields.image_additional_channels] = tf.cast(
              image_resizer_fn(channels, None)[0], tf.uint8)

  # Apply data augmentation ops.
  if data_augmentation_fn is not None:
    out_tensor_dict = data_augmentation_fn(out_tensor_dict)

  # Apply model preprocessing ops and resize instance masks.
  image = out_tensor_dict[fields.InputDataFields.image]
  preprocessed_resized_image, true_image_shape = model_preprocess_fn(
      tf.expand_dims(tf.cast(image, dtype=tf.float32), axis=0))
  if use_bfloat16:
    preprocessed_resized_image = tf.cast(
        preprocessed_resized_image, tf.bfloat16)
  out_tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      preprocessed_resized_image, axis=0)
  out_tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
      true_image_shape, axis=0)
  if fields.InputDataFields.groundtruth_instance_masks in out_tensor_dict:
    masks = out_tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
    _, resized_masks, _ = image_resizer_fn(image, masks)
    if use_bfloat16:
      resized_masks = tf.cast(resized_masks, tf.bfloat16)
    out_tensor_dict[
        fields.InputDataFields.groundtruth_instance_masks] = resized_masks

  label_offset = 1
  zero_indexed_groundtruth_classes = out_tensor_dict[
      fields.InputDataFields.groundtruth_classes] - label_offset
  if use_multiclass_scores:
    out_tensor_dict[
        fields.InputDataFields.groundtruth_classes] = out_tensor_dict[
            fields.InputDataFields.multiclass_scores]
  else:
    out_tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot(
        zero_indexed_groundtruth_classes, num_classes)
  out_tensor_dict.pop(fields.InputDataFields.multiclass_scores, None)

  if fields.InputDataFields.groundtruth_confidences in out_tensor_dict:
    groundtruth_confidences = out_tensor_dict[
        fields.InputDataFields.groundtruth_confidences]
    # Map the confidences to the one-hot encoding of classes.
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        tf.reshape(groundtruth_confidences, [-1, 1]) *
        out_tensor_dict[fields.InputDataFields.groundtruth_classes])
  else:
    groundtruth_confidences = tf.ones_like(
        zero_indexed_groundtruth_classes, dtype=tf.float32)
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        out_tensor_dict[fields.InputDataFields.groundtruth_classes])

  if merge_multiple_boxes:
    merged_boxes, merged_classes, merged_confidences, _ = (
        util_ops.merge_boxes_with_multiple_labels(
            out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
            zero_indexed_groundtruth_classes,
            groundtruth_confidences,
            num_classes))
    merged_classes = tf.cast(merged_classes, tf.float32)
    out_tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes
    out_tensor_dict[
        fields.InputDataFields.groundtruth_classes] = merged_classes
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        merged_confidences)

  if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
    out_tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
        out_tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]

  return out_tensor_dict
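# A sketch of how transform_input_data is typically wired into an input
# pipeline: bind everything except tensor_dict with functools.partial and
# map over the decoded dataset. `model`, `image_resizer_fn`, `num_classes`,
# `data_augmentation_fn`, and `dataset` are assumed to be built elsewhere
# (e.g. from the pipeline config); only the wiring is shown here.
import functools

transform_data_fn = functools.partial(
    transform_input_data,
    model_preprocess_fn=model.preprocess,
    image_resizer_fn=image_resizer_fn,
    num_classes=num_classes,
    data_augmentation_fn=data_augmentation_fn,
    retain_original_image=False)
dataset = dataset.map(transform_data_fn)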
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False,
                         use_multiclass_scores=False,
                         use_bfloat16=False,
                         retain_original_image_additional_channels=False,
                         keypoint_type_weight=None):
  """A single function that is responsible for all input data transformations.

  Data transformation functions are applied in the following order.
  1. If key fields.InputDataFields.image_additional_channels is present in
     tensor_dict, the additional channels will be merged into
     fields.InputDataFields.image.
  2. data_augmentation_fn (optional): applied on tensor_dict.
  3. model_preprocess_fn: applied only on image tensor in tensor_dict.
  4. keypoint_type_weight (optional): If groundtruth keypoints are in the
     tensor dictionary, per-keypoint weights are produced. These weights are
     initialized by `keypoint_type_weight` (or ones if left None). Then, for
     all keypoints that are not visible, the weights are set to 0 (to avoid
     penalizing the model in a loss function).
  5. image_resizer_fn: applied on original image and instance mask tensor in
     tensor_dict.
  6. one_hot_encoding: applied to classes tensor in tensor_dict.
  7. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
     same they can be merged into a single box with an associated k-hot class
     label.

  Args:
    tensor_dict: dictionary containing input tensors keyed by
      fields.InputDataFields.
    model_preprocess_fn: model's preprocess function to apply on image tensor.
      This function must take in a 4-D float tensor and return a 4-D
      preprocessed float tensor and a tensor containing the true image shape.
    image_resizer_fn: image resizer function to apply on groundtruth instance
      masks. This function must take a 3-D float tensor of an image and a 3-D
      tensor of instance masks and return a resized version of these along
      with the true shapes.
    num_classes: number of max classes to one-hot (or k-hot) encode the class
      labels.
    data_augmentation_fn: (optional) data augmentation function to apply on
      input `tensor_dict`.
    merge_multiple_boxes: (optional) whether to merge multiple groundtruth
      boxes and classes for a given image if the boxes are exactly the same.
    retain_original_image: (optional) whether to retain original image in the
      output dictionary.
    use_multiclass_scores: whether to use multiclass scores as class targets
      instead of one-hot encoding of `groundtruth_classes`. When this is True
      and multiclass_scores is empty, one-hot encoding of
      `groundtruth_classes` is used as a fallback.
    use_bfloat16: (optional) a bool, whether to use bfloat16 in training.
    retain_original_image_additional_channels: (optional) Whether to retain
      original image additional channels in the output dictionary.
    keypoint_type_weight: A list (of length num_keypoints) containing
      groundtruth loss weights to use for each keypoint. If None, will use a
      weight of 1.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors
    obtained after applying all the transformations.
  """
  out_tensor_dict = tensor_dict.copy()
  if fields.InputDataFields.multiclass_scores in out_tensor_dict:
    out_tensor_dict[
        fields.InputDataFields
        .multiclass_scores] = _multiclass_scores_or_one_hot_labels(
            out_tensor_dict[fields.InputDataFields.multiclass_scores],
            out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
            out_tensor_dict[fields.InputDataFields.groundtruth_classes],
            num_classes)

  if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
    out_tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
        out_tensor_dict)
    out_tensor_dict = util_ops.filter_unrecognized_classes(out_tensor_dict)

  if retain_original_image:
    out_tensor_dict[fields.InputDataFields.original_image] = tf.cast(
        image_resizer_fn(out_tensor_dict[fields.InputDataFields.image],
                         None)[0], tf.uint8)

  if fields.InputDataFields.image_additional_channels in out_tensor_dict:
    channels = out_tensor_dict[
        fields.InputDataFields.image_additional_channels]
    out_tensor_dict[fields.InputDataFields.image] = tf.concat(
        [out_tensor_dict[fields.InputDataFields.image], channels], axis=2)
    if retain_original_image_additional_channels:
      out_tensor_dict[
          fields.InputDataFields.image_additional_channels] = tf.cast(
              image_resizer_fn(channels, None)[0], tf.uint8)

  # Apply data augmentation ops.
  if data_augmentation_fn is not None:
    out_tensor_dict = data_augmentation_fn(out_tensor_dict)

  # Apply model preprocessing ops and resize instance masks.
  image = out_tensor_dict[fields.InputDataFields.image]
  preprocessed_resized_image, true_image_shape = model_preprocess_fn(
      tf.expand_dims(tf.cast(image, dtype=tf.float32), axis=0))

  preprocessed_shape = tf.shape(preprocessed_resized_image)
  new_height, new_width = preprocessed_shape[1], preprocessed_shape[2]
  im_box = tf.stack([
      0.0, 0.0,
      tf.to_float(new_height) / tf.to_float(true_image_shape[0, 0]),
      tf.to_float(new_width) / tf.to_float(true_image_shape[0, 1])
  ])

  if fields.InputDataFields.groundtruth_boxes in tensor_dict:
    bboxes = out_tensor_dict[fields.InputDataFields.groundtruth_boxes]
    boxlist = box_list.BoxList(bboxes)
    realigned_bboxes = box_list_ops.change_coordinate_frame(boxlist, im_box)
    out_tensor_dict[
        fields.InputDataFields.groundtruth_boxes] = realigned_bboxes.get()

  if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
    keypoints = out_tensor_dict[fields.InputDataFields.groundtruth_keypoints]
    realigned_keypoints = keypoint_ops.change_coordinate_frame(
        keypoints, im_box)
    out_tensor_dict[
        fields.InputDataFields.groundtruth_keypoints] = realigned_keypoints
    flds_gt_kpt = fields.InputDataFields.groundtruth_keypoints
    flds_gt_kpt_vis = fields.InputDataFields.groundtruth_keypoint_visibilities
    flds_gt_kpt_weights = fields.InputDataFields.groundtruth_keypoint_weights
    if flds_gt_kpt_vis not in out_tensor_dict:
      out_tensor_dict[flds_gt_kpt_vis] = tf.ones_like(
          out_tensor_dict[flds_gt_kpt][:, :, 0], dtype=tf.bool)
    out_tensor_dict[flds_gt_kpt_weights] = (
        keypoint_ops.keypoint_weights_from_visibilities(
            out_tensor_dict[flds_gt_kpt_vis], keypoint_type_weight))

  if use_bfloat16:
    preprocessed_resized_image = tf.cast(
        preprocessed_resized_image, tf.bfloat16)
  out_tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      preprocessed_resized_image, axis=0)
  out_tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
      true_image_shape, axis=0)
  if fields.InputDataFields.groundtruth_instance_masks in out_tensor_dict:
    masks = out_tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
    _, resized_masks, _ = image_resizer_fn(image, masks)
    if use_bfloat16:
      resized_masks = tf.cast(resized_masks, tf.bfloat16)
    out_tensor_dict[
        fields.InputDataFields.groundtruth_instance_masks] = resized_masks

  label_offset = 1
  zero_indexed_groundtruth_classes = out_tensor_dict[
      fields.InputDataFields.groundtruth_classes] - label_offset
  if use_multiclass_scores:
    out_tensor_dict[
        fields.InputDataFields.groundtruth_classes] = out_tensor_dict[
            fields.InputDataFields.multiclass_scores]
  else:
    out_tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot(
        zero_indexed_groundtruth_classes, num_classes)
  out_tensor_dict.pop(fields.InputDataFields.multiclass_scores, None)

  if fields.InputDataFields.groundtruth_confidences in out_tensor_dict:
    groundtruth_confidences = out_tensor_dict[
        fields.InputDataFields.groundtruth_confidences]
    # Map the confidences to the one-hot encoding of classes.
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        tf.reshape(groundtruth_confidences, [-1, 1]) *
        out_tensor_dict[fields.InputDataFields.groundtruth_classes])
  else:
    groundtruth_confidences = tf.ones_like(
        zero_indexed_groundtruth_classes, dtype=tf.float32)
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        out_tensor_dict[fields.InputDataFields.groundtruth_classes])

  if merge_multiple_boxes:
    merged_boxes, merged_classes, merged_confidences, _ = (
        util_ops.merge_boxes_with_multiple_labels(
            out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
            zero_indexed_groundtruth_classes,
            groundtruth_confidences,
            num_classes))
    merged_classes = tf.cast(merged_classes, tf.float32)
    out_tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes
    out_tensor_dict[
        fields.InputDataFields.groundtruth_classes] = merged_classes
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        merged_confidences)

  if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
    out_tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
        out_tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]

  return out_tensor_dict
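# A worked example of the keypoint-weight step above, with illustrative
# values, following the docstring: each keypoint type starts from its entry
# in keypoint_type_weight and invisible keypoints are zeroed out. The call
# mirrors the positional usage in the function above.
import tensorflow as tf
from object_detection.core import keypoint_ops

visibilities = tf.constant([[True, True, False],
                            [False, True, True]])  # two boxes, 3 keypoints
keypoint_type_weight = [1.0, 0.7, 0.5]
weights = keypoint_ops.keypoint_weights_from_visibilities(
    visibilities, keypoint_type_weight)
# weights -> [[1.0, 0.7, 0.0],
#             [0.0, 0.7, 0.5]]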
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False,
                         use_bfloat16=False):
  """A single function that is responsible for all input data transformations.

  Data transformation functions are applied in the following order.
  1. If key fields.InputDataFields.image_additional_channels is present in
     tensor_dict, the additional channels will be merged into
     fields.InputDataFields.image.
  2. data_augmentation_fn (optional): applied on tensor_dict.
  3. model_preprocess_fn: applied only on image tensor in tensor_dict.
  4. image_resizer_fn: applied on original image and instance mask tensor in
     tensor_dict.
  5. one_hot_encoding: applied to classes tensor in tensor_dict.
  6. merge_multiple_boxes (optional): when groundtruth boxes are exactly the
     same they can be merged into a single box with an associated k-hot class
     label.

  Args:
    tensor_dict: dictionary containing input tensors keyed by
      fields.InputDataFields.
    model_preprocess_fn: model's preprocess function to apply on image tensor.
      This function must take in a 4-D float tensor and return a 4-D
      preprocessed float tensor and a tensor containing the true image shape.
    image_resizer_fn: image resizer function to apply on groundtruth instance
      masks. This function must take a 3-D float tensor of an image and a 3-D
      tensor of instance masks and return a resized version of these along
      with the true shapes.
    num_classes: number of max classes to one-hot (or k-hot) encode the class
      labels.
    data_augmentation_fn: (optional) data augmentation function to apply on
      input `tensor_dict`.
    merge_multiple_boxes: (optional) whether to merge multiple groundtruth
      boxes and classes for a given image if the boxes are exactly the same.
    retain_original_image: (optional) whether to retain original image in the
      output dictionary.
    use_bfloat16: (optional) a bool, whether to use bfloat16 in training.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the tensors
    obtained after applying all the transformations.
  """
  if fields.InputDataFields.groundtruth_boxes in tensor_dict:
    tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
        tensor_dict)
  if fields.InputDataFields.image_additional_channels in tensor_dict:
    channels = tensor_dict[fields.InputDataFields.image_additional_channels]
    tensor_dict[fields.InputDataFields.image] = tf.concat(
        [tensor_dict[fields.InputDataFields.image], channels], axis=2)
  if retain_original_image:
    tensor_dict[fields.InputDataFields.original_image] = tf.cast(
        image_resizer_fn(tensor_dict[fields.InputDataFields.image], None)[0],
        tf.uint8)

  # Apply data augmentation ops.
  if data_augmentation_fn is not None:
    tensor_dict = data_augmentation_fn(tensor_dict)

  # Apply model preprocessing ops and resize instance masks.
  image = tensor_dict[fields.InputDataFields.image]
  preprocessed_resized_image, true_image_shape = model_preprocess_fn(
      tf.expand_dims(tf.to_float(image), axis=0))
  if use_bfloat16:
    preprocessed_resized_image = tf.cast(
        preprocessed_resized_image, tf.bfloat16)
  tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      preprocessed_resized_image, axis=0)
  tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
      true_image_shape, axis=0)
  if fields.InputDataFields.groundtruth_instance_masks in tensor_dict:
    masks = tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
    _, resized_masks, _ = image_resizer_fn(image, masks)
    if use_bfloat16:
      resized_masks = tf.cast(resized_masks, tf.bfloat16)
    tensor_dict[
        fields.InputDataFields.groundtruth_instance_masks] = resized_masks

  # Transform groundtruth classes to one hot encodings.
  label_offset = 1
  zero_indexed_groundtruth_classes = tensor_dict[
      fields.InputDataFields.groundtruth_classes] - label_offset
  tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot(
      zero_indexed_groundtruth_classes, num_classes)

  if fields.InputDataFields.groundtruth_confidences in tensor_dict:
    groundtruth_confidences = tensor_dict[
        fields.InputDataFields.groundtruth_confidences]
    tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        tf.sparse_to_dense(
            zero_indexed_groundtruth_classes, [num_classes],
            groundtruth_confidences,
            validate_indices=False))
  else:
    groundtruth_confidences = tf.ones_like(
        zero_indexed_groundtruth_classes, dtype=tf.float32)
    tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        tensor_dict[fields.InputDataFields.groundtruth_classes])

  if merge_multiple_boxes:
    merged_boxes, merged_classes, merged_confidences, _ = (
        util_ops.merge_boxes_with_multiple_labels(
            tensor_dict[fields.InputDataFields.groundtruth_boxes],
            zero_indexed_groundtruth_classes,
            groundtruth_confidences,
            num_classes))
    merged_classes = tf.cast(merged_classes, tf.float32)
    tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes
    tensor_dict[fields.InputDataFields.groundtruth_classes] = merged_classes
    tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        merged_confidences)

  if fields.InputDataFields.groundtruth_boxes in tensor_dict:
    tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
        tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]

  return tensor_dict
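# A small illustration of the tf.sparse_to_dense confidence mapping above,
# with illustrative values: zero-indexed classes [0, 2] and per-box
# confidences [0.5, 0.9] with num_classes = 3 scatter each confidence into
# its class slot of a single [num_classes] vector.
import tensorflow as tf

dense = tf.sparse_to_dense(
    tf.constant([0, 2]), [3], tf.constant([0.5, 0.9]),
    validate_indices=False)
# dense -> [0.5, 0.0, 0.9]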
def transform_input_data(tensor_dict,
                         model_preprocess_fn,
                         image_resizer_fn,
                         num_classes,
                         data_augmentation_fn=None,
                         merge_multiple_boxes=False,
                         retain_original_image=False,
                         use_multiclass_scores=False,
                         use_bfloat16=False):
  out_tensor_dict = tensor_dict.copy()
  if fields.InputDataFields.multiclass_scores in out_tensor_dict:
    out_tensor_dict[
        fields.InputDataFields
        .multiclass_scores] = _multiclass_scores_or_one_hot_labels(
            out_tensor_dict[fields.InputDataFields.multiclass_scores],
            out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
            out_tensor_dict[fields.InputDataFields.groundtruth_classes],
            num_classes)

  if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
    out_tensor_dict = util_ops.filter_groundtruth_with_nan_box_coordinates(
        out_tensor_dict)
    out_tensor_dict = util_ops.filter_unrecognized_classes(out_tensor_dict)

  if retain_original_image:
    out_tensor_dict[fields.InputDataFields.original_image] = tf.cast(
        image_resizer_fn(out_tensor_dict[fields.InputDataFields.image],
                         None)[0], tf.uint8)

  if fields.InputDataFields.image_additional_channels in out_tensor_dict:
    channels = out_tensor_dict[
        fields.InputDataFields.image_additional_channels]
    out_tensor_dict[fields.InputDataFields.image] = tf.concat(
        [out_tensor_dict[fields.InputDataFields.image], channels], axis=2)

  # Apply data augmentation ops.
  if data_augmentation_fn is not None:
    out_tensor_dict = data_augmentation_fn(out_tensor_dict)

  # Apply model preprocessing ops and resize instance masks.
  image = out_tensor_dict[fields.InputDataFields.image]
  preprocessed_resized_image, true_image_shape = model_preprocess_fn(
      tf.expand_dims(tf.cast(image, dtype=tf.float32), axis=0))
  if use_bfloat16:
    preprocessed_resized_image = tf.cast(
        preprocessed_resized_image, tf.bfloat16)
  out_tensor_dict[fields.InputDataFields.image] = tf.squeeze(
      preprocessed_resized_image, axis=0)
  out_tensor_dict[fields.InputDataFields.true_image_shape] = tf.squeeze(
      true_image_shape, axis=0)
  if fields.InputDataFields.groundtruth_instance_masks in out_tensor_dict:
    masks = out_tensor_dict[fields.InputDataFields.groundtruth_instance_masks]
    _, resized_masks, _ = image_resizer_fn(image, masks)
    if use_bfloat16:
      resized_masks = tf.cast(resized_masks, tf.bfloat16)
    out_tensor_dict[
        fields.InputDataFields.groundtruth_instance_masks] = resized_masks

  label_offset = 1
  zero_indexed_groundtruth_classes = out_tensor_dict[
      fields.InputDataFields.groundtruth_classes] - label_offset
  if use_multiclass_scores:
    out_tensor_dict[
        fields.InputDataFields.groundtruth_classes] = out_tensor_dict[
            fields.InputDataFields.multiclass_scores]
  else:
    out_tensor_dict[fields.InputDataFields.groundtruth_classes] = tf.one_hot(
        zero_indexed_groundtruth_classes, num_classes)
  out_tensor_dict.pop(fields.InputDataFields.multiclass_scores, None)

  if fields.InputDataFields.groundtruth_confidences in out_tensor_dict:
    groundtruth_confidences = out_tensor_dict[
        fields.InputDataFields.groundtruth_confidences]
    # Map the confidences to the one-hot encoding of classes.
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        tf.reshape(groundtruth_confidences, [-1, 1]) *
        out_tensor_dict[fields.InputDataFields.groundtruth_classes])
  else:
    groundtruth_confidences = tf.ones_like(
        zero_indexed_groundtruth_classes, dtype=tf.float32)
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        out_tensor_dict[fields.InputDataFields.groundtruth_classes])

  if merge_multiple_boxes:
    merged_boxes, merged_classes, merged_confidences, _ = (
        util_ops.merge_boxes_with_multiple_labels(
            out_tensor_dict[fields.InputDataFields.groundtruth_boxes],
            zero_indexed_groundtruth_classes,
            groundtruth_confidences,
            num_classes))
    merged_classes = tf.cast(merged_classes, tf.float32)
    out_tensor_dict[fields.InputDataFields.groundtruth_boxes] = merged_boxes
    out_tensor_dict[
        fields.InputDataFields.groundtruth_classes] = merged_classes
    out_tensor_dict[fields.InputDataFields.groundtruth_confidences] = (
        merged_confidences)

  if fields.InputDataFields.groundtruth_boxes in out_tensor_dict:
    out_tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = tf.shape(
        out_tensor_dict[fields.InputDataFields.groundtruth_boxes])[0]

  return out_tensor_dict