def _parse_single_example(self, example): """Parses a single serialized tf.Example proto. Args: example: a serialized tf.Example proto string. Returns: A dictionary of groundtruth with the following fields: source_id: a scalar tensor of int64 representing the image source_id. height: a scalar tensor of int64 representing the image height. width: a scalar tensor of int64 representing the image width. boxes: a float tensor of shape [K, 4], representing the groundtruth boxes in absolute coordinates with respect to the original image size. classes: a int64 tensor of shape [K], representing the class labels of each instances. is_crowds: a bool tensor of shape [K], indicating whether the instance is crowd. areas: a float tensor of shape [K], indicating the area of each instance. masks: a string tensor of shape [K], containing the bytes of the png mask of each instance. """ decoder = tf_example_decoder.TfExampleDecoder( include_mask=self._include_mask) decoded_tensors = decoder.decode(example) image = decoded_tensors['image'] image_size = tf.shape(image)[0:2] boxes = box_utils.denormalize_boxes( decoded_tensors['groundtruth_boxes'], image_size) groundtruths = { 'source_id': tf.string_to_number(decoded_tensors['source_id'], out_type=tf.int64), 'height': decoded_tensors['height'], 'width': decoded_tensors['width'], 'num_detections': tf.shape(decoded_tensors['groundtruth_classes'])[0], 'boxes': boxes, 'classes': decoded_tensors['groundtruth_classes'], 'is_crowds': decoded_tensors['groundtruth_is_crowd'], 'areas': decoded_tensors['groundtruth_area'], } if self._include_mask: groundtruths.update({ 'masks': decoded_tensors['groundtruth_instance_masks_png'], }) return groundtruths
def __init__(self, output_size, min_level, max_level, num_scales, aspect_ratios, anchor_size, use_category=True, outer_box_scale=1.0, box_jitter_scale=0.025, num_sampled_masks=8, mask_crop_size=32, mask_min_level=3, mask_max_level=5, upsample_factor=4, match_threshold=0.5, unmatched_threshold=0.5, aug_rand_hflip=False, aug_scale_min=1.0, aug_scale_max=1.0, skip_crowd_during_training=True, max_num_instances=100, use_bfloat16=True, mask_train_class='all', mode=None): """Initializes parameters for parsing annotations in the dataset. Args: output_size: `Tensor` or `list` for [height, width] of output image. The output_size should be divided by the largest feature stride 2^max_level. min_level: `int` number of minimum level of the output feature pyramid. max_level: `int` number of maximum level of the output feature pyramid. num_scales: `int` number representing intermediate scales added on each level. For instances, num_scales=2 adds one additional intermediate anchor scales [2^0, 2^0.5] on each level. aspect_ratios: `list` of float numbers representing the aspect raito anchors added on each level. The number indicates the ratio of width to height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level. anchor_size: `float` number representing the scale of size of the base anchor to the feature stride 2^level. use_category: if `False`, treat all object in all classes in one foreground category. outer_box_scale: `float` number in a range of [1.0, inf) representing the scale from object box to outer box. The mask branch predicts instance mask enclosed in outer box. box_jitter_scale: `float` number representing the noise magnitude to jitter the training groundtruth boxes for mask branch. num_sampled_masks: `int` number of sampled masks for training. mask_crop_size: `list` for [height, width] of output training masks. mask_min_level: `int` number indicating the minimum feature level to obtain instance features. mask_max_level: `int` number indicating the maximum feature level to obtain instance features. upsample_factor: `int` factor of upsampling the fine mask predictions. match_threshold: `float` number between 0 and 1 representing the lower-bound threshold to assign positive labels for anchors. An anchor with a score over the threshold is labeled positive. unmatched_threshold: `float` number between 0 and 1 representing the upper-bound threshold to assign negative labels for anchors. An anchor with a score below the threshold is labeled negative. aug_rand_hflip: `bool`, if True, augment training with random horizontal flip. aug_scale_min: `float`, the minimum scale applied to `output_size` for data augmentation during training. aug_scale_max: `float`, the maximum scale applied to `output_size` for data augmentation during training. skip_crowd_during_training: `bool`, if True, skip annotations labeled with `is_crowd` equals to 1. max_num_instances: `int` number of maximum number of instances in an image. The groundtruth data will be padded to `max_num_instances`. use_bfloat16: `bool`, if True, cast output image to tf.bfloat16. mask_train_class: a string of experiment mode: `all`, `voc` or `nonvoc`. mode: a ModeKeys. Specifies if this is training, evaluation, prediction or prediction with groundtruths in the outputs. """ self._mode = mode self._mask_train_class = mask_train_class self._max_num_instances = max_num_instances self._skip_crowd_during_training = skip_crowd_during_training self._is_training = (mode == ModeKeys.TRAIN) self._example_decoder = tf_example_decoder.TfExampleDecoder( include_mask=True) # Anchor. self._output_size = output_size self._min_level = min_level self._max_level = max_level self._num_scales = num_scales self._aspect_ratios = aspect_ratios self._anchor_size = anchor_size self._match_threshold = match_threshold self._unmatched_threshold = unmatched_threshold # Data augmentation. self._aug_rand_hflip = aug_rand_hflip self._aug_scale_min = aug_scale_min self._aug_scale_max = aug_scale_max # Device. self._use_bfloat16 = use_bfloat16 # ShapeMask specific. # Control of which category to use. self._use_category = use_category self._num_sampled_masks = num_sampled_masks self._mask_crop_size = mask_crop_size self._mask_min_level = mask_min_level self._mask_max_level = mask_max_level self._outer_box_scale = outer_box_scale self._box_jitter_scale = box_jitter_scale self._up_sample_factor = upsample_factor # Data is parsed depending on the model Modekey. if mode == ModeKeys.TRAIN: self._parse_fn = self._parse_train_data elif mode == ModeKeys.EVAL: self._parse_fn = self._parse_eval_data elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT: self._parse_fn = self._parse_predict_data else: raise ValueError('mode is not defined.')
def __init__(self, output_size, min_level, max_level, num_scales, aspect_ratios, anchor_size, rpn_match_threshold=0.7, rpn_unmatched_threshold=0.3, rpn_batch_size_per_im=256, rpn_fg_fraction=0.5, aug_rand_hflip=False, aug_scale_min=1.0, aug_scale_max=1.0, skip_crowd_during_training=True, max_num_instances=100, include_mask=False, mask_crop_size=112, use_bfloat16=True, mode=None): """Initializes parameters for parsing annotations in the dataset. Args: output_size: `Tensor` or `list` for [height, width] of output image. The output_size should be divided by the largest feature stride 2^max_level. min_level: `int` number of minimum level of the output feature pyramid. max_level: `int` number of maximum level of the output feature pyramid. num_scales: `int` number representing intermediate scales added on each level. For instances, num_scales=2 adds one additional intermediate anchor scales [2^0, 2^0.5] on each level. aspect_ratios: `list` of float numbers representing the aspect raito anchors added on each level. The number indicates the ratio of width to height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level. anchor_size: `float` number representing the scale of size of the base anchor to the feature stride 2^level. rpn_match_threshold: rpn_unmatched_threshold: rpn_batch_size_per_im: rpn_fg_fraction: aug_rand_hflip: `bool`, if True, augment training with random horizontal flip. aug_scale_min: `float`, the minimum scale applied to `output_size` for data augmentation during training. aug_scale_max: `float`, the maximum scale applied to `output_size` for data augmentation during training. skip_crowd_during_training: `bool`, if True, skip annotations labeled with `is_crowd` equals to 1. max_num_instances: `int` number of maximum number of instances in an image. The groundtruth data will be padded to `max_num_instances`. include_mask: a bool to indicate whether parse mask groundtruth. mask_crop_size: the size which groundtruth mask is cropped to. use_bfloat16: `bool`, if True, cast output image to tf.bfloat16. mode: a ModeKeys. Specifies if this is training, evaluation, prediction or prediction with groundtruths in the outputs. """ self._mode = mode self._max_num_instances = max_num_instances self._skip_crowd_during_training = skip_crowd_during_training self._is_training = (mode == ModeKeys.TRAIN) self._example_decoder = tf_example_decoder.TfExampleDecoder( include_mask=include_mask) # Anchor. self._output_size = output_size self._min_level = min_level self._max_level = max_level self._num_scales = num_scales self._aspect_ratios = aspect_ratios self._anchor_size = anchor_size # Target assigning. self._rpn_match_threshold = rpn_match_threshold self._rpn_unmatched_threshold = rpn_unmatched_threshold self._rpn_batch_size_per_im = rpn_batch_size_per_im self._rpn_fg_fraction = rpn_fg_fraction # Data augmentation. self._aug_rand_hflip = aug_rand_hflip self._aug_scale_min = aug_scale_min self._aug_scale_max = aug_scale_max # Mask. self._include_mask = include_mask self._mask_crop_size = mask_crop_size # Device. self._use_bfloat16 = use_bfloat16 # Data is parsed depending on the model Modekey. if mode == ModeKeys.TRAIN: self._parse_fn = self._parse_train_data elif mode == ModeKeys.EVAL: self._parse_fn = self._parse_eval_data elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT: self._parse_fn = self._parse_predict_data else: raise ValueError('mode is not defined.')
def __init__(self, output_size, min_level, max_level, num_scales, aspect_ratios, anchor_size, match_threshold=0.5, unmatched_threshold=0.5, aug_rand_hflip=False, aug_scale_min=1.0, aug_scale_max=1.0, use_autoaugment=False, autoaugment_policy_name='v0', skip_crowd_during_training=True, max_num_instances=100, use_bfloat16=True, mode=None): """Initializes parameters for parsing annotations in the dataset. Args: output_size: `Tensor` or `list` for [height, width] of output image. The output_size should be divided by the largest feature stride 2^max_level. min_level: `int` number of minimum level of the output feature pyramid. max_level: `int` number of maximum level of the output feature pyramid. num_scales: `int` number representing intermediate scales added on each level. For instances, num_scales=2 adds one additional intermediate anchor scales [2^0, 2^0.5] on each level. aspect_ratios: `list` of float numbers representing the aspect raito anchors added on each level. The number indicates the ratio of width to height. For instances, aspect_ratios=[1.0, 2.0, 0.5] adds three anchors on each scale level. anchor_size: `float` number representing the scale of size of the base anchor to the feature stride 2^level. match_threshold: `float` number between 0 and 1 representing the lower-bound threshold to assign positive labels for anchors. An anchor with a score over the threshold is labeled positive. unmatched_threshold: `float` number between 0 and 1 representing the upper-bound threshold to assign negative labels for anchors. An anchor with a score below the threshold is labeled negative. aug_rand_hflip: `bool`, if True, augment training with random horizontal flip. aug_scale_min: `float`, the minimum scale applied to `output_size` for data augmentation during training. aug_scale_max: `float`, the maximum scale applied to `output_size` for data augmentation during training. use_autoaugment: `bool`, if True, use the AutoAugment augmentation policy during training. autoaugment_policy_name: `string` that specifies the name of the AutoAugment policy that will be used during training. skip_crowd_during_training: `bool`, if True, skip annotations labeled with `is_crowd` equals to 1. max_num_instances: `int` number of maximum number of instances in an image. The groundtruth data will be padded to `max_num_instances`. use_bfloat16: `bool`, if True, cast output image to tf.bfloat16. mode: a ModeKeys. Specifies if this is training, evaluation, prediction or prediction with groundtruths in the outputs. """ self._mode = mode self._max_num_instances = max_num_instances self._skip_crowd_during_training = skip_crowd_during_training self._is_training = (mode == ModeKeys.TRAIN) self._example_decoder = tf_example_decoder.TfExampleDecoder( include_mask=False) # Anchor. self._output_size = output_size self._min_level = min_level self._max_level = max_level self._num_scales = num_scales self._aspect_ratios = aspect_ratios self._anchor_size = anchor_size self._match_threshold = match_threshold self._unmatched_threshold = unmatched_threshold # Data augmentation. self._aug_rand_hflip = aug_rand_hflip self._aug_scale_min = aug_scale_min self._aug_scale_max = aug_scale_max # Data Augmentation with AutoAugment. self._use_autoaugment = use_autoaugment self._autoaugment_policy_name = autoaugment_policy_name # Device. self._use_bfloat16 = use_bfloat16 # Data is parsed depending on the model Modekey. if mode == ModeKeys.TRAIN: self._parse_fn = self._parse_train_data elif mode == ModeKeys.EVAL: self._parse_fn = self._parse_eval_data elif mode == ModeKeys.PREDICT or mode == ModeKeys.PREDICT_WITH_GT: self._parse_fn = self._parse_predict_data else: raise ValueError('mode is not defined.')