def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor],
                     box_outputs: Dict[int, tf.Tensor], scales: List[float],
                     min_score_thresh, max_boxes_to_draw):
    """Convert raw class/box predictions into a stacked batch of detections.

    Args:
        params: a parameter dictionary. This function reads `min_level`,
            `max_level`, `num_scales`, `aspect_ratios`, `anchor_scale`,
            `image_size`, `num_classes`, `batch_size` and, optionally,
            `disable_pyfun`.
        cls_outputs: an OrderDict with keys representing levels and values
            representing logits in [batch_size, height, width, num_anchors].
        box_outputs: an OrderDict with keys representing levels and values
            representing box regression targets in
            [batch_size, height, width, num_anchors * 4].
        scales: a list of float values indicating image scale, indexed by
            the position of the image in the batch.
        min_score_thresh: A float representing the threshold for deciding
            when to remove boxes based on score.
        max_boxes_to_draw: Max number of boxes to draw.

    Returns:
        detections_batch: a batch of detection results, stacked with
        `tf.stack` under the name 'detections'. Each detection is a tensor
        with each row representing
        [image_id, x, y, width, height, score, class].
    """
    # TODO(tanmingxing): refactor the code to make it more explicit.
    # `add_metric_fn_inputs` receives this dict and is relied on to fill it
    # in place; afterwards every entry is indexed per image below.
    selected = {
        key: [None]
        for key in ('cls_outputs_all', 'box_outputs_all', 'indices_all',
                    'classes_all')
    }
    det_model_fn.add_metric_fn_inputs(
        params, cls_outputs, box_outputs, selected, -1)

    # Anchor labeler used for picking top-k predictions.
    labeler = anchors.AnchorLabeler(
        anchors.Anchors(params['min_level'], params['max_level'],
                        params['num_scales'], params['aspect_ratios'],
                        params['anchor_scale'], params['image_size']),
        params['num_classes'])

    # One detection tensor per input image, in batch order.
    detections_batch = [
        labeler.generate_detections(
            selected['cls_outputs_all'][image_index],
            selected['box_outputs_all'][image_index],
            selected['indices_all'][image_index],
            selected['classes_all'][image_index],
            image_id=[image_index],
            image_scale=[scales[image_index]],
            min_score_thresh=min_score_thresh,
            max_boxes_to_draw=max_boxes_to_draw,
            disable_pyfun=params.get('disable_pyfun'))
        for image_index in range(params['batch_size'])
    ]
    return tf.stack(detections_batch, name='detections')
def metric_fn(**kwargs):
    """Return the evaluation metrics dictionary (losses plus COCO metrics).

    `params` is taken from the enclosing scope. When `params['testdev_dir']`
    is set, the COCO metric function is additionally given `testdev_dir` and
    `disable_pyfun`; otherwise it is called with the validation json only.

    Args:
        **kwargs: metric inputs. `cls_loss_repeat` and `box_loss_repeat` are
            read here; the full set is forwarded to `coco_metric_fn`.

    Returns:
        A dict with `cls_loss`, `box_loss` and every entry returned by
        `coco_metric_fn`.
    """
    batch_size = params['batch_size']
    labeler = anchors.AnchorLabeler(
        anchors.Anchors(params['min_level'], params['max_level'],
                        params['num_scales'], params['aspect_ratios'],
                        params['anchor_scale'], params['image_size']),
        params['num_classes'])

    output_metrics = {
        'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
        'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
    }

    if not params.get('testdev_dir', None):
        logging.info('Eval val with groudtruths %s.', params['val_json_file'])
        coco_metrics = coco_metric_fn(
            batch_size, labeler, params['val_json_file'], **kwargs)
    else:
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        coco_metrics = coco_metric_fn(
            batch_size,
            labeler,
            params['val_json_file'],
            testdev_dir=params['testdev_dir'],
            disable_pyfun=params.get('disable_pyfun', None),
            **kwargs)

    # Merge the COCO metrics into the loss metrics.
    output_metrics.update(coco_metrics)
    return output_metrics
def metric_fn(**kwargs):
    """Evaluation metric fn. Performed on CPU, do not reference TPU ops.

    `params` is taken from the enclosing scope.

    Args:
        **kwargs: must provide `cls_loss_repeat`, `box_loss_repeat`,
            `source_ids`, `image_scales`, and `cls_outputs_<level>` /
            `box_outputs_<level>` for every level from `min_level` to
            `max_level` inclusive.

    Returns:
        A dict with `cls_loss`, `box_loss` and the metrics produced by
        `coco_metric.EvaluationMetric.estimator_metric_fn`.
    """
    labeler = anchors.AnchorLabeler(
        anchors.Anchors(params['min_level'], params['max_level'],
                        params['num_scales'], params['aspect_ratios'],
                        params['anchor_scale'], params['image_size']),
        params['num_classes'])

    output_metrics = {
        'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
        'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
    }

    # Regroup the flattened per-level kwargs into level-keyed dicts.
    cls_by_level = {}
    box_by_level = {}
    levels = range(params['min_level'], params['max_level'] + 1)
    for level in levels:
        cls_by_level[level] = kwargs['cls_outputs_%d' % level]
        box_by_level[level] = kwargs['box_outputs_%d' % level]

    detections = labeler.generate_detections(
        cls_by_level, box_by_level, kwargs['source_ids'])
    evaluator = coco_metric.EvaluationMetric(params['val_json_file'])
    output_metrics.update(
        evaluator.estimator_metric_fn(detections, kwargs['image_scales']))
    return output_metrics
def get_pred_results(cls_outputs_dict, box_outputs_dict, params):
    """Map `AnchorLabeler.generate_detections` over the prediction dicts.

    Args:
        cls_outputs_dict: class outputs, passed to `tf.map_fn` as the first
            element of the `elems` tuple.
        box_outputs_dict: box outputs, passed as the second element.
        params: dictionary read for `min_level`, `max_level`, `num_scales`,
            `aspect_ratios`, `anchor_scale`, `image_size` and `num_classes`.

    Returns:
        The float32 result of `tf.map_fn` over both inputs.
    """
    # NOTE(review): anchors are built for `image_size - 5` rather than
    # `image_size`; the reason is not visible here -- confirm it is intended.
    anchor_grid = anchors.Anchors(
        params['min_level'],
        params['max_level'],
        params['num_scales'],
        params['aspect_ratios'],
        params['anchor_scale'],
        (params['image_size'] - 5))
    labeler = anchors.AnchorLabeler(anchor_grid, params['num_classes'])
    elems = (cls_outputs_dict, box_outputs_dict)
    return tf.map_fn(labeler.generate_detections, elems, dtype=tf.float32)
def __init__(self, params):
    """Record size/class settings and build the anchor labeler.

    Args:
        params: dictionary read for `image_size`, `num_classes`,
            `min_level`, `max_level`, `num_scales`, `aspect_ratios` and
            `anchor_scale`.
    """
    self._max_num_instances = MAX_NUM_INSTANCES
    self._image_size = params["image_size"]
    self._num_classes = params["num_classes"]

    # NOTE(review): anchors are built for `image_size - 5` rather than
    # `image_size`; the reason is not visible here -- confirm it is intended.
    anchor_grid = anchors.Anchors(
        params['min_level'],
        params['max_level'],
        params['num_scales'],
        params['aspect_ratios'],
        params['anchor_scale'],
        (params['image_size'] - 5))
    self.anchor_labeler = anchors.AnchorLabeler(
        anchor_grid, params['num_classes'])
def metric_fn(**kwargs):
    """Evaluation metric fn. Performed on CPU, do not reference TPU ops.

    `params` and `batch_size` are taken from the enclosing scope. Detections
    are generated one sample at a time by slicing each per-level output
    along its first dimension.

    Args:
        **kwargs: must provide `cls_loss_repeat`, `box_loss_repeat`,
            `source_ids`, `image_scales`, `groundtruth_data`, and
            `cls_outputs_<level>` / `box_outputs_<level>` (rank-4 tensors)
            for every level from `min_level` to `max_level` inclusive.

    Returns:
        A dict with `cls_loss`, `box_loss` and the metrics produced by
        `coco_metric.EvaluationMetric.estimator_metric_fn`.
    """

    def _sample_slice(key, sample_index):
        """Slice one sample (size 1 on axis 0) out of `kwargs[key]`."""
        _, dim1, dim2, dim3 = kwargs[key].get_shape().as_list()
        return tf.slice(kwargs[key], [sample_index, 0, 0, 0],
                        [1, dim1, dim2, dim3])

    labeler = anchors.AnchorLabeler(
        anchors.Anchors(params['min_level'], params['max_level'],
                        params['num_scales'], params['aspect_ratios'],
                        params['anchor_scale'], params['image_size']),
        params['num_classes'])

    output_metrics = {
        'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
        'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
    }

    # The two level dicts are created once and overwritten for each sample,
    # exactly as the per-level entries are re-sliced.
    cls_by_level = {}
    box_by_level = {}
    per_sample_detections = []
    for sample_index in range(batch_size):
        for level in range(params['min_level'], params['max_level'] + 1):
            cls_by_level[level] = _sample_slice(
                'cls_outputs_%d' % level, sample_index)
            box_by_level[level] = _sample_slice(
                'box_outputs_%d' % level, sample_index)
        per_sample_detections.append(
            labeler.generate_detections(
                cls_by_level, box_by_level,
                tf.slice(kwargs['source_ids'], [sample_index], [1]),
                tf.slice(kwargs['image_scales'], [sample_index], [1])))

    evaluator = coco_metric.EvaluationMetric(params['val_json_file'])
    output_metrics.update(
        evaluator.estimator_metric_fn(per_sample_detections,
                                      kwargs['groundtruth_data']))
    return output_metrics
def metric_fn(**kwargs):
    """Return the evaluation metrics dictionary (losses plus COCO metrics).

    `params` is taken from the enclosing scope.

    Args:
        **kwargs: metric inputs. `cls_loss_repeat` and `box_loss_repeat` are
            read here; the full set is forwarded to `coco_metric_fn`.

    Returns:
        A dict with `cls_loss`, `box_loss` and every entry returned by
        `coco_metric_fn`.
    """
    batch_size = params['batch_size']
    labeler = anchors.AnchorLabeler(
        anchors.Anchors(params['min_level'], params['max_level'],
                        params['num_scales'], params['aspect_ratios'],
                        params['anchor_scale'], params['image_size']),
        params['num_classes'])

    output_metrics = {
        'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
        'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
    }
    # Merge the COCO metrics into the loss metrics.
    output_metrics.update(
        coco_metric_fn(batch_size, labeler, params['val_json_file'],
                       **kwargs))
    return output_metrics
def det_post_process(params: Dict[Any, Any], cls_outputs: Dict[int, tf.Tensor],
                     box_outputs: Dict[int, tf.Tensor], scales: List[float]):
    """Convert raw class/box predictions into a stacked batch of detections.

    Args:
        params: a parameter dictionary. This function reads `min_level`,
            `max_level`, `num_scales`, `aspect_ratios`, `anchor_scale`,
            `image_size`, `num_classes` and `batch_size`.
        cls_outputs: per-level class outputs, keyed by level.
        box_outputs: per-level box outputs, keyed by level.
        scales: image scales, indexed by position in the batch.

    Returns:
        The per-image detections stacked with `tf.stack` under the name
        'detections'. Per the original inline notes the result is
        [batch, M, 7] with rows
        [image_id, x, y, width, height, score, class].
    """
    stage_keys = ('cls_outputs_all', 'box_outputs_all', 'indices_all',
                  'classes_all')
    # `add_metric_fn_inputs` receives this dict and is relied on to fill it
    # in place; afterwards every entry is indexed per image below.
    outputs = {key: [None] for key in stage_keys}
    add_metric_fn_inputs(params, cls_outputs, box_outputs, outputs)

    # Anchor labeler used for picking top-k predictions.
    labeler = anchors.AnchorLabeler(
        anchors.Anchors(params['min_level'], params['max_level'],
                        params['num_scales'], params['aspect_ratios'],
                        params['anchor_scale'], params['image_size']),
        params['num_classes'])

    detections_batch = []
    for image_index in range(params['batch_size']):
        # Per the original notes: scores [MAX_DETECTION_POINTS,], boxes
        # [MAX_DETECTION_POINTS, 4] as (ty, tx, th, tw), indices and
        # classes [MAX_DETECTION_POINTS,] -- TODO confirm against
        # add_metric_fn_inputs.
        scores, boxes, indices, classes = (
            outputs[key][image_index] for key in stage_keys)
        detections_batch.append(
            labeler.generate_detections(
                scores, boxes, indices, classes,
                image_id=[image_index],
                image_scale=[scales[image_index]],
                disable_pyfun=False))
    return tf.stack(detections_batch, name='detections')
def __call__(self, params):
    """Build the `tf.data` input pipeline for the configured mode.

    Args:
        params: dictionary of settings. Read here: `image_size`,
            `min_level`, `max_level`, `num_scales`, `aspect_ratios`,
            `anchor_scale`, `num_classes`, the `rpn_*` labeler settings,
            `use_bfloat16`, `use_category`, `skip_crowd_during_training`,
            `input_rand_hflip`, `train_scale_min`, `train_scale_max`,
            `gt_mask_size`, `transpose_input` and, optionally, `batch_size`
            (defaults to 1).

    Returns:
        A batched `tf.data.Dataset` of parsed examples.
    """
    mode_keys = tf.estimator.ModeKeys
    target_size = (params['image_size'], params['image_size'])
    input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'], target_size)
    anchor_labeler = anchors.AnchorLabeler(
        input_anchors, params['num_classes'],
        params['rpn_positive_overlap'], params['rpn_negative_overlap'],
        params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])
    example_decoder = tf_example_decoder.TfExampleDecoder(
        use_instance_mask=self._use_instance_mask)

    def _parse_for_predict(image, source_id):
        """Normalize and resize one image; return the PREDICT features."""
        processor = InstanceSegmentationInputProcessor(image, target_size)
        processor.normalize_image()
        processor.set_scale_factors_to_output_size()
        image = processor.resize_and_crop_image()
        if params['use_bfloat16']:
            image = tf.cast(image, dtype=tf.bfloat16)
        return {
            'images': image,
            'image_info': processor.get_image_info(),
            'source_ids': source_id,
        }

    def _parse_for_train(data, image, source_id):
        """Augment one example and build the TRAIN (features, labels)."""
        use_masks = self._use_instance_mask
        max_instances = self._max_num_instances

        masks = data['groundtruth_instance_masks'] if use_masks else None
        boxes = data['groundtruth_boxes']
        classes = tf.reshape(
            tf.cast(data['groundtruth_classes'], dtype=tf.float32), [-1, 1])
        if not params['use_category']:
            # Collapse every positive class id to 1.0.
            classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

        if (params['skip_crowd_during_training'] and
                self._mode == mode_keys.TRAIN):
            # Keep only the annotations that are not flagged as crowd.
            keep = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
            classes = tf.gather_nd(classes, keep)
            boxes = tf.gather_nd(boxes, keep)
            if use_masks:
                masks = tf.gather_nd(masks, keep)

        processor = InstanceSegmentationInputProcessor(
            image, target_size, boxes, classes, masks)
        processor.normalize_image()
        if params['input_rand_hflip']:
            processor.random_horizontal_flip()
        processor.set_training_random_scale_factors(
            params['train_scale_min'], params['train_scale_max'])
        image = processor.resize_and_crop_image()
        boxes, classes = processor.resize_and_crop_boxes()
        if use_masks:
            masks = processor.resize_and_crop_masks()
            cropped_masks = processor.crop_gt_masks(
                masks, boxes, params['gt_mask_size'], target_size)

        # Assign anchors.
        score_targets, box_targets = anchor_labeler.label_anchors(
            boxes, classes)

        # Scale the boxes by image_info[2], then pad the groundtruth with -1
        # to a fixed size.
        image_info = processor.get_image_info()
        boxes *= image_info[2]
        boxes = pad_to_fixed_size(boxes, -1, [max_instances, 4])
        classes = pad_to_fixed_size(classes, -1, [max_instances, 1])

        if use_masks:
            # Flatten, pad to the fixed instance count, then restore the
            # square (gt_mask_size + 4) mask layout.
            mask_side = params['gt_mask_size'] + 4
            cropped_masks = tf.reshape(cropped_masks, [max_instances, -1])
            cropped_masks = pad_to_fixed_size(
                cropped_masks, -1, [max_instances, mask_side**2])
            cropped_masks = tf.reshape(
                cropped_masks, [max_instances, mask_side, mask_side])

        if params['use_bfloat16']:
            image = tf.cast(image, dtype=tf.bfloat16)

        features = {
            'images': image,
            'image_info': image_info,
            'source_ids': source_id,
        }
        labels = {}
        for level in range(params['min_level'], params['max_level'] + 1):
            labels['score_targets_%d' % level] = score_targets[level]
            labels['box_targets_%d' % level] = box_targets[level]
        labels['gt_boxes'] = boxes
        labels['gt_classes'] = classes
        if use_masks:
            labels['cropped_gt_masks'] = cropped_masks
        return (features, labels)

    def _dataset_parser(value):
        """Parse data to a fixed dimension input image and learning targets.

        Args:
            value: A dictionary contains an image and groundtruth
                annotations.

        Returns:
            In PREDICT mode, a features dictionary with:
                images: the preprocessed image tensor.
                image_info: image information as returned by the input
                    processor's `get_image_info`.
                source_ids: Source image id. Default value -1 if the source
                    id is empty in the groundtruth annotation.
            In TRAIN mode, a `(features, labels)` tuple where `features`
            has the same three keys and `labels` holds:
                score_targets_<level>, box_targets_<level>: anchor targets
                    for each level in [min_level, max_level].
                gt_boxes: Groundtruth boxes, padded with -1 to
                    [self._max_num_instances, 4].
                gt_classes: Groundtruth classes, padded with -1 to
                    [self._max_num_instances, 1].
                cropped_gt_masks: only when instance masks are enabled;
                    shaped [self._max_num_instances, gt_mask_size + 4,
                    gt_mask_size + 4].
            In any other mode nothing is returned.
        """
        with tf.name_scope('parser'):
            data = example_decoder.decode(value)
            image = data['image']
            raw_source_id = data['source_id']
            # An empty source id is replaced by '-1' before conversion.
            raw_source_id = tf.where(
                tf.equal(raw_source_id, tf.constant('')), '-1',
                raw_source_id)
            source_id = tf.string_to_number(raw_source_id)

            if self._mode == mode_keys.PREDICT:
                return _parse_for_predict(image, source_id)
            elif self._mode == mode_keys.TRAIN:
                return _parse_for_train(data, image, source_id)

    is_training = self._mode == mode_keys.TRAIN
    batch_size = params['batch_size'] if 'batch_size' in params else 1

    dataset = tf.data.Dataset.list_files(
        self._file_pattern, shuffle=is_training)
    if is_training:
        dataset = dataset.repeat()

    def _prefetch_dataset(filename):
        """Open one TFRecord file with a one-element prefetch buffer."""
        return tf.data.TFRecordDataset(filename).prefetch(1)

    # Prefetch data from files.
    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            _prefetch_dataset, cycle_length=32, sloppy=is_training))
    if is_training:
        dataset = dataset.shuffle(64)

    # Parse the fetched records to input tensors for model function.
    dataset = dataset.apply(
        tf.contrib.data.map_and_batch(
            _dataset_parser,
            batch_size=batch_size,
            num_parallel_batches=64,
            drop_remainder=True))

    # Transposes images for TPU performance.
    # Given the batch size, the batch dimension (N) goes to either the minor
    # ((H, W, C, N) when N > C) or the second-minor ((H, W, N, C) when N < C)
    # dimension. Here, we assume N is 4 or 8 and C is 3, so we use
    # (H, W, C, N).
    if params['transpose_input'] and is_training:

        def _transpose_images(features, labels):
            """Move the batch dimension of `images` to the last axis."""
            features['images'] = tf.transpose(features['images'],
                                              [1, 2, 3, 0])
            return features, labels

        dataset = dataset.map(_transpose_images, num_parallel_calls=64)

    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

    if self._num_examples > 0:
        dataset = dataset.take(self._num_examples)
    if self._use_fake_data:
        # Turn this dataset into a semi-fake dataset which always loops at
        # the first batch. This reduces variance in performance and is
        # useful in testing.
        dataset = dataset.take(1).cache().repeat()
    return dataset
def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets.

    `self`, `params` and `example_decoder` are taken from the enclosing
    scope.

    Args:
        value: A dictionary contains an image and groundtruth annotations.

    Returns:
        In PREDICT mode, a dictionary `{'features': features}` that also
        carries `'labels'` when `params['include_groundtruth_in_features']`
        is set. `features` holds:
            images: the normalized, resized/padded image tensor.
            image_info: image information as returned by
                `preprocess_ops.resize_crop_pad`.
            source_ids: Source image id. Default value -1 if the source id
                is empty in the groundtruth annotation.
            orig_images: only when `params['visualize_images_summary']` is
                set; the un-normalized image resized to
                `params['image_size']`.
        In TRAIN or EVAL mode, a `(features, labels)` tuple where `features`
        has `images`, `image_info`, `source_ids` and `labels` holds:
            score_targets_<level>, box_targets_<level>: anchor targets for
                each level in [min_level, max_level].
            gt_boxes: Groundtruth boxes, padded with -1 to
                [self._max_num_instances, 4].
            gt_classes: Groundtruth classes, padded with -1 to
                [self._max_num_instances, 1].
            cropped_gt_masks: only when instance masks are enabled; shaped
                [self._max_num_instances, gt_mask_size + 4,
                gt_mask_size + 4].
        In any other mode `None` is returned.
    """
    mode_keys = tf.estimator.ModeKeys
    with tf.name_scope('parser'):
        data = example_decoder.decode(value)

        # Fall back to an all-False crowd flag when the record has none.
        decoded_is_crowd = data['groundtruth_is_crowd']
        data['groundtruth_is_crowd'] = tf.cond(
            tf.greater(tf.size(decoded_is_crowd), 0),
            lambda: decoded_is_crowd,
            lambda: tf.zeros_like(data['groundtruth_classes'],
                                  dtype=tf.bool))

        image = tf.image.convert_image_dtype(data['image'], dtype=tf.float32)
        orig_image = image

        # An empty source id is replaced by '-1' before conversion.
        raw_source_id = data['source_id']
        raw_source_id = tf.where(
            tf.equal(raw_source_id, tf.constant('')), '-1', raw_source_id)
        source_id = tf.string_to_number(raw_source_id)

        if self._mode == mode_keys.PREDICT:
            image = preprocess_ops.normalize_image(image)
            image, image_info, _, _, _ = preprocess_ops.resize_crop_pad(
                image, params['image_size'], 2**params['max_level'])
            if params['precision'] == 'bfloat16':
                image = tf.cast(image, dtype=tf.bfloat16)

            features = {
                'images': image,
                'image_info': image_info,
                'source_ids': source_id,
            }
            if params['visualize_images_summary']:
                features['orig_images'] = tf.image.resize_images(
                    orig_image, params['image_size'])

            if not params['include_groundtruth_in_features']:
                return {'features': features}
            labels = _prepare_labels_for_eval(
                data,
                target_num_instances=self._max_num_instances,
                target_polygon_list_len=self._max_num_polygon_list_len,
                use_instance_mask=params['include_mask'])
            return {'features': features, 'labels': labels}

        if self._mode not in (mode_keys.TRAIN, mode_keys.EVAL):
            return None

        is_training = self._mode == mode_keys.TRAIN
        use_masks = self._use_instance_mask
        max_instances = self._max_num_instances

        instance_masks = (
            data['groundtruth_instance_masks'] if use_masks else None)
        boxes = data['groundtruth_boxes']
        classes = tf.reshape(
            tf.cast(data['groundtruth_classes'], dtype=tf.float32), [-1, 1])
        if not params['use_category']:
            # Collapse every positive class id to 1.0.
            classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

        if params['skip_crowd_during_training'] and is_training:
            # Keep only the annotations that are not flagged as crowd.
            keep = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
            classes = tf.gather_nd(classes, keep)
            boxes = tf.gather_nd(boxes, keep)
            if use_masks:
                instance_masks = tf.gather_nd(instance_masks, keep)

        image = preprocess_ops.normalize_image(image)

        # Random flipping for training only.
        if is_training and params['input_rand_hflip']:
            flipped = preprocess_ops.random_horizontal_flip(
                image, boxes=boxes, masks=instance_masks)
            if use_masks:
                image, boxes, instance_masks = flipped
            else:
                image, boxes = flipped

        # Scaling, jittering and padding.
        image, image_info, boxes, classes, cropped_gt_masks = (
            preprocess_ops.resize_crop_pad(
                image,
                params['image_size'],
                2**params['max_level'],
                aug_scale_min=params['aug_scale_min'],
                aug_scale_max=params['aug_scale_max'],
                boxes=boxes,
                classes=classes,
                masks=instance_masks,
                crop_mask_size=params['gt_mask_size']))
        if cropped_gt_masks is not None:
            # Add a 2-pixel zero border on both spatial axes.
            cropped_gt_masks = tf.pad(
                cropped_gt_masks,
                paddings=tf.constant([[0, 0], [2, 2], [2, 2]]),
                mode='CONSTANT',
                constant_values=0.)

        # Anchors are generated for the padded image size, not the raw one.
        padded_height, padded_width, _ = image.get_shape().as_list()
        input_anchors = anchors.Anchors(
            params['min_level'], params['max_level'], params['num_scales'],
            params['aspect_ratios'], params['anchor_scale'],
            (padded_height, padded_width))
        anchor_labeler = anchors.AnchorLabeler(
            input_anchors, params['num_classes'],
            params['rpn_positive_overlap'], params['rpn_negative_overlap'],
            params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

        # Assign anchors.
        score_targets, box_targets = anchor_labeler.label_anchors(
            boxes, classes)

        # Pad groundtruth data.
        boxes = preprocess_ops.pad_to_fixed_size(
            boxes, -1, [max_instances, 4])
        classes = preprocess_ops.pad_to_fixed_size(
            classes, -1, [max_instances, 1])

        if use_masks:
            # Flatten per instance, pad to the fixed instance count, then
            # restore the square (gt_mask_size + 4) mask layout.
            mask_side = params['gt_mask_size'] + 4
            cropped_gt_masks = tf.reshape(
                cropped_gt_masks,
                tf.stack([tf.shape(cropped_gt_masks)[0], -1]))
            cropped_gt_masks = preprocess_ops.pad_to_fixed_size(
                cropped_gt_masks, -1, [max_instances, mask_side**2])
            cropped_gt_masks = tf.reshape(
                cropped_gt_masks, [max_instances, mask_side, mask_side])

        if params['precision'] == 'bfloat16':
            image = tf.cast(image, dtype=tf.bfloat16)

        features = {
            'images': image,
            'image_info': image_info,
            'source_ids': source_id,
        }
        labels = {}
        for level in range(params['min_level'], params['max_level'] + 1):
            labels['score_targets_%d' % level] = score_targets[level]
            labels['box_targets_%d' % level] = box_targets[level]
        labels['gt_boxes'] = boxes
        labels['gt_classes'] = classes
        if use_masks:
            labels['cropped_gt_masks'] = cropped_gt_masks
        return features, labels
def __call__(self, params):
    """Build the input pipeline and return one batch of tensors.

    Args:
        params: dictionary of settings. Read here: `min_level`,
            `max_level`, `num_scales`, `aspect_ratios`, `anchor_scale`,
            `image_size`, `num_classes`, `skip_crowd`, `input_rand_hflip`,
            `use_bfloat16` and `batch_size`.

    Returns:
        A tuple `(images, labels)` taken from a one-shot iterator over the
        batched dataset. `labels` holds `mean_num_positives` (the batch mean
        repeated to shape [batch_size, 1]), `cls_targets_<level>` and
        `box_targets_<level>` for each level, `source_ids` and
        `image_scales`.
    """
    image_size = params['image_size']
    input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'],
        params['image_size'])
    anchor_labeler = anchors.AnchorLabeler(input_anchors,
                                           params['num_classes'])
    example_decoder = tf_example_decoder.TfExampleDecoder()

    def _dataset_parser(value):
        """Parse data to a fixed dimension input image and learning targets."""
        with tf.name_scope('parser'):
            data = example_decoder.decode(value)
            # NOTE: the source id is converted to a float number further
            # down, so it must be a numeric string (for xView-style names
            # such as "122.tif" the extension would have to be stripped).
            raw_source_id = data['source_id']
            image = data['image']
            boxes = data['groundtruth_boxes']
            classes = tf.reshape(
                tf.cast(data['groundtruth_classes'], dtype=tf.float32),
                [-1, 1])

            # Handle crowd annotations. As crowd annotations are not large
            # instances, the model ignores them in training.
            if params['skip_crowd']:
                keep = tf.where(
                    tf.logical_not(data['groundtruth_is_crowd']))
                classes = tf.gather_nd(classes, keep)
                boxes = tf.gather_nd(boxes, keep)

            # The image normalization is identical to Cloud TPU ResNet-50.
            image = _normalize_image(
                tf.image.convert_image_dtype(image, dtype=tf.float32))

            if params['input_rand_hflip']:
                image, boxes = preprocessor.random_horizontal_flip(
                    image, boxes=boxes)

            shape_before_resize = tf.shape(image)
            image, _ = preprocessor.resize_to_range(
                image,
                min_dimension=params['image_size'],
                max_dimension=params['image_size'])
            # Ratio of the pre-resize to post-resize size along axis 0.
            image_scale = (tf.to_float(shape_before_resize[0]) /
                           tf.to_float(tf.shape(image)[0]))

            image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
                image, boxes, keypoints=None)
            image = tf.image.pad_to_bounding_box(
                image, 0, 0, params['image_size'], params['image_size'])

            (cls_targets, box_targets,
             num_positives) = anchor_labeler.label_anchors(boxes, classes)

            source_id = tf.string_to_number(raw_source_id,
                                            out_type=tf.float32)
            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)
            return (image, cls_targets, box_targets, num_positives,
                    source_id, image_scale)

    def prefetch_dataset(filename):
        """Open one TFRecord file with a one-element prefetch buffer."""
        return tf.data.TFRecordDataset(filename).prefetch(1)

    batch_size = params['batch_size']

    # File names are listed in order and then shuffled explicitly.
    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
    dataset = dataset.shuffle(buffer_size=1024)
    if self._is_training:
        dataset = dataset.repeat()

    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            prefetch_dataset, cycle_length=32, sloppy=True))
    dataset = dataset.shuffle(20)

    dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.apply(
        tf.contrib.data.batch_and_drop_remainder(batch_size))
    dataset = dataset.prefetch(1)

    next_batch = dataset.make_one_shot_iterator().get_next()
    (images, cls_targets, box_targets, num_positives, source_ids,
     image_scales) = next_batch

    # Count num_positives in a batch: the batch mean is repeated once per
    # example so that it has a leading batch dimension.
    mean_positives = tf.reduce_mean(num_positives)
    labels = {
        'mean_num_positives': tf.reshape(
            tf.tile(tf.expand_dims(mean_positives, 0), [batch_size]),
            [batch_size, 1]),
    }
    for level in range(params['min_level'], params['max_level'] + 1):
        labels['cls_targets_%d' % level] = cls_targets[level]
        labels['box_targets_%d' % level] = box_targets[level]
    labels['source_ids'] = source_ids
    labels['image_scales'] = image_scales
    return images, labels
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition for the RetinaNet model based on ResNet.

    Args:
        features: the input image tensor with shape
            [batch_size, height, width, 3]. The height and width are fixed
            and equal.
        labels: the input labels in a dictionary. The labels include class
            targets and box targets which are dense label maps. The labels
            are generated from get_input_fn function in data/dataloader.py
        mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
        params: the dictionary defines hyperparameters of model. The default
            settings are in default_hparams function in this file.
        model: the RetinaNet model outputs class logits and box regression
            outputs.
        variable_filter_fn: the filter function that takes
            trainable_variables and returns the variable list after applying
            the filter rule.

    Returns:
        In PREDICT mode, a `tf.estimator.EstimatorSpec` carrying the
        predictions; otherwise the `TPUEstimatorSpec` to run training or
        evaluation.
    """
    mode_keys = tf.estimator.ModeKeys

    def _model_outputs():
        """Run the backbone/heads on `features` with the configured sizes."""
        return model(
            features,
            min_level=params['min_level'],
            max_level=params['max_level'],
            num_classes=params['num_classes'],
            num_anchors=len(params['aspect_ratios'] * params['num_scales']),
            resnet_depth=params['resnet_depth'],
            is_training_bn=params['is_training_bn'])

    def _build_anchor_labeler():
        """Create the anchor labeler used to generate detections."""
        eval_anchors = anchors.Anchors(
            params['min_level'], params['max_level'], params['num_scales'],
            params['aspect_ratios'], params['anchor_scale'],
            params['image_size'])
        return anchors.AnchorLabeler(eval_anchors, params['num_classes'])

    if not params['use_bfloat16']:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()
    else:
        with bfloat16.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            # Cast every level back to float32 for the loss/metrics.
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == mode_keys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]

        predict_labeler = _build_anchor_labeler()
        # NOTE(review): the image id is hard-coded to 100 and the result is
        # printed at graph-construction time; both look like debugging
        # leftovers -- confirm before relying on `detections`.
        detections = predict_labeler.generate_detections(
            cls_outputs, box_outputs, image_id=100)
        print("detection for image is", detections)
        predictions['detections'] = detections
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == mode_keys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(params['resnet_checkpoint'], {
                '/': 'resnet%s/' % params['resnet_depth'],
            })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    global_step = tf.train.get_global_step()
    learning_rate = _learning_rate_schedule(
        params['learning_rate'], params['lr_warmup_init'],
        params['lr_warmup_step'], params['lr_drop_step'], global_step)
    # cls_loss and box_loss are for logging. only total_loss is optimized.
    total_loss, cls_loss, box_loss = _detection_loss(
        cls_outputs, box_outputs, labels, params)

    train_op = None
    if mode == mode_keys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(
            learning_rate, momentum=params['momentum'])
        if params['use_tpu']:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op
        # dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if variable_filter_fn:
            var_list = variable_filter_fn(tf.trainable_variables(),
                                          params['resnet_depth'])
        else:
            var_list = None
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(
                total_loss, global_step, var_list=var_list)

    # Evaluation only works on GPU/CPU host and batch_size=1
    eval_metrics = None
    if mode == mode_keys.EVAL:

        def metric_fn(**kwargs):
            """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
            eval_labeler = _build_anchor_labeler()
            output_metrics = {
                'cls_loss': tf.metrics.mean(kwargs['cls_loss_repeat']),
                'box_loss': tf.metrics.mean(kwargs['box_loss_repeat']),
            }

            # Regroup the flattened per-level kwargs into level-keyed dicts.
            cls_by_level = {}
            box_by_level = {}
            for level in range(params['min_level'],
                               params['max_level'] + 1):
                cls_by_level[level] = kwargs['cls_outputs_%d' % level]
                box_by_level[level] = kwargs['box_outputs_%d' % level]

            detections = eval_labeler.generate_detections(
                cls_by_level, box_by_level, kwargs['source_ids'])
            evaluator = coco_metric.EvaluationMetric(params['val_json_file'])
            output_metrics.update(
                evaluator.estimator_metric_fn(detections,
                                              kwargs['image_scales']))
            return output_metrics

        batch_size = params['batch_size']

        def _repeat_per_example(scalar_loss):
            """Tile a scalar loss to shape [batch_size, 1]."""
            return tf.reshape(
                tf.tile(tf.expand_dims(scalar_loss, 0), [batch_size]),
                [batch_size, 1])

        # The losses are repeated per example so that every metric input
        # has a leading batch dimension.
        metric_fn_inputs = {
            'cls_loss_repeat': _repeat_per_example(cls_loss),
            'box_loss_repeat': _repeat_per_example(box_loss),
            'source_ids': labels['source_ids'],
            'image_scales': labels['image_scales'],
        }
        for level in range(params['min_level'], params['max_level'] + 1):
            metric_fn_inputs['cls_outputs_%d' % level] = cls_outputs[level]
            metric_fn_inputs['box_outputs_%d' % level] = box_outputs[level]
        eval_metrics = (metric_fn, metric_fn_inputs)

    return tpu_estimator.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
def __call__(self, params=None):
  """Builds the batched `tf.data` input pipeline for detection.

  Args:
    params: Optional parameter dictionary; `self._params` is used when it is
      None. Keys read here: `min_level`, `max_level`, `num_scales`,
      `aspect_ratios`, `anchor_scale`, `image_size`, `num_classes`,
      `skip_crowd_during_training`, `autoaugment_policy` (optional),
      `input_rand_hflip`, `train_scale_min`, `train_scale_max`,
      `use_bfloat16` and `batch_size`.

  Returns:
    A `tf.data.Dataset` whose elements are `(images, labels)` batches.
    `labels` is a dict with the keys `mean_num_positives`,
    `cls_targets_<level>`, `box_targets_<level>`, `source_ids`,
    `groundtruth_data` and `image_scales`.
  """
  if params is None:
    params = self._params
  is_training = self._is_training

  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  params['image_size'])
  anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])
  example_decoder = tf_example_decoder.TfExampleDecoder()

  def _dataset_parser(value):
    """Parses one serialized example into an image and learning targets.

    Args:
      value: A serialized example holding an image and groundtruth
        annotations, as accepted by `example_decoder.decode`.

    Returns:
      A tuple of
      image: Preprocessed (normalized, resized and cropped) image tensor;
        cast to bfloat16 when `params['use_bfloat16']` is set.
      cls_targets: Per-level class targets from
        `anchor_labeler.label_anchors` (indexed by level downstream).
      box_targets: Per-level box regression targets from
        `anchor_labeler.label_anchors` (indexed by level downstream).
      num_positives: Number of positive anchors in the image.
      source_id: Numeric source image id; an empty id string becomes -1.
      image_scale: Scale of the processed image to the original image.
      boxes: Groundtruth boxes multiplied by `image_scale`, padded with -1
        to `[self._max_num_instances, 4]`.
      is_crowds: Crowd flags as float32, padded with 0 to
        `[self._max_num_instances, 1]`.
      areas: Groundtruth areas, padded with -1 to
        `[self._max_num_instances, 1]`.
      classes: Groundtruth classes, padded with -1 to
        `[self._max_num_instances, 1]`.
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      boxes = data['groundtruth_boxes']
      # Classes become a float32 column vector (one row per annotation).
      classes = tf.reshape(
          tf.cast(data['groundtruth_classes'], dtype=tf.float32), [-1, 1])
      areas = data['groundtruth_area']
      is_crowds = data['groundtruth_is_crowd']

      if params['skip_crowd_during_training'] and is_training:
        # Drop crowd annotations from the training targets only; the padded
        # groundtruth emitted below still carries the unfiltered
        # `is_crowds` and `areas`.
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)

      # NOTE: The autoaugment method works best when used alongside the
      # standard horizontal flipping of images along with size jittering
      # and normalization.
      if params.get('autoaugment_policy', None) and is_training:
        from aug import autoaugment  # pylint: disable=g-import-not-at-top
        image, boxes = autoaugment.distort_image_with_autoaugment(
            image, boxes, params['autoaugment_policy'])

      input_processor = DetectionInputProcessor(
          image, params['image_size'], boxes, classes)
      input_processor.normalize_image()
      if is_training and params['input_rand_hflip']:
        input_processor.random_horizontal_flip()
      if is_training:
        input_processor.set_training_random_scale_factors(
            params['train_scale_min'], params['train_scale_max'])
      else:
        input_processor.set_scale_factors_to_output_size()
      image = input_processor.resize_and_crop_image()
      boxes, classes = input_processor.resize_and_crop_boxes()

      # Assign anchors.
      (cls_targets, box_targets,
       num_positives) = anchor_labeler.label_anchors(boxes, classes)

      # An empty id string is mapped to '-1' before the numeric conversion.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id)

      # Pad groundtruth data for evaluation.
      image_scale = input_processor.image_scale_to_original
      boxes *= image_scale
      is_crowds = tf.cast(is_crowds, dtype=tf.float32)
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      is_crowds = pad_to_fixed_size(is_crowds, 0,
                                    [self._max_num_instances, 1])
      areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
      classes = pad_to_fixed_size(classes, -1,
                                  [self._max_num_instances, 1])
      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)
      return (image, cls_targets, box_targets, num_positives, source_id,
              image_scale, boxes, is_crowds, areas, classes)

  dataset = tf.data.Dataset.list_files(
      self._file_pattern, shuffle=is_training)
  # Multi-card evaluation is not supported yet, so only training is sharded.
  if horovod_enabled() and is_training:
    # Split the input files evenly across the horovod workers.
    dataset = dataset.shard(hvd.size(), hvd.rank())

  if is_training:
    dataset = dataset.repeat()

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  # A single reader and a single parser thread are used in deterministic
  # mode.
  cycle_length = 1 if self._is_deterministic else 32
  dataset = dataset.apply(
      tf.data.experimental.parallel_interleave(
          _prefetch_dataset, cycle_length=cycle_length, sloppy=is_training))

  if is_training:
    dataset = dataset.shuffle(64)

  # Parse the fetched records to input tensors for model function.
  num_parallel_calls = 1 if self._is_deterministic else 64
  dataset = dataset.map(_dataset_parser,
                        num_parallel_calls=num_parallel_calls)
  batch_size = params['batch_size']
  dataset = dataset.prefetch(batch_size)
  dataset = dataset.batch(batch_size, drop_remainder=True)

  def _process_example(images, cls_targets, box_targets, num_positives,
                       source_ids, image_scales, boxes, is_crowds, areas,
                       classes):
    """Processes one batch of data into `(images, labels)`."""
    labels = {}
    # Count num_positives in a batch: the batch mean is repeated once per
    # example so the label has shape [batch_size, 1].
    num_positives_batch = tf.reduce_mean(num_positives)
    labels['mean_num_positives'] = tf.reshape(
        tf.tile(tf.expand_dims(num_positives_batch, 0), [batch_size]),
        [batch_size, 1])

    for level in range(params['min_level'], params['max_level'] + 1):
      labels['cls_targets_%d' % level] = cls_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    # Concatenate groundtruth annotations to a tensor.
    groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2)
    labels['source_ids'] = source_ids
    labels['groundtruth_data'] = groundtruth_data
    labels['image_scales'] = image_scales
    return images, labels

  dataset = dataset.map(_process_example)
  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
  if self._use_fake_data:
    # Turn this dataset into a semi-fake dataset which always loop at the
    # first batch. This reduces variance in performance and is useful in
    # testing.
    dataset = dataset.take(1).cache().repeat()
  return dataset
def __call__(self, params):
  """Builds the input pipeline and returns one batch of tensors.

  Args:
    params: Parameter dictionary. Keys read here: `min_level`, `max_level`,
      `num_scales`, `aspect_ratios`, `anchor_scale`, `image_size`,
      `num_classes`, `skip_crowd`, `input_rand_hflip` and `use_bfloat16`.
      The batch size comes from `self._batch_size`, not from `params`.

  Returns:
    An `(images, labels)` pair taken from a one-shot iterator. `labels` is a
    dict with `mean_num_positives`, `mean_num_negatives`,
    `mean_num_ignored`, per-level `cls_targets_<level>`,
    `cls_weights_<level>`, `box_targets_<level>`, `box_weights_<level>`,
    plus `source_ids` and `image_scales`.
  """
  target_size = params['image_size']
  anchor_labeler = anchors.AnchorLabeler(
      anchors.Anchors(params['min_level'], params['max_level'],
                      params['num_scales'], params['aspect_ratios'],
                      params['anchor_scale'], target_size),
      params['num_classes'])
  example_decoder = tf_example_decoder.TfExampleDecoder()

  def _dataset_parser(value):
    """Turns one serialized example into a fixed-size image and targets."""
    with tf.name_scope('parser'):
      decoded = example_decoder.decode(value)
      source_id = decoded['source_id']
      image = decoded['image']
      boxes = decoded['groundtruth_boxes']
      classes = tf.reshape(
          tf.cast(decoded['groundtruth_classes'], dtype=tf.float32),
          [-1, 1])

      # Handle crowd annotations. As crowd annotations are not large
      # instances, the model ignores them in training.
      if params['skip_crowd']:
        keep = tf.where(tf.logical_not(decoded['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, keep)
        boxes = tf.gather_nd(boxes, keep)

      # The image normalization is identical to Cloud TPU ResNet-50.
      image = _normalize_image(
          tf.image.convert_image_dtype(image, dtype=tf.float32))

      if params['input_rand_hflip']:
        image, boxes = preprocessor.random_horizontal_flip(image,
                                                           boxes=boxes)

      # The scale is the ratio of the height before resizing to the height
      # after it.
      shape_before_resize = tf.shape(image)
      image, _ = preprocessor.resize_to_range(
          image, min_dimension=target_size, max_dimension=target_size)
      height_before = tf.to_float(shape_before_resize[0])
      height_after = tf.to_float(tf.shape(image)[0])
      image_scale = height_before / height_after

      image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
          image, boxes, keypoints=None)
      image = tf.image.pad_to_bounding_box(image, 0, 0, target_size,
                                           target_size)

      (cls_targets, cls_weights, box_targets, box_weights, num_positives,
       num_negatives, num_ignored) = anchor_labeler.label_anchors(
           boxes, classes)

      source_id = tf.string_to_number(source_id, out_type=tf.float32)
      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)

      return (image, cls_targets, cls_weights, box_targets, box_weights,
              num_positives, num_negatives, num_ignored, source_id,
              image_scale)

  # The batch size is taken from the reader itself rather than from
  # `params`.
  batch_size = self._batch_size

  def _open_records(filename):
    return tf.data.TFRecordDataset(filename, buffer_size=8 * 1000 * 1000)

  dataset = tf.data.Dataset.list_files(self._file_pattern)
  dataset = dataset.shuffle(buffer_size=1024)
  if self._is_training:
    dataset = dataset.repeat()
  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          _open_records, cycle_length=1, sloppy=True))
  dataset = dataset.shuffle(buffer_size=3072)
  dataset = dataset.map(_dataset_parser, num_parallel_calls=12)
  dataset = dataset.prefetch(32)
  dataset = dataset.apply(
      tf.contrib.data.batch_and_drop_remainder(batch_size))
  dataset = dataset.prefetch(2)

  batch = dataset.make_one_shot_iterator().get_next()
  (images, cls_targets, cls_weights, box_targets, box_weights,
   num_positives, num_negatives, num_ignored, source_ids,
   image_scales) = batch

  def _tiled_batch_mean(per_example_counts):
    """Returns the batch mean repeated into a [batch_size, 1] tensor."""
    batch_mean = tf.reduce_mean(per_example_counts)
    repeated = tf.tile(tf.expand_dims(batch_mean, 0), [batch_size])
    return tf.reshape(repeated, [batch_size, 1])

  # Count positives / negatives / ignored anchors in a batch.
  labels = {
      'mean_num_positives': _tiled_batch_mean(num_positives),
      'mean_num_negatives': _tiled_batch_mean(num_negatives),
      'mean_num_ignored': _tiled_batch_mean(num_ignored),
  }

  level = params['min_level']
  while level <= params['max_level']:
    labels['cls_targets_%d' % level] = cls_targets[level]
    labels['cls_weights_%d' % level] = cls_weights[level]
    labels['box_targets_%d' % level] = box_targets[level]
    labels['box_weights_%d' % level] = box_weights[level]
    level += 1

  labels['source_ids'] = source_ids
  labels['image_scales'] = image_scales
  return images, labels
def __call__(self, params):
  """Builds the instance-segmentation input pipeline.

  Supports an optional dynamic-input-shape mode in which two anchor labelers
  are built (one for `image_size`, one for the reversed size) and examples
  are batched by image height.

  Args:
    params: Parameter dictionary. Keys read here: `dynamic_input_shapes`,
      `dynamic_image_size` (only when dynamic shapes are on), `image_size`,
      `min_level`, `max_level`, `num_scales`, `aspect_ratios`,
      `anchor_scale`, `num_classes`, `rpn_positive_overlap`,
      `rpn_negative_overlap`, `rpn_batch_size_per_im`, `rpn_fg_fraction`,
      `use_category`, `skip_crowd_during_training`,
      `short_side_image_size`, `long_side_max_image_size`,
      `input_rand_hflip`, `train_scale_min`, `train_scale_max`,
      `gt_mask_size`, `use_bfloat16`, `transpose_input`,
      `global_batch_size` (dynamic shapes only) and, optionally,
      `batch_size` (defaults to 1 when absent).

  Returns:
    A `tf.data.Dataset`. Its elements are a `features` dict in PREDICT mode,
    `(image_shape, images, labels)` when dynamic input shapes are enabled,
    and `(images, labels)` otherwise.
  """
  is_training = self._mode == tf.estimator.ModeKeys.TRAIN
  is_predict = self._mode == tf.estimator.ModeKeys.PREDICT
  dynamic_shapes = params['dynamic_input_shapes']

  if dynamic_shapes:
    image_size = params['dynamic_image_size']
  else:
    image_size = (params['image_size'], params['image_size'])

  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'], image_size)
  anchor_labeler = anchors.AnchorLabeler(
      input_anchors, params['num_classes'], params['rpn_positive_overlap'],
      params['rpn_negative_overlap'], params['rpn_batch_size_per_im'],
      params['rpn_fg_fraction'])

  if dynamic_shapes:
    # Second labeler for images whose height is the long side: same
    # configuration, but with the two image dimensions swapped.
    height_long_side_image_size = image_size[::-1]
    height_long_side_input_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'],
        height_long_side_image_size)
    height_long_side_anchor_labeler = anchors.AnchorLabeler(
        height_long_side_input_anchors, params['num_classes'],
        params['rpn_positive_overlap'], params['rpn_negative_overlap'],
        params['rpn_batch_size_per_im'], params['rpn_fg_fraction'])

  example_decoder = tf_example_decoder.TfExampleDecoder(
      use_instance_mask=True)

  def _dataset_parser(value):
    """Parses one serialized example into an image and learning targets.

    Args:
      value: A serialized example holding an image and groundtruth
        annotations, as accepted by `example_decoder.decode`.

    Returns:
      A tuple of
      image: Preprocessed (normalized, resized and cropped) image tensor;
        cast to bfloat16 when `params['use_bfloat16']` is set.
      score_targets: Per-level score targets from `label_anchors` (indexed
        by level downstream).
      box_targets: Per-level box regression targets from `label_anchors`
        (indexed by level downstream).
      source_id: Numeric source image id; an empty id string becomes -1.
      image_info: Float tensor stacking [scaled_height, scaled_width,
        image_scale, original_height, original_width].
      boxes: Groundtruth boxes multiplied by `image_scale`, padded with -1
        to `[self._max_num_instances, 4]`.
      is_crowds: Crowd flags as float32, padded with 0 to
        `[self._max_num_instances, 1]`.
      areas: Groundtruth areas, padded with -1 to
        `[self._max_num_instances, 1]`.
      classes: Groundtruth classes, padded with -1 to
        `[self._max_num_instances, 1]`.
      cropped_gt_masks: Cropped groundtruth masks, padded with -1 and
        reshaped to `[self._max_num_instances, gt_mask_size + 4,
        gt_mask_size + 4]`.
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      instance_masks = data['groundtruth_instance_masks']
      boxes = data['groundtruth_boxes']
      # Classes become a float32 column vector (one row per annotation).
      classes = tf.reshape(
          tf.cast(data['groundtruth_classes'], dtype=tf.float32), [-1, 1])
      areas = data['groundtruth_area']
      is_crowds = data['groundtruth_is_crowd']

      if not params['use_category']:
        # Collapse every class id greater than 0 to a single class 1.0.
        classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

      if params['skip_crowd_during_training'] and is_training:
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)
        instance_masks = tf.gather_nd(instance_masks, indices)

      input_processor = InstanceSegmentationInputProcessor(
          image, image_size, params['short_side_image_size'],
          params['long_side_max_image_size'], boxes, classes,
          instance_masks)
      input_processor.normalize_image()
      if is_training and params['input_rand_hflip']:
        input_processor.random_horizontal_flip()
      if is_training:
        input_processor.set_training_random_scale_factors(
            params['train_scale_min'], params['train_scale_max'])
      else:
        input_processor.set_scale_factors_to_mlperf_reference_size()
      image = input_processor.resize_and_crop_image()
      boxes, classes = input_processor.resize_and_crop_boxes()
      instance_masks = input_processor.resize_and_crop_masks()
      cropped_gt_masks = input_processor.crop_gt_masks(
          instance_masks, boxes, params['gt_mask_size'], image_size)

      # Assign anchors.
      if dynamic_shapes:
        # Pick the labeler that matches the orientation of the scaled
        # image.
        is_height_short_side = tf.less(
            input_processor._scaled_height,  # pylint: disable=protected-access
            input_processor._scaled_width)  # pylint: disable=protected-access
        score_targets, box_targets = tf.cond(
            is_height_short_side,
            lambda: anchor_labeler.label_anchors(boxes, classes),
            lambda: height_long_side_anchor_labeler.label_anchors(
                boxes, classes))
      else:
        score_targets, box_targets = anchor_labeler.label_anchors(
            boxes, classes)

      # An empty id string is mapped to '-1' before the numeric conversion.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id)

      image_scale = input_processor.image_scale_to_original
      scaled_height = input_processor.get_height_length()
      scaled_width = input_processor.get_width_length()
      image_info = tf.stack([
          tf.to_float(scaled_height),
          tf.to_float(scaled_width),
          image_scale,
          tf.to_float(input_processor.get_original_height),
          tf.to_float(input_processor.get_original_width),
      ])

      # Pad groundtruth data for evaluation.
      boxes *= image_scale
      is_crowds = tf.cast(is_crowds, dtype=tf.float32)
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      is_crowds = pad_to_fixed_size(is_crowds, 0,
                                    [self._max_num_instances, 1])
      areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
      classes = pad_to_fixed_size(classes, -1,
                                  [self._max_num_instances, 1])

      # Pads cropped_gt_masks: flatten each mask, pad, then restore the
      # square (gt_mask_size + 4) x (gt_mask_size + 4) layout.
      mask_side = params['gt_mask_size'] + 4
      cropped_gt_masks = tf.reshape(cropped_gt_masks,
                                    [self._max_num_instances, -1])
      cropped_gt_masks = pad_to_fixed_size(
          cropped_gt_masks, -1, [self._max_num_instances, mask_side**2])
      cropped_gt_masks = tf.reshape(
          cropped_gt_masks,
          [self._max_num_instances, mask_side, mask_side])

      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)
      return (image, score_targets, box_targets, source_id, image_info,
              boxes, is_crowds, areas, classes, cropped_gt_masks)

  # `batch_size` may be absent from `params`; fall back to 1 in that case.
  batch_size = params['batch_size'] if 'batch_size' in params else 1
  dataset = tf.data.Dataset.list_files(self._file_pattern,
                                       shuffle=is_training)
  if is_training:
    dataset = dataset.repeat()

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          _prefetch_dataset, cycle_length=32, sloppy=is_training))

  if is_training:
    dataset = dataset.shuffle(64)

  # Parse the fetched records to input tensors for model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)

  if dynamic_shapes:
    # Group examples by image height so that every batch only contains
    # images with the same first dimension.
    def key_func(image, *args):
      del args
      return tf.cast(tf.shape(image)[0], dtype=tf.int64)

    def reduce_func(unused_key, dataset):
      return dataset.batch(batch_size, drop_remainder=True)

    dataset = dataset.apply(
        tf.contrib.data.group_by_window(
            key_func=key_func,
            reduce_func=reduce_func,
            window_size=params['global_batch_size']))
  else:
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True)

  def _process_example(images, score_targets, box_targets, source_ids,
                       image_info, boxes, is_crowds, areas, classes,
                       cropped_gt_masks):
    """Processes one batch of data."""
    # Transposes images from (N, H, W, C)->(H, W, N, C). As batch size is
    # less than 8, the batch goes to the second minor dimension.
    if params['transpose_input'] and is_training:
      images = tf.transpose(images, [1, 2, 0, 3])

    labels = {}
    for level in range(params['min_level'], params['max_level'] + 1):
      labels['score_targets_%d' % level] = score_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    # Concatenate groundtruth annotations to a tensor.
    groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2)
    labels['source_ids'] = source_ids
    labels['groundtruth_data'] = groundtruth_data
    labels['image_info'] = image_info
    labels['cropped_gt_masks'] = cropped_gt_masks

    if is_predict:
      features = dict(images=images,
                      image_info=image_info,
                      groundtruth_data=groundtruth_data,
                      source_ids=source_ids)
      return features
    if dynamic_shapes:
      # For dynamic input shapes, we have 2 TPU programs. A tf.cond op is
      # run on the host side to decide which TPU program to launch. As we
      # have data prefetch on the device side, the data for evaluating the
      # shape needs to be sent back from device to host. Thus we return the
      # `images` shape here explicitly to avoid copying the entire `images`
      # back.
      return tf.shape(images), images, labels
    return images, labels

  dataset = dataset.map(_process_example)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  return dataset
def __call__(self, params):
  """Builds the fixed-size instance-segmentation input pipeline.

  Args:
    params: Parameter dictionary. Keys read here: `image_size`,
      `min_level`, `max_level`, `num_scales`, `aspect_ratios`,
      `anchor_scale`, `num_classes`, `rpn_positive_overlap`,
      `rpn_negative_overlap`, `rpn_batch_size_per_im`, `rpn_fg_fraction`,
      `use_category`, `skip_crowd_during_training`, `input_rand_hflip`,
      `train_scale_min`, `train_scale_max`, `gt_mask_size`, `use_bfloat16`,
      `transpose_input` and, optionally, `batch_size` (defaults to 1 when
      absent).

  Returns:
    A `tf.data.Dataset`. Its elements are a `features` dict in PREDICT mode
    and `(images, labels)` otherwise. When `self._num_examples > 0` the
    dataset is truncated to that many batches; when `self._use_fake_data`
    is set, the first batch is cached and repeated forever.
  """
  is_training = self._mode == tf.estimator.ModeKeys.TRAIN
  is_predict = self._mode == tf.estimator.ModeKeys.PREDICT

  image_size = (params['image_size'], params['image_size'])
  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'], image_size)
  anchor_labeler = anchors.AnchorLabeler(
      input_anchors, params['num_classes'], params['rpn_positive_overlap'],
      params['rpn_negative_overlap'], params['rpn_batch_size_per_im'],
      params['rpn_fg_fraction'])
  example_decoder = tf_example_decoder.TfExampleDecoder(
      use_instance_mask=True)

  def _dataset_parser(value):
    """Parses one serialized example into an image and learning targets.

    Args:
      value: A serialized example holding an image and groundtruth
        annotations, as accepted by `example_decoder.decode`.

    Returns:
      A tuple of
      image: Preprocessed (normalized, resized and cropped) image tensor;
        cast to bfloat16 when `params['use_bfloat16']` is set.
      score_targets: Per-level score targets from `label_anchors` (indexed
        by level downstream).
      box_targets: Per-level box regression targets from `label_anchors`
        (indexed by level downstream).
      source_id: Numeric source image id; an empty id string becomes -1.
      image_info: Float tensor stacking [scaled_height, scaled_width,
        image_scale, original_height, original_width].
      boxes: Groundtruth boxes multiplied by `image_scale`, padded with -1
        to `[self._max_num_instances, 4]`.
      is_crowds: Crowd flags as float32, padded with 0 to
        `[self._max_num_instances, 1]`.
      areas: Groundtruth areas, padded with -1 to
        `[self._max_num_instances, 1]`.
      classes: Groundtruth classes, padded with -1 to
        `[self._max_num_instances, 1]`.
      cropped_gt_masks: Cropped groundtruth masks, padded with -1 and
        reshaped to `[self._max_num_instances, gt_mask_size + 4,
        gt_mask_size + 4]`.
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      instance_masks = data['groundtruth_instance_masks']
      boxes = data['groundtruth_boxes']
      # Classes become a float32 column vector (one row per annotation).
      classes = tf.reshape(
          tf.cast(data['groundtruth_classes'], dtype=tf.float32), [-1, 1])
      areas = data['groundtruth_area']
      is_crowds = data['groundtruth_is_crowd']

      if not params['use_category']:
        # Collapse every class id greater than 0 to a single class 1.0.
        classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

      if params['skip_crowd_during_training'] and is_training:
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)
        instance_masks = tf.gather_nd(instance_masks, indices)

      input_processor = InstanceSegmentationInputProcessor(
          image, image_size, boxes, classes, instance_masks)
      input_processor.normalize_image()
      if is_training and params['input_rand_hflip']:
        input_processor.random_horizontal_flip()
      if is_training:
        input_processor.set_training_random_scale_factors(
            params['train_scale_min'], params['train_scale_max'])
      else:
        input_processor.set_scale_factors_to_output_size()
      image = input_processor.resize_and_crop_image()
      boxes, classes = input_processor.resize_and_crop_boxes()
      instance_masks = input_processor.resize_and_crop_masks()
      cropped_gt_masks = input_processor.crop_gt_masks(
          instance_masks, boxes, params['gt_mask_size'], image_size)

      # Assign anchors.
      score_targets, box_targets = anchor_labeler.label_anchors(
          boxes, classes)

      # An empty id string is mapped to '-1' before the numeric conversion.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id)

      image_scale = input_processor.image_scale_to_original
      scaled_height = input_processor.get_height_length()
      scaled_width = input_processor.get_width_length()
      image_info = tf.stack([
          tf.to_float(scaled_height),
          tf.to_float(scaled_width),
          image_scale,
          tf.to_float(input_processor.get_original_height),
          tf.to_float(input_processor.get_original_width),
      ])

      # Pad groundtruth data for evaluation.
      boxes *= image_scale
      is_crowds = tf.cast(is_crowds, dtype=tf.float32)
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      is_crowds = pad_to_fixed_size(is_crowds, 0,
                                    [self._max_num_instances, 1])
      areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
      classes = pad_to_fixed_size(classes, -1,
                                  [self._max_num_instances, 1])

      # Pads cropped_gt_masks: flatten each mask, pad, then restore the
      # square (gt_mask_size + 4) x (gt_mask_size + 4) layout.
      mask_side = params['gt_mask_size'] + 4
      cropped_gt_masks = tf.reshape(cropped_gt_masks,
                                    [self._max_num_instances, -1])
      cropped_gt_masks = pad_to_fixed_size(
          cropped_gt_masks, -1, [self._max_num_instances, mask_side**2])
      cropped_gt_masks = tf.reshape(
          cropped_gt_masks,
          [self._max_num_instances, mask_side, mask_side])

      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)
      return (image, score_targets, box_targets, source_id, image_info,
              boxes, is_crowds, areas, classes, cropped_gt_masks)

  # `batch_size` may be absent from `params`; fall back to 1 in that case.
  batch_size = params['batch_size'] if 'batch_size' in params else 1
  dataset = tf.data.Dataset.list_files(self._file_pattern,
                                       shuffle=is_training)
  if is_training:
    dataset = dataset.repeat()

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          _prefetch_dataset, cycle_length=32, sloppy=is_training))

  if is_training:
    dataset = dataset.shuffle(64)

  # Parse the fetched records to input tensors for model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
  dataset = dataset.prefetch(batch_size)
  dataset = dataset.batch(batch_size, drop_remainder=True)

  def _process_example(images, score_targets, box_targets, source_ids,
                       image_info, boxes, is_crowds, areas, classes,
                       cropped_gt_masks):
    """Processes one batch of data."""
    # Transposes images for TPU performance.
    # Given the batch size, the batch dimension (N) goes to either the minor
    # ((H, W, C, N) when N > C) or the second-minor ((H, W, N, C) when
    # N < C) dimension. Here, we assume N is 4 or 8 and C is 3, so we use
    # (H, W, C, N).
    if params['transpose_input'] and is_training:
      images = tf.transpose(images, [1, 2, 3, 0])

    labels = {}
    for level in range(params['min_level'], params['max_level'] + 1):
      labels['score_targets_%d' % level] = score_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    # Concatenate groundtruth annotations to a tensor.
    groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2)
    labels['source_ids'] = source_ids
    labels['groundtruth_data'] = groundtruth_data
    labels['image_info'] = image_info
    labels['cropped_gt_masks'] = cropped_gt_masks

    if is_predict:
      features = dict(images=images,
                      image_info=image_info,
                      groundtruth_data=groundtruth_data,
                      source_ids=source_ids)
      return features
    return images, labels

  dataset = dataset.map(_process_example)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)

  if self._num_examples > 0:
    dataset = dataset.take(self._num_examples)
  if self._use_fake_data:
    # Turn this dataset into a semi-fake dataset which always loop at the
    # first batch. This reduces variance in performance and is useful in
    # testing.
    dataset = dataset.take(1).cache().repeat()
  return dataset
def __call__(self, params):
  """Builds the batched `tf.data` input pipeline for detection.

  Side effect: the graph-level random seed is set from the current wall
  clock time every time this is called.

  Args:
    params: Parameter dictionary. Keys read here: `min_level`, `max_level`,
      `num_scales`, `aspect_ratios`, `anchor_scale`, `image_size`,
      `num_classes`, `skip_crowd_during_training`, `input_rand_hflip`,
      `train_scale_min`, `train_scale_max`, `use_bfloat16` and
      `batch_size`.

  Returns:
    A `tf.data.Dataset` whose elements are `(images, labels)` batches.
    `labels` is a dict with the keys `mean_num_positives`,
    `cls_targets_<level>`, `box_targets_<level>`, `source_ids`,
    `groundtruth_data` and `image_scales`.
  """
  is_training = self._is_training

  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  params['image_size'])
  anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])
  example_decoder = tf_example_decoder.TfExampleDecoder()

  def _dataset_parser(value):
    """Parses one serialized example into an image and learning targets.

    Args:
      value: A serialized example holding an image and groundtruth
        annotations, as accepted by `example_decoder.decode`.

    Returns:
      A tuple of
      image: Preprocessed (normalized, resized and cropped) image tensor;
        cast to bfloat16 when `params['use_bfloat16']` is set.
      cls_targets: Per-level class targets from
        `anchor_labeler.label_anchors` (indexed by level downstream).
      box_targets: Per-level box regression targets from
        `anchor_labeler.label_anchors` (indexed by level downstream).
      num_positives: Number of positive anchors in the image.
      source_id: Numeric source image id; an empty id string becomes -1.
      image_scale: Scale of the processed image to the original image.
      boxes: Groundtruth boxes multiplied by `image_scale`, padded with -1
        to `[self._max_num_instances, 4]`.
      is_crowds: Crowd flags as float32, padded with 0 to
        `[self._max_num_instances, 1]`.
      areas: Groundtruth areas, padded with -1 to
        `[self._max_num_instances, 1]`.
      classes: Groundtruth classes, padded with -1 to
        `[self._max_num_instances, 1]`.
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      boxes = data['groundtruth_boxes']
      # Classes become a float32 column vector (one row per annotation).
      classes = tf.reshape(
          tf.cast(data['groundtruth_classes'], dtype=tf.float32), [-1, 1])
      areas = data['groundtruth_area']
      is_crowds = data['groundtruth_is_crowd']

      if params['skip_crowd_during_training'] and is_training:
        # Drop crowd annotations from the training targets only; the padded
        # groundtruth emitted below still carries the unfiltered
        # `is_crowds` and `areas`.
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)

      input_processor = DetectionInputProcessor(
          image, params['image_size'], boxes, classes)
      input_processor.normalize_image()
      if is_training and params['input_rand_hflip']:
        input_processor.random_horizontal_flip()
      if is_training:
        input_processor.set_training_random_scale_factors(
            params['train_scale_min'], params['train_scale_max'])
      else:
        input_processor.set_scale_factors_to_output_size()
      image = input_processor.resize_and_crop_image()
      boxes, classes = input_processor.resize_and_crop_boxes()

      # Assign anchors.
      (cls_targets, box_targets,
       num_positives) = anchor_labeler.label_anchors(boxes, classes)

      # An empty id string is mapped to '-1' before the numeric conversion.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id)

      # Pad groundtruth data for evaluation.
      image_scale = input_processor.image_scale_to_original
      boxes *= image_scale
      is_crowds = tf.cast(is_crowds, dtype=tf.float32)
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      is_crowds = pad_to_fixed_size(is_crowds, 0,
                                    [self._max_num_instances, 1])
      areas = pad_to_fixed_size(areas, -1, [self._max_num_instances, 1])
      classes = pad_to_fixed_size(classes, -1,
                                  [self._max_num_instances, 1])
      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)
      return (image, cls_targets, box_targets, num_positives, source_id,
              image_scale, boxes, is_crowds, areas, classes)

  batch_size = params['batch_size']

  # `tf.random.set_random_seed` returns None, so it must not be passed as
  # the `seed` argument of `list_files`. It is called here only for its side
  # effect (a time-based graph-level seed), and `list_files` keeps its
  # default `seed=None`, exactly as before.
  tf.random.set_random_seed(int(time.time() * 1e9))
  dataset = tf.data.Dataset.list_files(self._file_pattern,
                                       shuffle=is_training)
  if is_training:
    dataset = dataset.repeat()

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          _prefetch_dataset, cycle_length=32, sloppy=is_training))
  if is_training:
    dataset = dataset.shuffle(64)

  # Parse the fetched records to input tensors for model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=True)

  def _process_example(images, cls_targets, box_targets, num_positives,
                       source_ids, image_scales, boxes, is_crowds, areas,
                       classes):
    """Processes one batch of data into `(images, labels)`."""
    labels = {}
    # Count num_positives in a batch: the batch mean is repeated once per
    # example so the label has shape [batch_size, 1].
    num_positives_batch = tf.reduce_mean(num_positives)
    labels['mean_num_positives'] = tf.reshape(
        tf.tile(tf.expand_dims(num_positives_batch, 0), [batch_size]),
        [batch_size, 1])

    for level in range(params['min_level'], params['max_level'] + 1):
      labels['cls_targets_%d' % level] = cls_targets[level]
      labels['box_targets_%d' % level] = box_targets[level]
    # Concatenate groundtruth annotations to a tensor.
    groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2)
    labels['source_ids'] = source_ids
    labels['groundtruth_data'] = groundtruth_data
    labels['image_scales'] = image_scales
    return images, labels

  dataset = dataset.map(_process_example)
  dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
  return dataset
def __call__(self, params):
  """Builds the input pipeline and returns one batch of images and labels.

  Args:
    params: dictionary of hyperparameters. The keys read here are
      `min_level`, `max_level`, `num_scales`, `aspect_ratios`,
      `anchor_scale`, `image_size`, `num_classes`, `input_rand_hflip` and
      `batch_size`.

  Returns:
    A tuple `(images, labels)` obtained from a one-shot iterator over the
    batched dataset. `labels` is a dictionary with `mean_num_positives`,
    per-level `cls_targets_<level>` / `box_targets_<level>` entries,
    `source_ids` and `image_scales`.
  """
  anchor_keys = ('min_level', 'max_level', 'num_scales', 'aspect_ratios',
                 'anchor_scale', 'image_size')
  anchor_grid = anchors.Anchors(*[params[key] for key in anchor_keys])
  labeler = anchors.AnchorLabeler(anchor_grid, params['num_classes'])
  decoder = tf_example_decoder.TfExampleDecoder()

  def get_dataset_for_mode(data_dir, is_training):
    """Returns the file glob of the input shards for the given mode."""
    if is_training:
      template = '%s/coco_train2017_nocrowd-*'
    else:
      template = '%s/coco_val2017-*'
    return template % data_dir

  def _dataset_parser(value):
    """Decodes one serialized example into a padded image and its targets."""
    with tf.name_scope('parser'):
      decoded = decoder.decode(value)
      raw_source_id = decoded['source_id']
      pixels = decoded['image']
      gt_boxes = decoded['groundtruth_boxes']
      gt_classes = tf.reshape(
          tf.cast(decoded['groundtruth_classes'], dtype=tf.float32), [-1, 1])

      # The image normalization is identical to Cloud TPU ResNet-50.
      pixels = _normalize_image(
          tf.image.convert_image_dtype(pixels, dtype=tf.float32))

      if params['input_rand_hflip']:
        pixels, gt_boxes = preprocessor.random_horizontal_flip(
            pixels, boxes=gt_boxes)

      # Remember the pre-resize shape so the scale back to it can be derived.
      shape_before_resize = tf.shape(pixels)
      target_side = params['image_size']
      pixels, _ = preprocessor.resize_to_range(
          pixels, min_dimension=target_side, max_dimension=target_side)
      height_before = tf.to_float(shape_before_resize[0])
      height_after = tf.to_float(tf.shape(pixels)[0])
      image_scale = height_before / height_after

      pixels, gt_boxes = preprocessor.scale_boxes_to_pixel_coordinates(
          pixels, gt_boxes, keypoints=None)
      pixels = tf.image.pad_to_bounding_box(
          pixels, 0, 0, target_side, target_side)

      cls_targets, box_targets, num_positives = labeler.label_anchors(
          gt_boxes, gt_classes)
      numeric_source_id = tf.string_to_number(
          raw_source_id, out_type=tf.float32)
    return (pixels, cls_targets, box_targets, num_positives,
            numeric_source_id, image_scale)

  batch_size = params['batch_size']
  file_glob = get_dataset_for_mode(self._data_dir, self._is_training)
  records = tf.data.Dataset.list_files(file_glob).shuffle(buffer_size=1024)
  if self._is_training:
    records = records.repeat()

  def prefetch_dataset(filename):
    """Opens one TFRecord file with a single-element prefetch buffer."""
    return tf.data.TFRecordDataset(filename).prefetch(1)

  interleave = tf.contrib.data.parallel_interleave(
      prefetch_dataset, cycle_length=32, sloppy=True)
  batched = (
      records.apply(interleave)
      .shuffle(20)
      .map(_dataset_parser, num_parallel_calls=64)
      .prefetch(batch_size)
      .apply(tf.contrib.data.batch_and_drop_remainder(batch_size))
      .prefetch(1))

  (images, cls_targets, box_targets, num_positives, source_ids,
   image_scales) = batched.make_one_shot_iterator().get_next()

  # The batch-wide mean of num_positives is replicated once per example.
  mean_positives = tf.reduce_mean(num_positives)
  labels = {
      'mean_num_positives': tf.reshape(
          tf.tile(tf.expand_dims(mean_positives, 0), [batch_size]),
          [batch_size, 1]),
  }
  for level in range(params['min_level'], params['max_level'] + 1):
    labels['cls_targets_%d' % level] = cls_targets[level]
    labels['box_targets_%d' % level] = box_targets[level]
  labels['source_ids'] = source_ids
  labels['image_scales'] = image_scales
  return images, labels
def __call__(self, params, num_examples=0):
  """Builds the `tf.data` input pipeline for the mode stored in `self._mode`.

  Args:
    params: dictionary of hyperparameters indexed by string keys (anchor
      settings, image sizes, sharding and batching options). `batch_size`
      is optional and falls back to 1.
    num_examples: integer controlling how much data is produced. In PREDICT
      mode, a value larger than `params['eval_samples']` appends
      `num_examples - params['eval_samples']` padding examples. In TRAIN
      mode, a positive value limits the dataset to that many batches. With
      `self._distributed_eval`, it determines how many batches are cached
      and repeated.

  Returns:
    A `tf.data.Dataset` whose batches have been passed through
    `self._transform_images`.
  """
  image_size = params['image_size']

  def _build_anchor_labeler(size):
    """Creates an AnchorLabeler for anchors generated at image size `size`."""
    level_anchors = anchors.Anchors(
        params['min_level'], params['max_level'], params['num_scales'],
        params['aspect_ratios'], params['anchor_scale'], size)
    return anchors.AnchorLabeler(
        level_anchors, params['num_classes'], params['rpn_positive_overlap'],
        params['rpn_negative_overlap'], params['rpn_batch_size_per_im'],
        params['rpn_fg_fraction'])

  # Two labelers are needed because images can have either orientation:
  # one for `image_size` as given and one for the reversed size.
  anchor_labeler = _build_anchor_labeler(image_size)
  height_long_side_anchor_labeler = _build_anchor_labeler(image_size[::-1])

  example_decoder = tf_example_decoder.TfExampleDecoder(
      use_instance_mask=True)

  def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets.

    Args:
      value: A serialized example holding an image and groundtruth
        annotations; it is decoded with `example_decoder`.

    Returns:
      In PREDICT mode, only `features`. Otherwise `(features, labels)`.

      features: A dictionary that contains the image and auxiliary
        information. The following describes {key: value} pairs in the
        dictionary.
        images: An image tensor that is normalized, resized and cropped by
          the input processor (cast to bfloat16 when
          `params['use_bfloat16']` is set).
        image_info: Image information that includes the original height and
          width, the scale of the processed image to the original image, and
          the scaled height and width.
        source_ids: Source image id. Default value -1 if the source id is
          empty in the groundtruth annotation.
      labels: (omitted in PREDICT mode) A dictionary that contains
        groundtruth labels. The following describes {key: value} pairs in
        the dictionary.
        score_targets_<level>: One entry per level in
          [min_level, ..., max_level]. The values are tensors with shape
          [height_l, width_l, num_anchors]. The height_l and width_l
          represent the dimension of objectiveness score at l-th level.
        box_targets_<level>: One entry per level in
          [min_level, ..., max_level]. The values are tensors with shape
          [height_l, width_l, num_anchors * 4]. The height_l and width_l
          represent the dimension of bounding box regression output at l-th
          level.
        gt_boxes: Groundtruth bounding box annotations. The box is
          represented in [y1, x1, y2, x2] format. The tensor is padded with
          -1 to the fixed dimension [self._max_num_instances, 4].
        gt_classes: Groundtruth classes annotations. The tensor is padded
          with -1 to the fixed dimension [self._max_num_instances, 1].
        cropped_gt_masks: Groundtruth masks cropped by the bounding box and
          resized to a fixed size determined by params['gt_mask_size'].
    """
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)

      image = data['image']
      source_id = data['source_id']
      # An empty source id is replaced with '-1' before the numeric parse.
      source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1',
                           source_id)
      source_id = tf.string_to_number(source_id)

      if self._mode == tf.estimator.ModeKeys.PREDICT:
        # Prediction needs only the preprocessed image; no groundtruth.
        input_processor = InstanceSegmentationInputProcessor(
            image, image_size, params['short_side_image_size'],
            params['long_side_max_image_size'])
        input_processor.normalize_image()
        input_processor.set_scale_factors_to_mlperf_reference_size()
        image = input_processor.resize_and_crop_image()
        if params['use_bfloat16']:
          image = tf.cast(image, dtype=tf.bfloat16)

        image_info = input_processor.get_image_info()
        return {'images': image, 'image_info': image_info,
                'source_ids': source_id}

      # The following part is for training.
      instance_masks = data['groundtruth_instance_masks']
      boxes = data['groundtruth_boxes']
      classes = data['groundtruth_classes']
      classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
      if not params['use_category']:
        # Collapse all categories into a single foreground class.
        classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)

      if (params['skip_crowd_during_training'] and
          self._mode == tf.estimator.ModeKeys.TRAIN):
        # Keep only the annotations that are not flagged as crowd.
        indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
        classes = tf.gather_nd(classes, indices)
        boxes = tf.gather_nd(boxes, indices)
        instance_masks = tf.gather_nd(instance_masks, indices)

      input_processor = InstanceSegmentationInputProcessor(
          image, image_size, params['short_side_image_size'],
          params['long_side_max_image_size'], boxes, classes,
          instance_masks)
      input_processor.normalize_image()
      if params['input_rand_hflip']:
        input_processor.random_horizontal_flip()

      input_processor.set_scale_factors_to_mlperf_reference_size()
      image = input_processor.resize_and_crop_image()

      boxes, classes = input_processor.resize_and_crop_boxes()
      cropped_gt_masks = input_processor.crop_gt_masks(
          params['gt_mask_size'])

      image_info = input_processor.get_image_info()
      # Assign anchors, picking the labeler that matches the orientation.
      # NOTE(review): presumably image_info[3] / image_info[4] are the
      # scaled height / width -- confirm against get_image_info().
      is_height_short_side = tf.less(image_info[3], image_info[4])
      score_targets, box_targets = tf.cond(
          is_height_short_side,
          lambda: anchor_labeler.label_anchors(boxes, classes),
          lambda: height_long_side_anchor_labeler.label_anchors(boxes, classes))  # pylint: disable=line-too-long

      # Pad groundtruth data.
      boxes *= image_info[2]
      boxes = pad_to_fixed_size(boxes, -1, [self._max_num_instances, 4])
      classes = pad_to_fixed_size(classes, -1, [self._max_num_instances, 1])

      # Pads cropped_gt_masks. The masks are flattened, padded along the
      # instance dimension, then restored to square. The reshape relies on
      # each cropped mask being (gt_mask_size + 4) pixels per side.
      padded_mask_size = params['gt_mask_size'] + 4
      cropped_gt_masks = tf.reshape(
          cropped_gt_masks, [-1, padded_mask_size ** 2])
      cropped_gt_masks = pad_to_fixed_size(
          cropped_gt_masks, -1,
          [self._max_num_instances, padded_mask_size ** 2])
      cropped_gt_masks = tf.reshape(
          cropped_gt_masks,
          [self._max_num_instances, padded_mask_size, padded_mask_size])

      if params['use_bfloat16']:
        image = tf.cast(image, dtype=tf.bfloat16)

      features = {}
      features['images'] = image
      features['image_info'] = image_info
      features['source_ids'] = source_id

      labels = {}
      for level in range(params['min_level'], params['max_level'] + 1):
        labels['score_targets_%d' % level] = score_targets[level]
        labels['box_targets_%d' % level] = box_targets[level]
      labels['gt_boxes'] = boxes
      labels['gt_classes'] = classes
      labels['cropped_gt_masks'] = cropped_gt_masks
      return features, labels

  batch_size = params['batch_size'] if 'batch_size' in params else 1
  dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
  if self._mode == tf.estimator.ModeKeys.TRAIN:
    # Shard and shuffle the image files so each shard has a distinctive and
    # random set of images.
    # To improve model convergence under a large number of hosts, multiple
    # hosts may share the same dataset shard. This allows a host to get more
    # training images.
    if 'dataset_num_shards' in params:
      train_actual_num_shards = int(params['dataset_num_shards'] //
                                    params['hosts_per_dataset_shard'])
      dataset = dataset.shard(
          train_actual_num_shards,
          int(params['dataset_shard_id'] //
              params['hosts_per_dataset_shard']))
      dataset = dataset.shuffle(tf.to_int64(256 // train_actual_num_shards))

  # Prefetch data from files.
  def _prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  # Sloppy (non-deterministic order) interleaving is enabled only in TRAIN.
  dataset = dataset.apply(
      tf.data.experimental.parallel_interleave(
          _prefetch_dataset, cycle_length=32,
          sloppy=(self._mode == tf.estimator.ModeKeys.TRAIN)))

  if self._mode == tf.estimator.ModeKeys.TRAIN:
    # Cache the raw images and shuffle them with a reasonably large buffer.
    dataset = dataset.cache().shuffle(params['shuffle_buffer_size']).repeat()

  if self._distributed_eval:
    dataset = dataset.shard(params['dataset_num_shards'],
                            params['dataset_shard_id'])

  # Parse the fetched records to input tensors for model function.
  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)

  def horizontal_image(*args):
    # The first element is the features dict in every mode.
    image_info = args[0]['image_info']
    return tf.less(image_info[3], image_info[4])

  def vertical_image(*args):
    return tf.logical_not(horizontal_image(*args))

  # Pad dataset to the desired size and mark if the dataset is padding.
  # During PREDICT, if batch_size_per_shard * num_shards > 5000, the
  # original dataset size won't be evenly divisible by the number of shards.
  # Note that 5000 is the number of eval samples in COCO dataset. In this
  # case, the eval dataset will take (batch_per_shard * num_shards - 5000)
  # samples from the original dataset and mark those extra samples as
  # `is_padding` and the original data as `is_not_padding`. This ensures
  # correctness of evaluation on only 5000 samples.
  # Appends the dataset padding to the original dataset (only in PREDICT).
  if (self._mode == tf.estimator.ModeKeys.PREDICT and
      num_examples > params['eval_samples']):

    def _mark_is_padding(features):
      features[mask_rcnn_params.IS_PADDING] = tf.constant(
          True, dtype=tf.bool, shape=[1])
      return features

    def _mark_is_not_padding(features):
      features[mask_rcnn_params.IS_PADDING] = tf.constant(
          False, dtype=tf.bool, shape=[1])
      return features

    dataset_padding = dataset
    # Pad an equal number of horizontal and vertical images and interleave
    # them.
    pad_size = int(math.ceil(num_examples - params['eval_samples']))
    dataset_padding_hor = dataset_padding.filter(horizontal_image).map(
        _mark_is_padding).take(pad_size)
    dataset_padding_ver = dataset_padding.filter(vertical_image).map(
        _mark_is_padding).take(pad_size)
    interleaved_dataset_padding = tf.data.experimental.choose_from_datasets(
        [dataset_padding_hor, dataset_padding_ver],
        tf.data.Dataset.range(2).repeat(pad_size))
    if self._distributed_eval:
      dataset = dataset.map(_mark_is_not_padding).take(
          int(
              math.ceil(params['eval_samples'] /
                        params['dataset_num_shards'])))
    else:
      dataset = dataset.map(_mark_is_not_padding).take(params['eval_samples'])
    dataset = dataset.concatenate(interleaved_dataset_padding)

  # Examples are grouped by orientation so that every batch contains only
  # horizontal or only vertical images.
  def key_func(*args):
    return tf.cast(horizontal_image(*args), dtype=tf.int64)

  def reduce_func(unused_key, dataset):
    return dataset.batch(batch_size, drop_remainder=True)

  # Use the resolved `batch_size` (which falls back to 1) instead of
  # re-indexing `params`, so a missing 'batch_size' key cannot raise
  # KeyError here after the fallback was already applied above.
  dataset = dataset.apply(
      tf.data.experimental.group_by_window(
          key_func=key_func,
          reduce_func=reduce_func,
          window_size=(batch_size * params['replicas_per_worker'])))

  dataset = dataset.map(
      functools.partial(self._transform_images, params),
      num_parallel_calls=16)

  dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

  if (self._mode == tf.estimator.ModeKeys.TRAIN and num_examples > 0):
    dataset = dataset.take(num_examples)

  # Make eval dataset repeat to get rid of eval dataset init per epoch.
  if self._distributed_eval:
    dataset = dataset.take(
        int(num_examples / params['dataset_num_shards'] /
            batch_size)).cache().repeat()
  if self._use_fake_data:
    # Turn this dataset into a semi-fake dataset which always loops at the
    # first batch. This reduces variance in performance and is useful in
    # testing.
    dataset = dataset.take(1).cache().repeat()

  options = tf.data.Options()
  options.experimental_threading.max_intra_op_parallelism = 1
  dataset = dataset.with_options(options)

  return dataset