def _normalize_image(image):
  """Normalize the image to zero mean and unit variance."""
  offset = tf.constant([0.485, 0.456, 0.406])
  offset = tf.expand_dims(offset, axis=0)
  offset = tf.expand_dims(offset, axis=0)
  image -= offset

  scale = tf.constant([0.229, 0.224, 0.225])
  scale = tf.expand_dims(scale, axis=0)
  scale = tf.expand_dims(scale, axis=0)
  image /= scale
  return image

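# A minimal sketch (an illustration, not part of the original file) of the
# broadcasting behavior: the two expand_dims calls give `offset` and `scale`
# shape [1, 1, 3], which broadcasts channel-wise over an [H, W, 3] image.
# The constants are the standard ImageNet RGB means and standard deviations,
# so inputs are expected in [0, 1], hence convert_image_dtype in the callers.
def _normalize_image_demo():
  raw_image = tf.zeros([640, 640, 3], dtype=tf.uint8)
  image = tf.image.convert_image_dtype(raw_image, dtype=tf.float32)
  return _normalize_image(image)  # Per-channel (x - mean) / std.
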
def normalize_image(self):
  """Normalize the image to zero mean and unit variance."""
  # The image normalization is identical to Cloud TPU ResNet.
  self._image = tf.image.convert_image_dtype(self._image, dtype=tf.float32)
  offset = tf.constant([0.485, 0.456, 0.406])
  offset = tf.expand_dims(offset, axis=0)
  offset = tf.expand_dims(offset, axis=0)
  self._image -= offset

  # This is similar to `PIXEL_MEANS` in the reference. Reference:
  # https://github.com/ddkang/Detectron/blob/80f329530843e66d07ca39e19901d5f3e5daf009/lib/core/config.py#L909  # pylint: disable=line-too-long
  scale = tf.constant([0.229, 0.224, 0.225])
  scale = tf.expand_dims(scale, axis=0)
  scale = tf.expand_dims(scale, axis=0)
  self._image /= scale

def _process_example(params, batch_size, images, cls_targets, box_targets,
                     num_positives, source_ids, image_scales, boxes,
                     is_crowds, areas, classes):
  """Processes one batch of data."""
  # `params` and `batch_size` are passed in explicitly so the helper does
  # not depend on names from an enclosing scope.
  labels = {}
  # Count num_positives in a batch.
  num_positives_batch = tf.reduce_mean(num_positives)
  labels['mean_num_positives'] = tf.reshape(
      tf.tile(tf.expand_dims(num_positives_batch, 0), [batch_size]),
      [batch_size, 1])

  for level in range(params['min_level'], params['max_level'] + 1):
    labels['cls_targets_%d' % level] = cls_targets[level]
    labels['box_targets_%d' % level] = box_targets[level]

  # Concatenate groundtruth annotations to a tensor.
  groundtruth_data = tf.concat([boxes, is_crowds, areas, classes], axis=2)
  labels['source_ids'] = source_ids
  labels['groundtruth_data'] = groundtruth_data
  labels['image_scales'] = image_scales
  return images, labels

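# Hedged sketch: with `params` and `batch_size` made explicit arguments
# above, the helper can be mapped over an already-batched dataset whose
# elements are the matching 10-tuples. functools.partial binds the two
# leading arguments; tf.data unpacks the tuple elements into the rest.
def _map_process_example_demo(params, batched_dataset):
  import functools  # Local import keeps the sketch self-contained.
  return batched_dataset.map(
      functools.partial(_process_example, params, params['batch_size']))
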
def __call__(self, params):
  input_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                  params['num_scales'],
                                  params['aspect_ratios'],
                                  params['anchor_scale'],
                                  params['image_size'])
  anchor_labeler = anchors.AnchorLabeler(input_anchors, params['num_classes'])
  example_decoder = tf_example_decoder.TfExampleDecoder()

  def get_dataset_for_mode(data_dir, is_training):
    """Return the location of input samples for a given mode."""
    if is_training:
      return '%s/coco_train2017_nocrowd-*' % data_dir
    return '%s/coco_val2017-*' % data_dir

  def _dataset_parser(value):
    """Parse data to a fixed dimension input image and learning targets."""
    with tf.name_scope('parser'):
      data = example_decoder.decode(value)
      source_id = data['source_id']
      image = data['image']
      boxes = data['groundtruth_boxes']
      classes = data['groundtruth_classes']
      classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])

      # The image normalization is identical to Cloud TPU ResNet-50.
      image = tf.image.convert_image_dtype(image, dtype=tf.float32)
      image = _normalize_image(image)

      if params['input_rand_hflip']:
        image, boxes = preprocessor.random_horizontal_flip(image, boxes=boxes)
      image_original_shape = tf.shape(image)
      image, _ = preprocessor.resize_to_range(
          image,
          min_dimension=params['image_size'],
          max_dimension=params['image_size'])
      image_scale = tf.to_float(image_original_shape[0]) / tf.to_float(
          tf.shape(image)[0])
      image, boxes = preprocessor.scale_boxes_to_pixel_coordinates(
          image, boxes, keypoints=None)

      image = tf.image.pad_to_bounding_box(image, 0, 0, params['image_size'],
                                           params['image_size'])
      (cls_targets, box_targets,
       num_positives) = anchor_labeler.label_anchors(boxes, classes)

      source_id = tf.string_to_number(source_id, out_type=tf.float32)
      row = (image, cls_targets, box_targets, num_positives, source_id,
             image_scale)
      return row

  batch_size = params['batch_size']
  data_file_pattern = get_dataset_for_mode(self._data_dir, self._is_training)
  dataset = tf.data.Dataset.list_files(data_file_pattern)

  dataset = dataset.shuffle(buffer_size=1024)
  if self._is_training:
    dataset = dataset.repeat()

  def prefetch_dataset(filename):
    dataset = tf.data.TFRecordDataset(filename).prefetch(1)
    return dataset

  dataset = dataset.apply(
      tf.contrib.data.parallel_interleave(
          prefetch_dataset, cycle_length=32, sloppy=True))
  dataset = dataset.shuffle(20)

  dataset = dataset.map(_dataset_parser, num_parallel_calls=64)
  dataset = dataset.prefetch(batch_size)
  dataset = dataset.apply(
      tf.contrib.data.batch_and_drop_remainder(batch_size))
  dataset = dataset.prefetch(1)

  (images, cls_targets, box_targets, num_positives, source_ids,
   image_scales) = dataset.make_one_shot_iterator().get_next()
  labels = {}
  # Count num_positives in a batch.
  num_positives_batch = tf.reduce_mean(num_positives)
  labels['mean_num_positives'] = tf.reshape(
      tf.tile(tf.expand_dims(num_positives_batch, 0), [batch_size]),
      [batch_size, 1])

  for level in range(params['min_level'], params['max_level'] + 1):
    labels['cls_targets_%d' % level] = cls_targets[level]
    labels['box_targets_%d' % level] = box_targets[level]
  labels['source_ids'] = source_ids
  labels['image_scales'] = image_scales
  return images, labels

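# Hedged sketch of the wrapper class assumed around __call__ above; the
# name `InputReader` and its constructor are inferred from the attributes
# `self._data_dir` and `self._is_training`, not confirmed by this file.
class InputReader(object):
  """Input function object returning (images, labels) for TPUEstimator."""

  def __init__(self, data_dir, is_training):
    self._data_dir = data_dir
    self._is_training = is_training

  # __call__ as defined above would be a method of this class; an
  # estimator then invokes the instance as input_fn, e.g.:
  #   train_input_fn = InputReader(FLAGS.data_dir, is_training=True)
  #   estimator.train(input_fn=train_input_fn, max_steps=train_steps)
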
def _model_fn(features, labels, mode, params, model):
  """Model definition for the RetinaNet model based on ResNet-50.

  Args:
    features: the input image tensor with shape [batch_size, height, width,
      3]. The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class
      targets and box targets, which are dense label maps. The labels are
      generated from the get_input_fn function in data/dataloader.py.
    mode: the mode of TPUEstimator, including TRAIN, EVAL, and PREDICT.
    params: the dictionary that defines the hyperparameters of the model.
      The default settings are in the default_hparams function in this file.
    model: the RetinaNet model that outputs class logits and box regression
      outputs.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or
      prediction.
  """
  cls_outputs, box_outputs = model(
      features,
      min_level=params['min_level'],
      max_level=params['max_level'],
      num_classes=params['num_classes'],
      num_anchors=len(params['aspect_ratios']) * params['num_scales'],
      is_training_bn=params['is_training_bn'])
  levels = cls_outputs.keys()

  # First check if it is in PREDICT mode.
  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        'image': features,
    }
    for level in levels:
      predictions['cls_outputs_%d' % level] = cls_outputs[level]
      predictions['box_outputs_%d' % level] = box_outputs[level]
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Load pretrained model from checkpoint.
  if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      tf.train.init_from_checkpoint(params['resnet_checkpoint'], {
          '/': 'resnet50/',
      })
      return tf.train.Scaffold()
  else:
    scaffold_fn = None

  # Set up training loss and learning rate.
  global_step = tf.train.get_global_step()
  learning_rate = _learning_rate_schedule(params['learning_rate'],
                                          params['lr_warmup_step'],
                                          params['lr_drop_step'], global_step)
  # cls_loss and box_loss are for logging; only total_loss is optimized.
  total_loss, cls_loss, box_loss = _detection_loss(cls_outputs, box_outputs,
                                                   labels, params)

  if mode == tf.estimator.ModeKeys.TRAIN:
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, momentum=params['momentum'])
    if params['use_tpu']:
      optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(total_loss, global_step)
  else:
    train_op = None

  # Evaluation only works on GPU/CPU host and batch_size=1.
  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(**kwargs):
      """Evaluation metric fn. Performed on CPU, do not reference TPU ops."""
      eval_anchors = anchors.Anchors(params['min_level'],
                                     params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'])
      anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                             params['num_classes'])
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

      # Assemble per-level outputs to generate detections.
      cls_outputs = {}
      box_outputs = {}
      for level in range(params['min_level'], params['max_level'] + 1):
        cls_outputs[level] = kwargs['cls_outputs_%d' % level]
        box_outputs[level] = kwargs['box_outputs_%d' % level]
      detections = anchor_labeler.generate_detections(
          cls_outputs, box_outputs, kwargs['source_ids'])
      eval_metric = coco_metric.EvaluationMetric(params['val_json_file'])
      coco_metrics = eval_metric.estimator_metric_fn(detections,
                                                     kwargs['image_scales'])
      # Add metrics to output.
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics

    batch_size = params['batch_size']
    cls_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(cls_loss, 0), [batch_size]), [batch_size, 1])
    box_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(box_loss, 0), [batch_size]), [batch_size, 1])
    metric_fn_inputs = {
        'cls_loss_repeat': cls_loss_repeat,
        'box_loss_repeat': box_loss_repeat,
        'source_ids': labels['source_ids'],
        'image_scales': labels['image_scales'],
    }
    for level in range(params['min_level'], params['max_level'] + 1):
      metric_fn_inputs['cls_outputs_%d' % level] = cls_outputs[level]
      metric_fn_inputs['box_outputs_%d' % level] = box_outputs[level]
    eval_metrics = (metric_fn, metric_fn_inputs)

  return tpu_estimator.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metrics=eval_metrics,
      scaffold_fn=scaffold_fn)

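# Hedged sketch of wiring _model_fn into a TPUEstimator; the network
# builder `retinanet_architecture.retinanet` is an assumption about the
# surrounding repo, not confirmed by this file. Note that `hparams_dict`
# must not contain a 'batch_size' key: TPUEstimator reserves it and
# injects the per-shard value derived from train_batch_size.
def _make_estimator_demo(run_config, hparams_dict, train_batch_size):
  import functools  # Local import keeps the sketch self-contained.
  model_fn = functools.partial(
      _model_fn, model=retinanet_architecture.retinanet)
  return tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=True,
      train_batch_size=train_batch_size,
      config=run_config,  # A tpu_config.RunConfig built elsewhere.
      params=hparams_dict)
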
def random_horizontal_flip(self):
  """Randomly flip input image and segmentation label."""
  self._label = tf.expand_dims(self._label, 0)
  self._image, self._label = preprocessor.random_horizontal_flip(
      self._image, masks=self._label)
  self._label = self._label[0, :, :]

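# A minimal sketch (assuming the object_detection preprocessor API used
# above) of the shape contract the method relies on: `masks` must be rank
# 3, [num_masks, height, width], to be flipped in sync with the image,
# hence the expand_dims / slice pair around the call.
def _flip_demo():
  image = tf.zeros([512, 512, 3])
  label = tf.zeros([512, 512])
  image, masks = preprocessor.random_horizontal_flip(
      image, masks=tf.expand_dims(label, 0))
  label = masks[0, :, :]  # Back to [height, width].
  return image, label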