示例#1
0
    def compute_eval_dict(features, labels):
        """Compute the evaluation result on an image."""
        # For evaling on train data, it is necessary to check whether groundtruth
        # must be unpadded.
        boxes_shape = (labels[
            fields.InputDataFields.groundtruth_boxes].get_shape().as_list())
        unpad_groundtruth_tensors = (boxes_shape[1] is not None and not use_tpu
                                     and batch_size == 1)
        labels = model_lib.unstack_batch(
            labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        losses_dict, prediction_dict = _compute_losses_and_predictions_dicts(
            detection_model, features, labels, add_regularization_loss)

        def postprocess_wrapper(args):
            return detection_model.postprocess(args[0], args[1])

        # TODO(kaftan): Depending on how postprocessing will work for TPUS w/
        ## TPUStrategy, may be good to move wrapping to a utility method
        if use_tpu and postprocess_on_cpu:
            detections = contrib_tpu.outside_compilation(
                postprocess_wrapper,
                (prediction_dict,
                 features[fields.InputDataFields.true_image_shape]))
        else:
            detections = postprocess_wrapper(
                (prediction_dict,
                 features[fields.InputDataFields.true_image_shape]))

        class_agnostic = (fields.DetectionResultFields.detection_classes
                          not in detections)
        # TODO(kaftan) (or anyone): move `_prepare_groundtruth_for_eval to eval_util
        ## and call this from there.
        groundtruth = model_lib._prepare_groundtruth_for_eval(  # pylint: disable=protected-access
            detection_model, class_agnostic,
            eval_input_config.max_number_of_boxes)
        use_original_images = fields.InputDataFields.original_image in features
        if use_original_images:
            eval_images = features[fields.InputDataFields.original_image]
            true_image_shapes = tf.slice(
                features[fields.InputDataFields.true_image_shape], [0, 0],
                [-1, 3])
            original_image_spatial_shapes = features[
                fields.InputDataFields.original_image_spatial_shape]
        else:
            eval_images = features[fields.InputDataFields.image]
            true_image_shapes = None
            original_image_spatial_shapes = None

        eval_dict = eval_util.result_dict_for_batched_example(
            eval_images,
            features[inputs.HASH_KEY],
            detections,
            groundtruth,
            class_agnostic=class_agnostic,
            scale_to_absolute=True,
            original_image_spatial_shapes=original_image_spatial_shapes,
            true_image_shapes=true_image_shapes)

        return eval_dict, losses_dict, class_agnostic
示例#2
0
  def _make_evaluation_dict(self,
                            resized_groundtruth_masks=False,
                            batch_size=1,
                            max_gt_boxes=None,
                            scale_to_absolute=False):
    input_data_fields = fields.InputDataFields
    detection_fields = fields.DetectionResultFields

    image = tf.zeros(shape=[batch_size, 20, 20, 3], dtype=tf.uint8)
    if batch_size == 1:
      key = tf.constant('image1')
    else:
      key = tf.constant([str(i) for i in range(batch_size)])
    detection_boxes = tf.tile(tf.constant([[[0., 0., 1., 1.]]]),
                              multiples=[batch_size, 1, 1])
    detection_scores = tf.tile(tf.constant([[0.8]]), multiples=[batch_size, 1])
    detection_classes = tf.tile(tf.constant([[0]]), multiples=[batch_size, 1])
    detection_masks = tf.tile(tf.ones(shape=[1, 1, 20, 20], dtype=tf.float32),
                              multiples=[batch_size, 1, 1, 1])
    num_detections = tf.ones([batch_size])
    groundtruth_boxes = tf.constant([[0., 0., 1., 1.]])
    groundtruth_classes = tf.constant([1])
    groundtruth_instance_masks = tf.ones(shape=[1, 20, 20], dtype=tf.uint8)
    if resized_groundtruth_masks:
      groundtruth_instance_masks = tf.ones(shape=[1, 10, 10], dtype=tf.uint8)

    if batch_size > 1:
      groundtruth_boxes = tf.tile(tf.expand_dims(groundtruth_boxes, 0),
                                  multiples=[batch_size, 1, 1])
      groundtruth_classes = tf.tile(tf.expand_dims(groundtruth_classes, 0),
                                    multiples=[batch_size, 1])
      groundtruth_instance_masks = tf.tile(
          tf.expand_dims(groundtruth_instance_masks, 0),
          multiples=[batch_size, 1, 1, 1])

    detections = {
        detection_fields.detection_boxes: detection_boxes,
        detection_fields.detection_scores: detection_scores,
        detection_fields.detection_classes: detection_classes,
        detection_fields.detection_masks: detection_masks,
        detection_fields.num_detections: num_detections
    }
    groundtruth = {
        input_data_fields.groundtruth_boxes: groundtruth_boxes,
        input_data_fields.groundtruth_classes: groundtruth_classes,
        input_data_fields.groundtruth_instance_masks: groundtruth_instance_masks
    }
    if batch_size > 1:
      return eval_util.result_dict_for_batched_example(
          image, key, detections, groundtruth,
          scale_to_absolute=scale_to_absolute,
          max_gt_boxes=max_gt_boxes)
    else:
      return eval_util.result_dict_for_single_example(
          image, key, detections, groundtruth,
          scale_to_absolute=scale_to_absolute)
示例#3
0
        def graph_fn():
            detections = {
                detection_fields.detection_boxes:
                tf.constant(detection_boxes),
                detection_fields.detection_scores:
                tf.constant([[1.], [1.]]),
                detection_fields.detection_classes:
                tf.constant([[1], [2]]),
                detection_fields.num_detections:
                tf.constant([1, 1]),
                detection_fields.detection_keypoints:
                tf.tile(tf.reshape(tf.constant(detection_keypoints),
                                   shape=[1, 1, 3, 2]),
                        multiples=[2, 1, 1, 1])
            }

            gt_boxes = detection_boxes
            groundtruth = {
                input_data_fields.groundtruth_boxes:
                tf.constant(gt_boxes),
                input_data_fields.groundtruth_classes:
                tf.constant([[1.], [1.]]),
                input_data_fields.groundtruth_keypoints:
                tf.tile(tf.reshape(tf.constant(detection_keypoints),
                                   shape=[1, 1, 3, 2]),
                        multiples=[2, 1, 1, 1])
            }

            image = tf.zeros((2, 100, 100, 3), dtype=tf.float32)

            true_image_shapes = tf.constant([[100, 100, 3], [50, 100, 3]])
            original_image_spatial_shapes = tf.constant([[200, 200],
                                                         [150, 300]])

            result = eval_util.result_dict_for_batched_example(
                image,
                key,
                detections,
                groundtruth,
                scale_to_absolute=True,
                true_image_shapes=true_image_shapes,
                original_image_spatial_shapes=original_image_spatial_shapes,
                max_gt_boxes=tf.constant(1))
            return (result[input_data_fields.groundtruth_boxes],
                    result[input_data_fields.groundtruth_keypoints],
                    result[detection_fields.detection_boxes],
                    result[detection_fields.detection_keypoints])
示例#4
0
    def test_padded_image_result_dict(self):

        input_data_fields = fields.InputDataFields
        detection_fields = fields.DetectionResultFields
        key = tf.constant([str(i) for i in range(2)])

        detection_boxes = np.array(
            [[[0., 0., 1., 1.]], [[0.0, 0.0, 0.5, 0.5]]], dtype=np.float32)
        detections = {
            detection_fields.detection_boxes: tf.constant(detection_boxes),
            detection_fields.detection_scores: tf.constant([[1.], [1.]]),
            detection_fields.detection_classes: tf.constant([[1], [2]]),
            detection_fields.num_detections: tf.constant([1, 1])
        }

        gt_boxes = detection_boxes
        groundtruth = {
            input_data_fields.groundtruth_boxes: tf.constant(gt_boxes),
            input_data_fields.groundtruth_classes: tf.constant([[1.], [1.]]),
        }

        image = tf.zeros((2, 100, 100, 3), dtype=tf.float32)

        true_image_shapes = tf.constant([[100, 100, 3], [50, 100, 3]])
        original_image_spatial_shapes = tf.constant([[200, 200], [150, 300]])

        result = eval_util.result_dict_for_batched_example(
            image,
            key,
            detections,
            groundtruth,
            scale_to_absolute=True,
            true_image_shapes=true_image_shapes,
            original_image_spatial_shapes=original_image_spatial_shapes,
            max_gt_boxes=tf.constant(1))

        with self.test_session() as sess:
            result = sess.run(result)
            self.assertAllEqual(
                [[[0., 0., 200., 200.]], [[0.0, 0.0, 150., 150.]]],
                result[input_data_fields.groundtruth_boxes])

            # Predictions from the model are not scaled.
            self.assertAllEqual(
                [[[0., 0., 200., 200.]], [[0.0, 0.0, 75., 150.]]],
                result[detection_fields.detection_boxes])
示例#5
0
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        # Set policy for mixed-precision training with Keras-based models.
        if use_tpu and train_config.use_bfloat16:
            from tensorflow.python.keras.engine import base_layer_utils  # pylint: disable=g-import-not-at-top
            # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0.
            base_layer_utils.enable_v2_dtype_behavior()
            tf2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(labels,
                                   unpad_groundtruth_tensors=train_config.
                                   unpad_groundtruth_tensors)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # For evaling on train data, it is necessary to check whether groundtruth
            # must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = boxes_shape[
                1] is not None and not use_tpu
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            provide_groundtruth(detection_model, labels)

        preprocessed_images = features[fields.InputDataFields.image]

        side_inputs = detection_model.get_side_inputs(features)

        if use_tpu and train_config.use_bfloat16:
            with tf.tpu.bfloat16_scope():
                prediction_dict = detection_model.predict(
                    preprocessed_images,
                    features[fields.InputDataFields.true_image_shape],
                    **side_inputs)
                prediction_dict = ops.bfloat16_to_float32_nested(
                    prediction_dict)
        else:
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[fields.InputDataFields.true_image_shape],
                **side_inputs)

        def postprocess_wrapper(args):
            return detection_model.postprocess(args[0], args[1])

        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            if use_tpu and postprocess_on_cpu:
                detections = tf.tpu.outside_compilation(
                    postprocess_wrapper,
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))
            else:
                detections = postprocess_wrapper(
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))

        if mode == tf.estimator.ModeKeys.TRAIN:
            load_pretrained = hparams.load_pretrained if hparams else False
            if train_config.fine_tune_checkpoint and load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            if (mode == tf.estimator.ModeKeys.EVAL
                    and eval_config.use_dummy_loss_in_eval):
                total_loss = tf.constant(1.0)
                losses_dict = {'Loss/total_loss': total_loss}
            else:
                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = [loss_tensor for loss_tensor in losses_dict.values()]
                if train_config.add_regularization_loss:
                    regularization_losses = detection_model.regularization_losses(
                    )
                    if use_tpu and train_config.use_bfloat16:
                        regularization_losses = ops.bfloat16_to_float32_nested(
                            regularization_losses)
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                        losses.append(regularization_loss)
                        losses_dict[
                            'Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                training_optimizer = tf.tpu.CrossShardOptimizer(
                    training_optimizer)

            # Optionally freeze some layers by setting their gradients to be zero.
            trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = slim.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
            summaries = [] if use_tpu else None
            if train_config.summarize_gradients:
                summaries = [
                    'gradients', 'gradient_norm', 'global_gradient_norm'
                ]
            train_op = slim.optimizers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                update_ops=detection_model.updates(),
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            exported_output = exporter_lib.add_output_tensor_nodes(detections)
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(exported_output)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic,
                eval_input_config.max_number_of_boxes)
            use_original_images = fields.InputDataFields.original_image in features
            if use_original_images:
                eval_images = features[fields.InputDataFields.original_image]
                true_image_shapes = tf.slice(
                    features[fields.InputDataFields.true_image_shape], [0, 0],
                    [-1, 3])
                original_image_spatial_shapes = features[
                    fields.InputDataFields.original_image_spatial_shape]
            else:
                eval_images = features[fields.InputDataFields.image]
                true_image_shapes = None
                original_image_spatial_shapes = None

            eval_dict = eval_util.result_dict_for_batched_example(
                eval_images,
                features[inputs.HASH_KEY],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True,
                original_image_spatial_shapes=original_image_spatial_shapes,
                true_image_shapes=true_image_shapes)

            if fields.InputDataFields.image_additional_channels in features:
                eval_dict[fields.InputDataFields.
                          image_additional_channels] = features[
                              fields.InputDataFields.image_additional_channels]

            if class_agnostic:
                category_index = label_map_util.create_class_agnostic_category_index(
                )
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            vis_metric_ops = None
            if not use_tpu and use_original_images:
                keypoint_edges = [(kp.start, kp.end)
                                  for kp in eval_config.keypoint_edge]

                eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                    category_index,
                    max_examples_to_draw=eval_config.num_visualizations,
                    max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                    min_score_thresh=eval_config.min_score_threshold,
                    use_normalized_coordinates=False,
                    keypoint_edges=keypoint_edges or None)
                vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                    eval_dict)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, list(category_index.values()), eval_dict)
            for loss_key, loss_tensor in iter(losses_dict.items()):
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if vis_metric_ops is not None:
                eval_metric_ops.update(vis_metric_ops)
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
示例#6
0
def prepare_eval_dict(detections, groundtruth, features):
  """Prepares eval dictionary containing detections and groundtruth.

  Takes in `detections` from the model, `groundtruth` and `features` returned
  from the eval tf.data.dataset and creates a dictionary of tensors suitable
  for detection eval modules.

  Args:
    detections: A dictionary of tensors returned by `model.postprocess`.
    groundtruth: `inputs.eval_input` returns an eval dataset of (features,
      labels) tuple. `groundtruth` must be set to `labels`.
      Please note that:
        * fields.InputDataFields.groundtruth_classes must be 0-indexed and
          in its 1-hot representation.
        * fields.InputDataFields.groundtruth_verified_neg_classes must be
          0-indexed and in its multi-hot repesentation.
        * fields.InputDataFields.groundtruth_not_exhaustive_classes must be
          0-indexed and in its multi-hot repesentation.
        * fields.InputDataFields.groundtruth_labeled_classes must be
          0-indexed and in its multi-hot repesentation.
    features: `inputs.eval_input` returns an eval dataset of (features, labels)
      tuple. This argument must be set to a dictionary containing the following
      keys and their corresponding values from `features` --
        * fields.InputDataFields.image
        * fields.InputDataFields.original_image
        * fields.InputDataFields.original_image_spatial_shape
        * fields.InputDataFields.true_image_shape
        * inputs.HASH_KEY

  Returns:
    eval_dict: A dictionary of tensors to pass to eval module.
    class_agnostic: Whether to evaluate detection in class agnostic mode.
  """

  groundtruth_boxes = groundtruth[fields.InputDataFields.groundtruth_boxes]
  groundtruth_boxes_shape = tf.shape(groundtruth_boxes)
  # For class-agnostic models, groundtruth one-hot encodings collapse to all
  # ones.
  class_agnostic = (
      fields.DetectionResultFields.detection_classes not in detections)
  if class_agnostic:
    groundtruth_classes_one_hot = tf.ones(
        [groundtruth_boxes_shape[0], groundtruth_boxes_shape[1], 1])
  else:
    groundtruth_classes_one_hot = groundtruth[
        fields.InputDataFields.groundtruth_classes]
  label_id_offset = 1  # Applying label id offset (b/63711816)
  groundtruth_classes = (
      tf.argmax(groundtruth_classes_one_hot, axis=2) + label_id_offset)
  groundtruth[fields.InputDataFields.groundtruth_classes] = groundtruth_classes

  label_id_offset_paddings = tf.constant([[0, 0], [1, 0]])
  if fields.InputDataFields.groundtruth_verified_neg_classes in groundtruth:
    groundtruth[
        fields.InputDataFields.groundtruth_verified_neg_classes] = tf.pad(
            groundtruth[
                fields.InputDataFields.groundtruth_verified_neg_classes],
            label_id_offset_paddings)
  if fields.InputDataFields.groundtruth_not_exhaustive_classes in groundtruth:
    groundtruth[
        fields.InputDataFields.groundtruth_not_exhaustive_classes] = tf.pad(
            groundtruth[
                fields.InputDataFields.groundtruth_not_exhaustive_classes],
            label_id_offset_paddings)
  if fields.InputDataFields.groundtruth_labeled_classes in groundtruth:
    groundtruth[fields.InputDataFields.groundtruth_labeled_classes] = tf.pad(
        groundtruth[fields.InputDataFields.groundtruth_labeled_classes],
        label_id_offset_paddings)

  use_original_images = fields.InputDataFields.original_image in features
  if use_original_images:
    eval_images = features[fields.InputDataFields.original_image]
    true_image_shapes = features[fields.InputDataFields.true_image_shape][:, :3]
    original_image_spatial_shapes = features[
        fields.InputDataFields.original_image_spatial_shape]
  else:
    eval_images = features[fields.InputDataFields.image]
    true_image_shapes = None
    original_image_spatial_shapes = None

  eval_dict = eval_util.result_dict_for_batched_example(
      eval_images,
      features[inputs.HASH_KEY],
      detections,
      groundtruth,
      class_agnostic=class_agnostic,
      scale_to_absolute=True,
      original_image_spatial_shapes=original_image_spatial_shapes,
      true_image_shapes=true_image_shapes)

  return eval_dict, class_agnostic
    def _make_evaluation_dict(self,
                              resized_groundtruth_masks=False,
                              batch_size=1,
                              max_gt_boxes=None,
                              scale_to_absolute=False):
        input_data_fields = standard_fields.InputDataFields
        detection_fields = standard_fields.DetectionResultFields

        image = tf.zeros(shape=[batch_size, 20, 20, 3], dtype=tf.uint8)
        if batch_size == 1:
            key = tf.constant('image1')
        else:
            key = tf.constant([str(i) for i in range(batch_size)])
        detection_boxes = tf.concat([
            tf.tile(tf.constant([[[0., 0., 1., 1.]]]),
                    multiples=[batch_size - 1, 1, 1]),
            tf.constant([[[0., 0., 0.5, 0.5]]])
        ],
                                    axis=0)
        detection_scores = tf.concat([
            tf.tile(tf.constant([[0.5]]), multiples=[batch_size - 1, 1]),
            tf.constant([[0.8]])
        ],
                                     axis=0)
        detection_classes = tf.tile(tf.constant([[0]]),
                                    multiples=[batch_size, 1])
        detection_masks = tf.tile(tf.ones(shape=[1, 1, 20, 20],
                                          dtype=tf.float32),
                                  multiples=[batch_size, 1, 1, 1])
        groundtruth_boxes = tf.constant([[0., 0., 1., 1.]])
        groundtruth_classes = tf.constant([1])
        groundtruth_instance_masks = tf.ones(shape=[1, 20, 20], dtype=tf.uint8)
        num_detections = tf.ones([batch_size])
        if resized_groundtruth_masks:
            groundtruth_instance_masks = tf.ones(shape=[1, 10, 10],
                                                 dtype=tf.uint8)

        if batch_size > 1:
            groundtruth_boxes = tf.tile(tf.expand_dims(groundtruth_boxes, 0),
                                        multiples=[batch_size, 1, 1])
            groundtruth_classes = tf.tile(tf.expand_dims(
                groundtruth_classes, 0),
                                          multiples=[batch_size, 1])
            groundtruth_instance_masks = tf.tile(
                tf.expand_dims(groundtruth_instance_masks, 0),
                multiples=[batch_size, 1, 1, 1])

        detections = {
            detection_fields.detection_boxes: detection_boxes,
            detection_fields.detection_scores: detection_scores,
            detection_fields.detection_classes: detection_classes,
            detection_fields.detection_masks: detection_masks,
            detection_fields.num_detections: num_detections
        }

        groundtruth = {
            input_data_fields.groundtruth_boxes:
            groundtruth_boxes,
            input_data_fields.groundtruth_classes:
            groundtruth_classes,
            input_data_fields.groundtruth_instance_masks:
            groundtruth_instance_masks,
        }
        if batch_size > 1:
            return eval_util.result_dict_for_batched_example(
                image,
                key,
                detections,
                groundtruth,
                scale_to_absolute=scale_to_absolute,
                max_gt_boxes=max_gt_boxes)
        else:
            return eval_util.result_dict_for_single_example(
                image,
                key,
                detections,
                groundtruth,
                scale_to_absolute=scale_to_absolute)
示例#8
0
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

        Args:
            features: Dictionary of feature tensors, returned from `input_fn`.
            labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
            otherwise None.
            mode: Mode key from tf.estimator.ModeKeys.
            params: Parameter dictionary passed from the estimator.

        Returns:
            An `EstimatorSpec` that encapsulates the model and its serving
            configurations.
        """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))

        scaffold_fn = None
        scaffold = None
        eval_metric_ops = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            # get the optimizer and global step:
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

            #get the trainable variables
            #trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = tf.contrib.framework.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            #get the clip_gradients_value
            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            total_loss = 0.
            tower_grads = []
            with tf.variable_scope(tf.get_variable_scope()):
                feature_list, label_list = split_features_and_labels(
                    features, labels, train_config.GPU_num)
                for i in xrange(train_config.GPU_num):
                    with tf.device('/gpu:%d' % i):
                        with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                            loss = tower_loss(scope=scope,
                                              features=feature_list[i],
                                              labels=label_list[i],
                                              detection_model=detection_model,
                                              train_config=train_config)
                            tf.get_variable_scope().reuse_variables()
                            grads = training_optimizer.compute_gradients(
                                loss=loss)
                            if isinstance(clip_gradients_value, float):
                                grads = clip_gradients_by_norm(
                                    grads, clip_gradients_value)
                            tower_grads.append(grads)
                            total_loss += loss
            total_loss /= train_config.GPU_num
            grad_avg = average_gradients(tower_grads)

            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                apply_gradient_op = training_optimizer.apply_gradients(
                    grads_and_vars=grad_avg, global_step=global_step)

            train_op = apply_gradient_op

            if train_config.fine_tune_checkpoint:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        elif mode == tf.estimator.ModeKeys.EVAL:
            detection_model = detection_model_fn(is_training=is_training,
                                                 add_summaries=(not use_tpu))
            # For evaling on train data, it is necessary to check whether groundtruth
            # must be unpadded.
            #in mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.PREDICT, I explictly set the evaluation and prediction to run on CPU
            with tf.device('/cpu:1'):
                # training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer )
                boxes_shape = (labels[fields.InputDataFields.
                                      groundtruth_boxes].get_shape().as_list())
                unpad_groundtruth_tensors = boxes_shape[
                    1] is not None and not use_tpu
                labels = unstack_batch(
                    labels,
                    unpad_groundtruth_tensors=unpad_groundtruth_tensors)

                gt_boxes_list = labels[
                    fields.InputDataFields.groundtruth_boxes]
                gt_classes_list = labels[
                    fields.InputDataFields.groundtruth_classes]
                gt_masks_list = None
                if fields.InputDataFields.groundtruth_instance_masks in labels:
                    gt_masks_list = labels[
                        fields.InputDataFields.groundtruth_instance_masks]
                gt_keypoints_list = None
                if fields.InputDataFields.groundtruth_keypoints in labels:
                    gt_keypoints_list = labels[
                        fields.InputDataFields.groundtruth_keypoints]
                gt_weights_list = None
                if fields.InputDataFields.groundtruth_weights in labels:
                    gt_weights_list = labels[
                        fields.InputDataFields.groundtruth_weights]
                gt_confidences_list = None
                if fields.InputDataFields.groundtruth_confidences in labels:
                    gt_confidences_list = labels[
                        fields.InputDataFields.groundtruth_confidences]
                gt_is_crowd_list = None
                if fields.InputDataFields.groundtruth_is_crowd in labels:
                    gt_is_crowd_list = labels[
                        fields.InputDataFields.groundtruth_is_crowd]
                detection_model.provide_groundtruth(
                    groundtruth_boxes_list=gt_boxes_list,
                    groundtruth_classes_list=gt_classes_list,
                    groundtruth_confidences_list=gt_confidences_list,
                    groundtruth_masks_list=gt_masks_list,
                    groundtruth_keypoints_list=gt_keypoints_list,
                    groundtruth_weights_list=gt_weights_list,
                    groundtruth_is_crowd_list=gt_is_crowd_list)

                training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                    train_config.optimizer)

                preprocessed_images = features[fields.InputDataFields.image]
                if use_tpu and train_config.use_bfloat16:
                    with tf.contrib.tpu.bfloat16_scope():
                        prediction_dict = detection_model.predict(
                            preprocessed_images,
                            features[fields.InputDataFields.true_image_shape])
                    for k, v in prediction_dict.items():
                        if v.dtype == tf.bfloat16:
                            prediction_dict[k] = tf.cast(v, tf.float32)
                else:
                    prediction_dict = detection_model.predict(
                        preprocessed_images,
                        features[fields.InputDataFields.true_image_shape])

                detections = detection_model.postprocess(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])

                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = [loss_tensor for loss_tensor in losses_dict.values()]
                if train_config.add_regularization_loss:
                    regularization_losses = detection_model.regularization_losses(
                    )
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                losses.append(regularization_loss)
                losses_dict['Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

                if 'graph_rewriter_config' in configs:
                    graph_rewriter_fn = graph_rewriter_builder.build(
                        configs['graph_rewriter_config'],
                        is_training=is_training)
                    graph_rewriter_fn()

                class_agnostic = (
                    fields.DetectionResultFields.detection_classes
                    not in detections)
                groundtruth = _prepare_groundtruth_for_eval(
                    detection_model, class_agnostic,
                    eval_input_config.max_number_of_boxes)
                use_original_images = fields.InputDataFields.original_image in features
                if use_original_images:
                    eval_images = features[
                        fields.InputDataFields.original_image]
                    true_image_shapes = tf.slice(
                        features[fields.InputDataFields.true_image_shape],
                        [0, 0], [-1, 3])
                    original_image_spatial_shapes = features[
                        fields.InputDataFields.original_image_spatial_shape]
                else:
                    eval_images = features[fields.InputDataFields.image]
                    true_image_shapes = None
                    original_image_spatial_shapes = None

                eval_dict = eval_util.result_dict_for_batched_example(
                    eval_images,
                    features[inputs.HASH_KEY],
                    detections,
                    groundtruth,
                    class_agnostic=class_agnostic,
                    scale_to_absolute=True,
                    original_image_spatial_shapes=original_image_spatial_shapes,
                    true_image_shapes=true_image_shapes)

                if class_agnostic:
                    category_index = label_map_util.create_class_agnostic_category_index(
                    )
                else:
                    category_index = label_map_util.create_category_index_from_labelmap(
                        eval_input_config.label_map_path)
                vis_metric_ops = None
                if not use_tpu and use_original_images:
                    eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                        category_index,
                        max_examples_to_draw=eval_config.num_visualizations,
                        max_boxes_to_draw=eval_config.
                        max_num_boxes_to_visualize,
                        min_score_thresh=eval_config.min_score_threshold,
                        use_normalized_coordinates=False)
                    vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                        eval_dict)

                # Eval metrics on a single example.
                eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                    eval_config, category_index.values(), eval_dict)
                for loss_key, loss_tensor in iter(losses_dict.items()):
                    eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
                for var in optimizer_summary_vars:
                    eval_metric_ops[var.op.name] = (var, tf.no_op())
                if vis_metric_ops is not None:
                    eval_metric_ops.update(vis_metric_ops)
                eval_metric_ops = {
                    str(k): v
                    for k, v in eval_metric_ops.items()
                }

                if eval_config.use_moving_averages:
                    variable_averages = tf.train.ExponentialMovingAverage(0.0)
                    variables_to_restore = variable_averages.variables_to_restore(
                    )
                    keep_checkpoint_every_n_hours = (
                        train_config.keep_checkpoint_every_n_hours)
                    saver = tf.train.Saver(variables_to_restore,
                                           keep_checkpoint_every_n_hours=
                                           keep_checkpoint_every_n_hours)
                    scaffold = tf.train.Scaffold(saver=saver)

        elif mode == tf.estimator.ModeKeys.PREDICT:
            detection_model = detection_model_fn(is_training=is_training,
                                                 add_summaries=(not use_tpu))
            #similar to EVAL mode, I run PREDICT on CPU too.
            with tf.device(':/cpu:1'):
                preprocessed_images = features[fields.InputDataFields.image]

                if use_tpu and train_config.use_bfloat16:
                    with tf.contrib.tpu.bfloat16_scope():
                        prediction_dict = detection_model.predict(
                            preprocessed_images,
                            features[fields.InputDataFields.true_image_shape])
                        for k, v in prediction_dict.items():
                            if v.dtype == tf.bfloat16:
                                prediction_dict[k] = tf.cast(v, tf.float32)
                else:
                    prediction_dict = detection_model.predict(
                        preprocessed_images,
                        features[fields.InputDataFields.true_image_shape])

                detections = detection_model.postprocess(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])

                exported_output = exporter_lib.add_output_tensor_nodes(
                    detections)
                export_outputs = {
                    tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                    tf.estimator.export.PredictOutput(exported_output)
                }

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            #scafold here only contains Saver
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)

            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
示例#9
0
  def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Make sure to set the Keras learning phase. True during training,
    # False for inference.
    tf.keras.backend.set_learning_phase(is_training)
    detection_model = detection_model_fn(
        is_training=is_training, add_summaries=(not use_tpu))
    scaffold_fn = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      labels = unstack_batch(
          labels,
          unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
      # For evaling on train data, it is necessary to check whether groundtruth
      # must be unpadded.
      boxes_shape = (
          labels[fields.InputDataFields.groundtruth_boxes].get_shape()
          .as_list())
      unpad_groundtruth_tensors = boxes_shape[1] is not None and not use_tpu
      labels = unstack_batch(
          labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
      gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
      gt_masks_list = None
      if fields.InputDataFields.groundtruth_instance_masks in labels:
        gt_masks_list = labels[
            fields.InputDataFields.groundtruth_instance_masks]
      gt_keypoints_list = None
      if fields.InputDataFields.groundtruth_keypoints in labels:
        gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
      gt_weights_list = None
      if fields.InputDataFields.groundtruth_weights in labels:
        gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
      gt_confidences_list = None
      if fields.InputDataFields.groundtruth_confidences in labels:
        gt_confidences_list = labels[
            fields.InputDataFields.groundtruth_confidences]
      gt_is_crowd_list = None
      if fields.InputDataFields.groundtruth_is_crowd in labels:
        gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
      detection_model.provide_groundtruth(
          groundtruth_boxes_list=gt_boxes_list,
          groundtruth_classes_list=gt_classes_list,
          groundtruth_confidences_list=gt_confidences_list,
          groundtruth_masks_list=gt_masks_list,
          groundtruth_keypoints_list=gt_keypoints_list,
          groundtruth_weights_list=gt_weights_list,
          groundtruth_is_crowd_list=gt_is_crowd_list)

    preprocessed_images = features[fields.InputDataFields.image]
    if use_tpu and train_config.use_bfloat16:
      with tf.contrib.tpu.bfloat16_scope():
        prediction_dict = detection_model.predict(
            preprocessed_images,
            features[fields.InputDataFields.true_image_shape])
        for k, v in prediction_dict.items():
          if v.dtype == tf.bfloat16:
            prediction_dict[k] = tf.cast(v, tf.float32)
    else:
      prediction_dict = detection_model.predict(
          preprocessed_images,
          features[fields.InputDataFields.true_image_shape])

    def postprocess_wrapper(args):
      return detection_model.postprocess(args[0], args[1])

    if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
      if use_tpu and postprocess_on_cpu:
        detections = tf.contrib.tpu.outside_compilation(
            postprocess_wrapper,
            (prediction_dict,
             features[fields.InputDataFields.true_image_shape]))
      else:
        detections = postprocess_wrapper((
            prediction_dict,
            features[fields.InputDataFields.true_image_shape]))

    if mode == tf.estimator.ModeKeys.TRAIN:
      if train_config.fine_tune_checkpoint and hparams.load_pretrained:
        if not train_config.fine_tune_checkpoint_type:
          # train_config.from_detection_checkpoint field is deprecated. For
          # backward compatibility, set train_config.fine_tune_checkpoint_type
          # based on train_config.from_detection_checkpoint.
          if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
          else:
            train_config.fine_tune_checkpoint_type = 'classification'
        asg_map = detection_model.restore_map(
            fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map,
                train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:

          def tpu_scaffold():
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)
            return tf.train.Scaffold()

          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                        available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      losses_dict = detection_model.loss(
          prediction_dict, features[fields.InputDataFields.true_image_shape])
      losses = [loss_tensor for loss_tensor in losses_dict.values()]
      if train_config.add_regularization_loss:
        regularization_losses = detection_model.regularization_losses()
        if regularization_losses:
          regularization_loss = tf.add_n(
              regularization_losses, name='regularization_loss')
          losses.append(regularization_loss)
          losses_dict['Loss/regularization_loss'] = regularization_loss
      total_loss = tf.add_n(losses, name='total_loss')
      losses_dict['Loss/total_loss'] = total_loss

      if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=is_training)
        graph_rewriter_fn()

      # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
      # can write learning rate summaries on TPU without host calls.
      global_step = tf.train.get_or_create_global_step()
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)

    if mode == tf.estimator.ModeKeys.TRAIN:
      if use_tpu:
        training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
            training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      trainable_variables = None
      include_variables = (
          train_config.update_trainable_variables
          if train_config.update_trainable_variables else None)
      exclude_variables = (
          train_config.freeze_variables
          if train_config.freeze_variables else None)
      trainable_variables = tf.contrib.framework.filter_variables(
          tf.trainable_variables(),
          include_patterns=include_variables,
          exclude_patterns=exclude_variables)

      clip_gradients_value = None
      if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

      if not use_tpu:
        for var in optimizer_summary_vars:
          tf.summary.scalar(var.op.name, var)
      summaries = [] if use_tpu else None
      if train_config.summarize_gradients:
        summaries = ['gradients', 'gradient_norm', 'global_gradient_norm']
      train_op = tf.contrib.layers.optimize_loss(
          loss=total_loss,
          global_step=global_step,
          learning_rate=None,
          clip_gradients=clip_gradients_value,
          optimizer=training_optimizer,
          update_ops=detection_model.updates(),
          variables=trainable_variables,
          summaries=summaries,
          name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
      exported_output = exporter_lib.add_output_tensor_nodes(detections)
      export_outputs = {
          tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
              tf.estimator.export.PredictOutput(exported_output)
      }

    eval_metric_ops = None
    scaffold = None
    if mode == tf.estimator.ModeKeys.EVAL:
      class_agnostic = (
          fields.DetectionResultFields.detection_classes not in detections)
      groundtruth = _prepare_groundtruth_for_eval(
          detection_model, class_agnostic,
          eval_input_config.max_number_of_boxes)
      use_original_images = fields.InputDataFields.original_image in features
      if use_original_images:
        eval_images = features[fields.InputDataFields.original_image]
        true_image_shapes = tf.slice(
            features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3])
        original_image_spatial_shapes = features[fields.InputDataFields
                                                 .original_image_spatial_shape]
      else:
        eval_images = features[fields.InputDataFields.image]
        true_image_shapes = None
        original_image_spatial_shapes = None

      eval_dict = eval_util.result_dict_for_batched_example(
          eval_images,
          features[inputs.HASH_KEY],
          detections,
          groundtruth,
          class_agnostic=class_agnostic,
          scale_to_absolute=True,
          original_image_spatial_shapes=original_image_spatial_shapes,
          true_image_shapes=true_image_shapes)

      if class_agnostic:
        category_index = label_map_util.create_class_agnostic_category_index()
      else:
        category_index = label_map_util.create_category_index_from_labelmap(
            eval_input_config.label_map_path)
      vis_metric_ops = None
      if not use_tpu and use_original_images:
        eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
            category_index,
            max_examples_to_draw=eval_config.num_visualizations,
            max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
            min_score_thresh=eval_config.min_score_threshold,
            use_normalized_coordinates=False)
        vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
            eval_dict)

      # Eval metrics on a single example.
      eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
          eval_config, list(category_index.values()), eval_dict)
      for loss_key, loss_tensor in iter(losses_dict.items()):
        eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
      for var in optimizer_summary_vars:
        eval_metric_ops[var.op.name] = (var, tf.no_op())
      if vis_metric_ops is not None:
        eval_metric_ops.update(vis_metric_ops)
      eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

      if eval_config.use_moving_averages:
        variable_averages = tf.train.ExponentialMovingAverage(0.0)
        variables_to_restore = variable_averages.variables_to_restore()
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            variables_to_restore,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)
        scaffold = tf.train.Scaffold(saver=saver)

    # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
    if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metrics=eval_metric_ops,
          export_outputs=export_outputs)
    else:
      if scaffold is None:
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            sharded=True,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            save_relative_paths=True)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
        scaffold = tf.train.Scaffold(saver=saver)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metric_ops=eval_metric_ops,
          export_outputs=export_outputs,
          scaffold=scaffold)