示例#1
0
 def _save_checkpoint_from_mock_model(self,
                                      checkpoint_path,
                                      use_moving_averages,
                                      enable_quantization=False):
   g = tf.Graph()
   with g.as_default():
     mock_model = FakeModel()
     preprocessed_inputs, true_image_shapes = mock_model.preprocess(
         tf.placeholder(tf.float32, shape=[None, None, None, 3]))
     predictions = mock_model.predict(preprocessed_inputs, true_image_shapes)
     mock_model.postprocess(predictions, true_image_shapes)
     if use_moving_averages:
       tf.train.ExponentialMovingAverage(0.0).apply()
     tf.train.get_or_create_global_step()
     if enable_quantization:
       graph_rewriter_config = graph_rewriter_pb2.GraphRewriter()
       graph_rewriter_config.quantization.delay = 500000
       graph_rewriter_fn = graph_rewriter_builder.build(
           graph_rewriter_config, is_training=False)
       graph_rewriter_fn()
     saver = tf.train.Saver()
     init = tf.global_variables_initializer()
     with self.test_session() as sess:
       sess.run(init)
       saver.save(sess, checkpoint_path)
示例#2
0
  def test_rewrite_nn_resize_op_quantized(self):
    g = tf.Graph()
    with g.as_default():
      x = array_ops.placeholder(dtypes.float32, shape=(8, 10, 10, 8))
      x_conv = tf.contrib.slim.conv2d(x, 8, 1)
      y = array_ops.placeholder(dtypes.float32, shape=(8, 20, 20, 8))
      s = ops.nearest_neighbor_upsampling(x_conv, 2)
      t = s + y

      graph_rewriter_config = graph_rewriter_pb2.GraphRewriter()
      graph_rewriter_config.quantization.delay = 500000
      graph_rewriter_fn = graph_rewriter_builder.build(
          graph_rewriter_config, is_training=False)
      graph_rewriter_fn()

      exporter.rewrite_nn_resize_op(is_quantized=True)

    resize_op_found = False
    for op in g.get_operations():
      if op.type == 'ResizeNearestNeighbor':
        resize_op_found = True
        self.assertEqual(op.inputs[0].op.type, 'FakeQuantWithMinMaxVars')
        self.assertEqual(op.outputs[0].consumers()[0], t.op)
        break

    self.assertTrue(resize_op_found)
示例#3
0
文件: eval.py 项目: ALISCIFP/models
def main(unused_argv):
  assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        eval_config_path=FLAGS.eval_config_path,
        eval_input_config_path=FLAGS.input_config_path)
    for name, config in [('model.config', FLAGS.model_config_path),
                         ('eval.config', FLAGS.eval_config_path),
                         ('input.config', FLAGS.input_config_path)]:
      tf.gfile.Copy(config,
                    os.path.join(FLAGS.eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if FLAGS.eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=False)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=False)

  evaluator.evaluate(
      create_input_dict_fn,
      model_fn,
      eval_config,
      categories,
      FLAGS.checkpoint_dir,
      FLAGS.eval_dir,
      graph_hook_fn=graph_rewriter_fn)
示例#4
0
def main(unused_argv):
    assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
    assert FLAGS.eval_dir, '`eval_dir` is missing.'
    tf.gfile.MakeDirs(FLAGS.eval_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        tf.gfile.Copy(FLAGS.pipeline_config_path,
                      os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                      overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            eval_config_path=FLAGS.eval_config_path,
            eval_input_config_path=FLAGS.input_config_path)
        for name, config in [('model.config', FLAGS.model_config_path),
                             ('eval.config', FLAGS.eval_config_path),
                             ('input.config', FLAGS.input_config_path)]:
            tf.gfile.Copy(config,
                          os.path.join(FLAGS.eval_dir, name),
                          overwrite=True)

    model_config = configs['model']
    eval_config = configs['eval_config']
    input_config = configs['eval_input_config']
    if FLAGS.eval_training_data:
        input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=False)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    label_map = label_map_util.load_labelmap(input_config.label_map_path)
    max_num_classes = max([item.id for item in label_map.item])
    categories = label_map_util.convert_label_map_to_categories(
        label_map, max_num_classes)

    if FLAGS.run_once:
        eval_config.max_evals = 1

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=False)

    evaluator.evaluate(create_input_dict_fn,
                       model_fn,
                       eval_config,
                       categories,
                       FLAGS.checkpoint_dir,
                       FLAGS.eval_dir,
                       graph_hook_fn=graph_rewriter_fn)
示例#5
0
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           write_inference_graph=False,
                           use_side_inputs=False,
                           side_input_shapes=None,
                           side_input_names=None,
                           side_input_types=None):
    """Exports inference graph for the model specified in the pipeline config.

  Args:
    input_type: Type of input for the graph. Can be one of ['image_tensor',
      'encoded_image_string_tensor', 'tf_example'].
    pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
    trained_checkpoint_prefix: Path to the trained checkpoint file.
    output_directory: Path to write outputs.
    input_shape: Sets a fixed shape for an `image_tensor` input. If not
      specified, will default to [None, None, None, 3].
    output_collection_name: Name of collection to add output tensors to.
      If None, does not add output tensors to a collection.
    additional_output_tensor_names: list of additional output
      tensors to include in the frozen graph.
    write_inference_graph: If true, writes inference graph to disk.
    use_side_inputs: If True, the model requires side_inputs.
    side_input_shapes: List of shapes of the side input tensors,
      required if use_side_inputs is True.
    side_input_names: List of names of the side input tensors,
      required if use_side_inputs is True.
    side_input_types: List of types of the side input tensors,
      required if use_side_inputs is True.
  """
    detection_model = model_builder.build(pipeline_config.model,
                                          is_training=False)
    graph_rewriter_fn = None
    if pipeline_config.HasField('graph_rewriter'):
        graph_rewriter_config = pipeline_config.graph_rewriter
        graph_rewriter_fn = graph_rewriter_builder.build(graph_rewriter_config,
                                                         is_training=False)
    _export_inference_graph(input_type,
                            detection_model,
                            pipeline_config.eval_config.use_moving_averages,
                            trained_checkpoint_prefix,
                            output_directory,
                            additional_output_tensor_names,
                            input_shape,
                            output_collection_name,
                            graph_hook_fn=graph_rewriter_fn,
                            write_inference_graph=write_inference_graph,
                            use_side_inputs=use_side_inputs,
                            side_input_shapes=side_input_shapes,
                            side_input_names=side_input_names,
                            side_input_types=side_input_types)
    pipeline_config.eval_config.use_moving_averages = False
    config_util.save_pipeline_config(pipeline_config, output_directory)
示例#6
0
def evaluate(eval_dir, config_dir, checkpoint_dir, eval_training_data=False):
    '''
        Function used to evaluate your trained model. 

        Args: 
            Required:               
                eval_dir: The directory where the tfevent file will be saved.
                config_dir: The protobuf configuration directory.
                checkpoint_dir: The directory where the checkpoint you want to evaluate is.
            
            Optional:
                eval_training_data: Is set to True the evaluation will be run on the training dataset.

        Returns:
            A dictionnary of metrics ready to be sent to the picsell.ia platform.
    '''

    tf.reset_default_graph()
    tf.gfile.MakeDirs(eval_dir)
    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(config_dir, "pipeline.config"))
    model_config = configs['model']
    eval_config = configs['eval_config']
    input_config = configs['eval_input_config']
    if eval_training_data:
        input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=False)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    categories = label_map_util.create_categories_from_labelmap(
        input_config.label_map_path)

    eval_config.max_evals = 1

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=False)

    metrics = evaluator.evaluate(create_input_dict_fn,
                                 model_fn,
                                 eval_config,
                                 categories,
                                 checkpoint_dir,
                                 eval_dir,
                                 graph_hook_fn=graph_rewriter_fn)
    return {k: str(round(v, 3)) for k, v in metrics.items()}
示例#7
0
def run_eval(checkpoint_dir, eval_dir, pipeline_config_path, num_examples):
    run_once = False
    assert checkpoint_dir, '`checkpoint_dir` is missing.'
    assert eval_dir, '`eval_dir` is missing.'
    tf.gfile.MakeDirs(eval_dir)
    if pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            pipeline_config_path)
        tf.gfile.Copy(pipeline_config_path,
                      os.path.join(eval_dir, 'pipeline.config'),
                      overwrite=True)
    else:
        return

    model_config = configs['model']
    eval_config = configs['eval_config']
    input_config = configs['eval_input_config']
    # if eval_training_data:
    #   input_config = configs['train_input_config']

    eval_config.num_examples = num_examples
    eval_config.max_evals = 1

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=False)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    label_map = label_map_util.load_labelmap(input_config.label_map_path)
    max_num_classes = max([item.id for item in label_map.item])
    categories = label_map_util.convert_label_map_to_categories(
        label_map, max_num_classes)

    if run_once:
        eval_config.max_evals = 1

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=False)

    result = evaluator.evaluate(create_input_dict_fn,
                                model_fn,
                                eval_config,
                                categories,
                                checkpoint_dir,
                                eval_dir,
                                graph_hook_fn=graph_rewriter_fn)

    return result
示例#8
0
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    tf.gfile.MakeDirs(FLAGS.train_dir)
    assert FLAGS.pipeline_config_path, '`pipeline_config_path` is missing.'
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.train_dir, 'pipeline.config'),
                  overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    num_clones = 1
    clone_on_cpu = False

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  num_clones,
                  worker_replicas,
                  clone_on_cpu,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)
 def testQuantizationBuilderSetsUpCorrectEvalArguments(self):
   with mock.patch.object(tf.contrib.quantize,
                          'experimental_create_eval_graph') as mock_quant_fn:
     with mock.patch.object(tf.contrib.layers,
                            'summarize_collection') as mock_summarize_col:
       graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
       graph_rewriter_proto.quantization.delay = 10
       graph_rewrite_fn = graph_rewriter_builder.build(
           graph_rewriter_proto, is_training=False)
       graph_rewrite_fn()
       _, kwargs = mock_quant_fn.call_args
       self.assertEqual(kwargs['input_graph'], tf.get_default_graph())
       mock_summarize_col.assert_called_with('quant_vars')
 def testQuantizationBuilderSetsUpCorrectEvalArguments(self):
   with mock.patch.object(tf.contrib.quantize,
                          'create_eval_graph') as mock_quant_fn:
     with mock.patch.object(tf.contrib.layers,
                            'summarize_collection') as mock_summarize_col:
       graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
       graph_rewriter_proto.quantization.delay = 10
       graph_rewrite_fn = graph_rewriter_builder.build(
           graph_rewriter_proto, is_training=False)
       graph_rewrite_fn()
       _, kwargs = mock_quant_fn.call_args
       self.assertEqual(kwargs['input_graph'], tf.get_default_graph())
       mock_summarize_col.assert_called_with('quant_vars')
 def testQuantizationBuilderSetsUpCorrectTrainArguments(self):
   with mock.patch.object(
       tf.contrib.quantize, 'create_training_graph') as mock_quant_fn:
     with mock.patch.object(tf.contrib.layers,
                            'summarize_collection') as mock_summarize_col:
       graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
       graph_rewriter_proto.quantization.delay = 10
       graph_rewriter_proto.quantization.weight_bits = 8
       graph_rewriter_proto.quantization.activation_bits = 8
       graph_rewrite_fn = graph_rewriter_builder.build(
           graph_rewriter_proto, is_training=True)
       graph_rewrite_fn()
       _, kwargs = mock_quant_fn.call_args
       self.assertEqual(kwargs['input_graph'], tf.get_default_graph())
       self.assertEqual(kwargs['quant_delay'], 10)
       mock_summarize_col.assert_called_with('quant_vars')
示例#12
0
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           pipeline_config_file=None,
                           half=False):
    """Exports inference graph for the model specified in the pipeline config.

  Args:
    input_type: Type of input for the graph. Can be one of [`image_tensor`,
      `tf_example`].
    pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
    trained_checkpoint_prefix: Path to the trained checkpoint file.
    output_directory: Path to write outputs.
    input_shape: Sets a fixed shape for an `image_tensor` input. If not
      specified, will default to [None, None, None, 3].
    output_collection_name: Name of collection to add output tensors to.
      If None, does not add output tensors to a collection.
    additional_output_tensor_names: list of additional output
      tensors to include in the frozen graph.
  """
    detection_model = model_builder.build(pipeline_config.model,
                                          is_training=False)
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_file)
    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=False)
    _export_inference_graph(input_type,
                            detection_model,
                            pipeline_config,
                            trained_checkpoint_prefix,
                            output_directory,
                            additional_output_tensor_names,
                            input_shape,
                            output_collection_name,
                            graph_hook_fn=graph_rewriter_fn,
                            half=half)
    pipeline_config.eval_config.use_moving_averages = False
    config_text = text_format.MessageToString(pipeline_config)
    with tf.gfile.Open(os.path.join(output_directory, 'pipeline.config'),
                       'wb') as f:
        f.write(config_text)
示例#13
0
def main(unused_argv):
    assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
    assert FLAGS.eval_dir, '`eval_dir` is missing.'
    tf.gfile.MakeDirs(FLAGS.eval_dir)
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                  overwrite=True)

    model_config = configs['model']
    eval_config = configs['eval_config']
    input_config = configs['eval_input_config']
    if FLAGS.eval_training_data:
        input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=False)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    categories = label_map_util.create_categories_from_labelmap(
        input_config.label_map_path)

    if FLAGS.run_once:
        eval_config.max_evals = 1

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=False)

    evaluator.evaluate(create_input_dict_fn,
                       model_fn,
                       eval_config,
                       categories,
                       FLAGS.checkpoint_dir,
                       FLAGS.eval_dir,
                       graph_hook_fn=graph_rewriter_fn)
示例#14
0
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           write_inference_graph=False):
  """Exports inference graph for the model specified in the pipeline config.

  Args:
    input_type: Type of input for the graph. Can be one of ['image_tensor',
      'encoded_image_string_tensor', 'tf_example'].
    pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
    trained_checkpoint_prefix: Path to the trained checkpoint file.
    output_directory: Path to write outputs.
    input_shape: Sets a fixed shape for an `image_tensor` input. If not
      specified, will default to [None, None, None, 3].
    output_collection_name: Name of collection to add output tensors to.
      If None, does not add output tensors to a collection.
    additional_output_tensor_names: list of additional output
      tensors to include in the frozen graph.
    write_inference_graph: If true, writes inference graph to disk.
  """
  detection_model = model_builder.build(pipeline_config.model,
                                        is_training=False)
  graph_rewriter_fn = None
  if pipeline_config.HasField('graph_rewriter'):
    graph_rewriter_config = pipeline_config.graph_rewriter
    graph_rewriter_fn = graph_rewriter_builder.build(graph_rewriter_config,
                                                     is_training=False)
  _export_inference_graph(
      input_type,
      detection_model,
      pipeline_config.eval_config.use_moving_averages,
      trained_checkpoint_prefix,
      output_directory,
      additional_output_tensor_names,
      input_shape,
      output_collection_name,
      graph_hook_fn=graph_rewriter_fn,
      write_inference_graph=write_inference_graph)
  pipeline_config.eval_config.use_moving_averages = False
  config_util.save_pipeline_config(pipeline_config, output_directory)
示例#15
0
    def _train(self):
        tf.disable_eager_execution()
        ps_tasks = 0
        worker_replicas = 1
        worker_job_name = 'lonely_worker'
        task = 0
        is_chief = True
        master = ''
        graph_rewriter_fn = None
        # loading and reading  the config file
        configs = create_configs_from_pipeline_proto(self.pipeline)
        model_config = configs['model']
        train_config = configs['train_config']
        input_config = configs['train_input_config']
        # creating the tf object detection api model (from the config parameters)
        model_fn = functools.partial(model_builder.build,
                                     model_config=model_config,
                                     is_training=True)

        def get_next(config):
            return dataset_builder.make_initializable_iterator(
                dataset_builder.build(config)).get_next()

        create_input_dict_fn = functools.partial(get_next, input_config)
        if 'graph_rewriter_config' in configs:
            graph_rewriter_fn = graph_rewriter_builder.build(
                configs['graph_rewriter_config'], is_training=True)
        # training the model with the new parameters
        trainer.train(create_input_dict_fn,
                      model_fn,
                      train_config,
                      master,
                      task,
                      1,
                      worker_replicas,
                      False,
                      ps_tasks,
                      worker_job_name,
                      is_chief,
                      str(self._out_folder),
                      graph_hook_fn=graph_rewriter_fn)
示例#16
0
def export_tflite_graph(pipeline_config, trained_checkpoint_prefix, output_dir,
                        add_postprocessing_op, max_detections,
                        max_classes_per_detection):
  """Exports a tflite compatible graph and anchors for ssd detection model.

  Anchors are written to a tensor and tflite compatible graph
  is written to output_dir/tflite_graph.pb.

  Args:
    pipeline_config: a pipeline.proto object containing the configuration for
      SSD model to export.
    trained_checkpoint_prefix: a file prefix for the checkpoint containing the
      trained parameters of the SSD model.
    output_dir: A directory to write the tflite graph and anchor file to.
    add_postprocessing_op: If add_postprocessing_op is true: frozen graph adds a
      TFLite_Detection_PostProcess custom op
    max_detections: Maximum number of detections (boxes) to show
    max_classes_per_detection: Number of classes to display per detection


  Raises:
    ValueError: if the pipeline config contains models other than ssd or uses an
      fixed_shape_resizer and provides a shape as well.
  """
  tf.gfile.MakeDirs(output_dir)
  if pipeline_config.model.WhichOneof('model') != 'ssd':
    raise ValueError('Only ssd models are supported in tflite. '
                     'Found {} in config'.format(
                         pipeline_config.model.WhichOneof('model')))

  num_classes = pipeline_config.model.ssd.num_classes
  nms_score_threshold = {
      pipeline_config.model.ssd.post_processing.batch_non_max_suppression.
      score_threshold
  }
  nms_iou_threshold = {
      pipeline_config.model.ssd.post_processing.batch_non_max_suppression.
      iou_threshold
  }
  scale_values = {}
  scale_values['y_scale'] = {
      pipeline_config.model.ssd.box_coder.faster_rcnn_box_coder.y_scale
  }
  scale_values['x_scale'] = {
      pipeline_config.model.ssd.box_coder.faster_rcnn_box_coder.x_scale
  }
  scale_values['h_scale'] = {
      pipeline_config.model.ssd.box_coder.faster_rcnn_box_coder.height_scale
  }
  scale_values['w_scale'] = {
      pipeline_config.model.ssd.box_coder.faster_rcnn_box_coder.width_scale
  }

  image_resizer_config = pipeline_config.model.ssd.image_resizer
  image_resizer = image_resizer_config.WhichOneof('image_resizer_oneof')
  num_channels = _DEFAULT_NUM_CHANNELS
  if image_resizer == 'fixed_shape_resizer':
    height = image_resizer_config.fixed_shape_resizer.height
    width = image_resizer_config.fixed_shape_resizer.width
    if image_resizer_config.fixed_shape_resizer.convert_to_grayscale:
      num_channels = 1
    shape = [1, height, width, num_channels]
  else:
    raise ValueError(
        'Only fixed_shape_resizer'
        'is supported with tflite. Found {}'.format(
            image_resizer_config.WhichOneof('image_resizer_oneof')))

  image = tf.placeholder(
      tf.float32, shape=shape, name='normalized_input_image_tensor')

  detection_model = model_builder.build(
      pipeline_config.model, is_training=False)
  predicted_tensors = detection_model.predict(image, true_image_shapes=None)
  # The score conversion occurs before the post-processing custom op
  _, score_conversion_fn = post_processing_builder.build(
      pipeline_config.model.ssd.post_processing)
  class_predictions = score_conversion_fn(
      predicted_tensors['class_predictions_with_background'])

  with tf.name_scope('raw_outputs'):
    # 'raw_outputs/box_encodings': a float32 tensor of shape [1, num_anchors, 4]
    #  containing the encoded box predictions. Note that these are raw
    #  predictions and no Non-Max suppression is applied on them and
    #  no decode center size boxes is applied to them.
    tf.identity(predicted_tensors['box_encodings'], name='box_encodings')
    # 'raw_outputs/class_predictions': a float32 tensor of shape
    #  [1, num_anchors, num_classes] containing the class scores for each anchor
    #  after applying score conversion.
    tf.identity(class_predictions, name='class_predictions')
  # 'anchors': a float32 tensor of shape
  #   [4, num_anchors] containing the anchors as a constant node.
  tf.identity(
      get_const_center_size_encoded_anchors(predicted_tensors['anchors']),
      name='anchors')

  # Add global step to the graph, so we know the training step number when we
  # evaluate the model.
  tf.train.get_or_create_global_step()

  # graph rewriter
  is_quantized = pipeline_config.HasField('graph_rewriter')
  if is_quantized:
    graph_rewriter_config = pipeline_config.graph_rewriter
    graph_rewriter_fn = graph_rewriter_builder.build(
        graph_rewriter_config, is_training=False)
    graph_rewriter_fn()

  if pipeline_config.model.ssd.feature_extractor.HasField('fpn'):
    exporter.rewrite_nn_resize_op(is_quantized)

  # freeze the graph
  saver_kwargs = {}
  if pipeline_config.eval_config.use_moving_averages:
    saver_kwargs['write_version'] = saver_pb2.SaverDef.V1
    moving_average_checkpoint = tempfile.NamedTemporaryFile()
    exporter.replace_variable_values_with_moving_averages(
        tf.get_default_graph(), trained_checkpoint_prefix,
        moving_average_checkpoint.name)
    checkpoint_to_use = moving_average_checkpoint.name
  else:
    checkpoint_to_use = trained_checkpoint_prefix

  saver = tf.train.Saver(**saver_kwargs)
  input_saver_def = saver.as_saver_def()
  frozen_graph_def = exporter.freeze_graph_with_def_protos(
      input_graph_def=tf.get_default_graph().as_graph_def(),
      input_saver_def=input_saver_def,
      input_checkpoint=checkpoint_to_use,
      output_node_names=','.join([
          'raw_outputs/box_encodings', 'raw_outputs/class_predictions',
          'anchors'
      ]),
      restore_op_name='save/restore_all',
      filename_tensor_name='save/Const:0',
      clear_devices=True,
      output_graph='',
      initializer_nodes='')

  # Add new operation to do post processing in a custom op (TF Lite only)
  if add_postprocessing_op:
    transformed_graph_def = append_postprocessing_op(
        frozen_graph_def, max_detections, max_classes_per_detection,
        nms_score_threshold, nms_iou_threshold, num_classes, scale_values)
  else:
    # Return frozen without adding post-processing custom op
    transformed_graph_def = frozen_graph_def

  binary_graph = os.path.join(output_dir, 'tflite_graph.pb')
  with tf.gfile.GFile(binary_graph, 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())
  txt_graph = os.path.join(output_dir, 'tflite_graph.pbtxt')
  with tf.gfile.GFile(txt_graph, 'w') as f:
    f.write(str(transformed_graph_def))
示例#17
0
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  assert FLAGS.pipeline_config_path, '`pipeline_config_path` is missing'
  assert FLAGS.eval_dir, '`eval_dir` is missing.'

  configs = config_util.get_configs_from_pipeline_file(
      FLAGS.pipeline_config_path)
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.train_dir, 'pipeline.config'),
                  overwrite=True)

  tf.gfile.MakeDirs(FLAGS.eval_dir)
  tf.gfile.Copy(FLAGS.pipeline_config_path,
                os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                overwrite=True)

  model_config = configs['model']

  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  eval_config = configs['eval_config']
  if FLAGS.eval_training_data:
    eval_input_config = configs['train_input_config']
  else:
    eval_input_config = configs['eval_input_config']

  # setting to run evaluation after EPOCHS_BETWEEN_EVALS epochs of training.
  # total number of training is set to total_num_epochs provided in the config
  if train_config.num_steps:
    total_num_epochs = train_config.num_steps
    train_config.num_steps = FLAGS.epochs_between_evals
    total_training_cycle = total_num_epochs // train_config.num_steps
  else:
    # TODO(mehdi): make it run indef
    total_num_epochs = 20000000
    train_config.num_steps = FLAGS.epochs_between_evals
    total_training_cycle = total_num_epochs // train_config.num_steps

  train_model_fn = functools.partial(model_builder.build,
                                     model_config=model_config,
                                     is_training=True)
  eval_model_fn = functools.partial(model_builder.build,
                                    model_config=model_config,
                                    is_training=False)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  # functions to create a tensor input dictionary for both training & evaluation
  train_input_dict_fn = functools.partial(get_next, train_input_config)
  eval_input_dict_fn = functools.partial(get_next, eval_input_config)

  # If not explicitly specified in the constructor and the TF_CONFIG
  # environment variable is present, load cluster_spec from TF_CONFIG.
  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', {'type': 'master', 'index': 0})
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  parameter_server_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    parameter_server_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and parameter_server_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and parameter_server_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  label_map = label_map_util.load_labelmap(eval_input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(label_map,
                                                              max_num_classes)

  if FLAGS.run_once:
    eval_config.max_evals = 1

  train_graph_rewriter_fn = eval_graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    train_graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)
    eval_graph_rewriter_fn = graph_rewriter_builder.build(
        configs['eval_rewriter_config'], is_training=False)

  def train():
    return trainer.train(create_tensor_dict_fn=train_input_dict_fn,
                         create_model_fn=train_model_fn,
                         train_config=train_config, master=master, task=task,
                         num_clones=FLAGS.num_clones,
                         worker_replicas=worker_replicas,
                         clone_on_cpu=FLAGS.clone_on_cpu,
                         ps_tasks=parameter_server_tasks,
                         worker_job_name=worker_job_name,
                         is_chief=is_chief, train_dir=FLAGS.train_dir,
                         graph_hook_fn=train_graph_rewriter_fn)

  def evaluate():
    return evaluator.evaluate(eval_input_dict_fn, eval_model_fn, eval_config,
                              categories, FLAGS.train_dir, FLAGS.eval_dir,
                              graph_hook_fn=eval_graph_rewriter_fn)

  for cycle_index in range(total_training_cycle):
    tf.logging.info('Starting a training cycle: %d/%d',
                    cycle_index, total_training_cycle)
    train()
    tf.logging.info('Starting to evaluate.')
    eval_metrics = evaluate()
    if stopping_criteria_met(eval_metrics, FLAGS.mask_min_ap, FLAGS.box_min_ap):
      tf.logging.info('Stopping criteria met. Training stopped')
      break
示例#18
0
  server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                           job_name=task_info.type,
                          task_index=task_info.index)

  if task_info.type == 'ps':
    server.join()
    #return

  worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
  task = task_info.index
  is_chief = (task_info.type == 'master')
  master = server.target

graph_rewriter_fn = None
if 'graph_rewriter_config' in configs:
  graph_rewriter_fn = graph_rewriter_builder.build(
      configs['graph_rewriter_config'], is_training=True)

trainer.train(
    create_input_dict_fn,
    model_fn,
    train_config,
    master,
    task,
    FLAGS.num_clones,
    worker_replicas,
    FLAGS.clone_on_cpu,
    ps_tasks,
    worker_job_name,
    is_chief,
    train_dir,
    graph_hook_fn=graph_rewriter_fn)
示例#19
0
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config,
                              os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_util.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    print("%s" % str(env))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)
    print("cluster_data %s" % str(cluster_data))
    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        try:
            print("tf.train.Server")
            server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                     protocol='grpc',
                                     job_name=task_info.type,
                                     task_index=task_info.index)
        except KeyboardInterrupt:
            print("ctrl c END")
        if task_info.type == 'ps':
            print("ps")
            try:
                print("tf.Session")
                sess = tf.Session(server.target)
                print("create_done_queue: " + str(worker_replicas))
                queue = create_done_queue(task_info.index, worker_replicas,
                                          ps_tasks)

                # wait until all workers are done
                for i in range(worker_replicas):
                    sess.run(queue.dequeue())
                    print("ps %d received done %d" % (task_info.index, i))

                print("ps %d: quitting" % (task_info.index))
                # server.join()
                return
            except KeyboardInterrupt:
                print("ctrl c END")

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target
    print("is_chief:" + str(is_chief))

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    try:
        trainer.train(create_input_dict_fn,
                      model_fn,
                      train_config,
                      master,
                      task,
                      FLAGS.num_clones,
                      worker_replicas,
                      FLAGS.clone_on_cpu,
                      ps_tasks,
                      worker_job_name,
                      is_chief,
                      FLAGS.train_dir,
                      graph_hook_fn=graph_rewriter_fn)
    except KeyboardInterrupt:
        print("ctrl c END1")
    finally:
        if worker_replicas >= 1 and ps_tasks > 0:
            print("tf.Session")
            sess = tf.Session(server.target)
            print("end create_done_queues:" + str(worker_replicas))
            for q in create_done_queues(worker_replicas, ps_tasks):
                print("enqueue")
                sess.run(q.enqueue(1))
示例#20
0
def main():
    configs = config_util.get_configs_from_pipeline_file(CONFIG_PATH)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  NUM_CLONE,
                  worker_replicas,
                  CLONE_ON_CPU,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  TRAINING_DIR,
                  graph_hook_fn=graph_rewriter_fn)
示例#21
0
def get_eval_config(eval_dir,
                    pipeline_config_path,
                    eval_config_path,
                    model_config_path,
                    eval_input_config_path,
                    eval_training_data,
                    run_once):
    """Set variables for evaluating.
    """
    tf.gfile.MakeDirs(eval_dir)
    
    if pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            pipeline_config_path)
        tf.gfile.Copy(pipeline_config_path,
                      os.path.join(eval_dir, 'pipeline.config'),
                      overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=model_config_path,
            eval_config_path=eval_config_path,
            eval_input_config_path=eval_input_config_path)
        for name, config in [('model.config', model_config_path),
                             ('eval.config', eval_config_path),
                             ('input.config', eval_input_config_path)]:
            tf.gfile.Copy(config,
                        os.path.join(eval_dir, name),
                        overwrite=True)

    model_config = configs['model']
    eval_config = configs['eval_config']
    input_config = configs['eval_input_config']

    # Evaluation on Training Data
    if eval_training_data:
        input_config = configs['train_input_config']
        
    # Build model for Evaluation
    model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=False)

    label_map = label_map_util.load_labelmap(input_config.label_map_path)
    max_num_classes = max([item.id for item in label_map.item])
    categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

    # Input fn
    create_input_dict_fn = functools.partial(get_next, input_config)

    # Single evaluation step
    if run_once:
        eval_config.max_evals = 1

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=False)
        
    return(create_input_dict_fn,
           model_fn,
           eval_config,
           categories,
           graph_rewriter_fn)
示例#22
0
文件: train.py 项目: ALISCIFP/models
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  trainer.train(
      create_input_dict_fn,
      model_fn,
      train_config,
      master,
      task,
      FLAGS.num_clones,
      worker_replicas,
      FLAGS.clone_on_cpu,
      ps_tasks,
      worker_job_name,
      is_chief,
      FLAGS.train_dir,
      graph_hook_fn=graph_rewriter_fn)
示例#23
0
def train_tensorflow_object_detection_api(base_config_path, to_save_path,
                                          dataset_path, num_steps):

    os.makedirs(to_save_path, exist_ok=True)

    if len(os.listdir(to_save_path)) == 0:

        do_xml_to_csv(to_save_path, dataset_path)
        generate_config(base_config_path, to_save_path, num_steps)
        generate_tfrecords(base_config_path, to_save_path, dataset_path)

    checkpoint_path = "{}/training".format(to_save_path)
    config_file_path = "{}/config/faster_rcnn_inception_v2_pets.config".format(
        to_save_path)

    tf.logging.set_verbosity(tf.logging.INFO)
    flags = tf.app.flags
    flags.DEFINE_string('f', '', 'kernel')
    flags.DEFINE_string('master', '', 'Name of the TensorFlow master to use.')
    flags.DEFINE_integer('task', 0, 'task id')
    flags.DEFINE_integer('num_clones', 1,
                         'Number of clones to deploy per worker.')
    flags.DEFINE_boolean(
        'clone_on_cpu', False,
        'Force clones to be deployed on CPU.  Note that even if '
        'set to False (allowing ops to run on gpu), some ops may '
        'still be run on the CPU if they have no GPU kernel.')
    flags.DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer '
                         'replicas.')
    flags.DEFINE_integer(
        'ps_tasks', 0,
        'Number of parameter server tasks. If None, does not use '
        'a parameter server.')
    flags.DEFINE_string(
        'train_dir', '{}'.format(checkpoint_path),
        'Directory to save the checkpoints and training summaries.')
    flags.DEFINE_string(
        'pipeline_config_path', '{}'.format(config_file_path),
        'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
        'file. If provided, other configs are ignored')

    flags.DEFINE_string('train_config_path', '',
                        'Path to a train_pb2.TrainConfig config file.')
    flags.DEFINE_string(
        'input_config_path', '',
        'Path to an input_reader_pb2.InputReader config file.')
    flags.DEFINE_string('model_config_path', '',
                        'Path to a model_pb2.DetectionModel config file.')

    FLAGS = flags.FLAGS

    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config,
                              os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))

    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None

    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    # get TF logger
    log = logging.getLogger('tensorflow')
    log.setLevel(logging.INFO)
    # create formatter and add it to the handlers
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # create file handler which logs even debug messages
    fh = logging.FileHandler('{}training/training.log'.format(to_save_path))
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    log.addHandler(fh)

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  FLAGS.num_clones,
                  worker_replicas,
                  FLAGS.clone_on_cpu,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)


# # tf.app.run(main=train_tensorflow_object_detection_api)
# base_config_path = '/home/aniruddh/lincode/product/livis-develop_v2/gpu_q/tf_training/'
# to_save_path = '/home/aniruddh/lincode/product/livis-develop_v2/experiments/tensorflow/t2/'
# dataset_path = '/home/aniruddh/lincode/product/images/'
# num_steps = 15000
# train_tensorflow_object_detection_api(base_config_path, to_save_path, dataset_path, num_steps)

# os.makedirs(to_save_path, exist_ok=True)

# do_xml_to_csv(to_save_path, dataset_path)
# config_path = generate_config(base_config_path, to_save_path, num_steps)
# generate_tfrecords(base_config_path, to_save_path, dataset_path)
# train_tf_model(base_config_path, to_save_path, dataset_path, num_steps)
示例#24
0
def main(unused_argv):
    assert FLAGS.checkpoint_dir, '`checkpoint_dir` is missing.'
    assert FLAGS.eval_dir, '`eval_dir` is missing.'
    tf.gfile.MakeDirs(FLAGS.eval_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        tf.gfile.Copy(FLAGS.pipeline_config_path,
                      os.path.join(FLAGS.eval_dir, 'pipeline.config'),
                      overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            eval_config_path=FLAGS.eval_config_path,
            eval_input_config_path=FLAGS.input_config_path)
        for name, config in [('model.config', FLAGS.model_config_path),
                             ('eval.config', FLAGS.eval_config_path),
                             ('input.config', FLAGS.input_config_path)]:
            tf.gfile.Copy(config,
                          os.path.join(FLAGS.eval_dir, name),
                          overwrite=True)

    model_config = configs['model']
    eval_config = configs['eval_config']
    input_config = configs['eval_input_config']
    if FLAGS.eval_training_data:
        input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=False)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    categories = label_map_util.create_categories_from_labelmap(
        input_config.label_map_path)

    if FLAGS.run_once:
        eval_config.max_evals = 1

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=False)

    metrics_dict = evaluator.evaluate(create_input_dict_fn,
                                      model_fn,
                                      eval_config,
                                      categories,
                                      FLAGS.checkpoint_dir,
                                      FLAGS.eval_dir,
                                      graph_hook_fn=graph_rewriter_fn)

    with open(FLAGS.output_json_path, 'w') as op_json_file:
        temp_dict = {}
        for key, value in metrics_dict.items():
            temp_dict[key] = str(value)

        json.dump(temp_dict, op_json_file)
示例#25
0
def main(pipeline_config_path, checkpoint_dir, eval_dir, eval_training_data=False, 
        eval_config_path="", input_config_path="", 
        model_config_path="", run_once=False):
  """
  DEFINE_boolean('eval_training_data', False,
                       'If training data should be evaluated for this job.')
  DEFINE_string('checkpoint_dir', '',
                      'Directory containing checkpoints to evaluate, typically '
                      'set to `train_dir` used in the training job.')
  DEFINE_string('eval_dir', '',
                      'Directory to write eval summaries to.')
  DEFINE_string('pipeline_config_path', '',
                      'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
                      'file. If provided, other configs are ignored')
  DEFINE_string('eval_config_path', '',
                      'Path to an eval_pb2.EvalConfig config file.')
  DEFINE_string('input_config_path', '',
                      'Path to an input_reader_pb2.InputReader config file.')
  DEFINE_string('model_config_path', '',
                      'Path to a model_pb2.DetectionModel config file.')
  DEFINE_boolean('run_once', False, 'Option to only run a single pass of '
                       'evaluation. Overrides the `max_evals` parameter in the '
                       'provided config.')
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.gfile.MakeDirs(eval_dir)
  if pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        pipeline_config_path)
    tf.gfile.Copy(pipeline_config_path,
                  os.path.join(eval_dir, 'pipeline.config'),
                  overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=model_config_path,
        eval_config_path=eval_config_path,
        eval_input_config_path=input_config_path)
    for name, config in [('model.config', model_config_path),
                         ('eval.config', eval_config_path),
                         ('input.config', input_config_path)]:
      tf.gfile.Copy(config,
                    os.path.join(eval_dir, name),
                    overwrite=True)

  model_config = configs['model']
  eval_config = configs['eval_config']
  input_config = configs['eval_input_config']
  if eval_training_data:
    input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=False)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  label_map = label_map_util.load_labelmap(input_config.label_map_path)
  max_num_classes = max([item.id for item in label_map.item])
  categories = label_map_util.convert_label_map_to_categories(
      label_map, max_num_classes)

  if run_once:
    eval_config.max_evals = 1

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=False)

  evaluator.evaluate(
      create_input_dict_fn,
      model_fn,
      eval_config,
      categories,
      checkpoint_dir,
      eval_dir,
      graph_hook_fn=graph_rewriter_fn)
示例#26
0
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

        Args:
            features: Dictionary of feature tensors, returned from `input_fn`.
            labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
            otherwise None.
            mode: Mode key from tf.estimator.ModeKeys.
            params: Parameter dictionary passed from the estimator.

        Returns:
            An `EstimatorSpec` that encapsulates the model and its serving
            configurations.
        """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))

        scaffold_fn = None
        scaffold = None
        eval_metric_ops = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            # get the optimizer and global step:
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

            #get the trainable variables
            #trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = tf.contrib.framework.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            #get the clip_gradients_value
            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            total_loss = 0.
            tower_grads = []
            with tf.variable_scope(tf.get_variable_scope()):
                feature_list, label_list = split_features_and_labels(
                    features, labels, train_config.GPU_num)
                for i in xrange(train_config.GPU_num):
                    with tf.device('/gpu:%d' % i):
                        with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                            loss = tower_loss(scope=scope,
                                              features=feature_list[i],
                                              labels=label_list[i],
                                              detection_model=detection_model,
                                              train_config=train_config)
                            tf.get_variable_scope().reuse_variables()
                            grads = training_optimizer.compute_gradients(
                                loss=loss)
                            if isinstance(clip_gradients_value, float):
                                grads = clip_gradients_by_norm(
                                    grads, clip_gradients_value)
                            tower_grads.append(grads)
                            total_loss += loss
            total_loss /= train_config.GPU_num
            grad_avg = average_gradients(tower_grads)

            with tf.control_dependencies(
                    tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                apply_gradient_op = training_optimizer.apply_gradients(
                    grads_and_vars=grad_avg, global_step=global_step)

            train_op = apply_gradient_op

            if train_config.fine_tune_checkpoint:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        elif mode == tf.estimator.ModeKeys.EVAL:
            detection_model = detection_model_fn(is_training=is_training,
                                                 add_summaries=(not use_tpu))
            # For evaling on train data, it is necessary to check whether groundtruth
            # must be unpadded.
            #in mode == tf.estimator.ModeKeys.EVAL or mode == tf.estimator.ModeKeys.PREDICT, I explictly set the evaluation and prediction to run on CPU
            with tf.device('/cpu:1'):
                # training_optimizer, optimizer_summary_vars = optimizer_builder.build( train_config.optimizer )
                boxes_shape = (labels[fields.InputDataFields.
                                      groundtruth_boxes].get_shape().as_list())
                unpad_groundtruth_tensors = boxes_shape[
                    1] is not None and not use_tpu
                labels = unstack_batch(
                    labels,
                    unpad_groundtruth_tensors=unpad_groundtruth_tensors)

                gt_boxes_list = labels[
                    fields.InputDataFields.groundtruth_boxes]
                gt_classes_list = labels[
                    fields.InputDataFields.groundtruth_classes]
                gt_masks_list = None
                if fields.InputDataFields.groundtruth_instance_masks in labels:
                    gt_masks_list = labels[
                        fields.InputDataFields.groundtruth_instance_masks]
                gt_keypoints_list = None
                if fields.InputDataFields.groundtruth_keypoints in labels:
                    gt_keypoints_list = labels[
                        fields.InputDataFields.groundtruth_keypoints]
                gt_weights_list = None
                if fields.InputDataFields.groundtruth_weights in labels:
                    gt_weights_list = labels[
                        fields.InputDataFields.groundtruth_weights]
                gt_confidences_list = None
                if fields.InputDataFields.groundtruth_confidences in labels:
                    gt_confidences_list = labels[
                        fields.InputDataFields.groundtruth_confidences]
                gt_is_crowd_list = None
                if fields.InputDataFields.groundtruth_is_crowd in labels:
                    gt_is_crowd_list = labels[
                        fields.InputDataFields.groundtruth_is_crowd]
                detection_model.provide_groundtruth(
                    groundtruth_boxes_list=gt_boxes_list,
                    groundtruth_classes_list=gt_classes_list,
                    groundtruth_confidences_list=gt_confidences_list,
                    groundtruth_masks_list=gt_masks_list,
                    groundtruth_keypoints_list=gt_keypoints_list,
                    groundtruth_weights_list=gt_weights_list,
                    groundtruth_is_crowd_list=gt_is_crowd_list)

                training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                    train_config.optimizer)

                preprocessed_images = features[fields.InputDataFields.image]
                if use_tpu and train_config.use_bfloat16:
                    with tf.contrib.tpu.bfloat16_scope():
                        prediction_dict = detection_model.predict(
                            preprocessed_images,
                            features[fields.InputDataFields.true_image_shape])
                    for k, v in prediction_dict.items():
                        if v.dtype == tf.bfloat16:
                            prediction_dict[k] = tf.cast(v, tf.float32)
                else:
                    prediction_dict = detection_model.predict(
                        preprocessed_images,
                        features[fields.InputDataFields.true_image_shape])

                detections = detection_model.postprocess(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])

                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = [loss_tensor for loss_tensor in losses_dict.values()]
                if train_config.add_regularization_loss:
                    regularization_losses = detection_model.regularization_losses(
                    )
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                losses.append(regularization_loss)
                losses_dict['Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

                if 'graph_rewriter_config' in configs:
                    graph_rewriter_fn = graph_rewriter_builder.build(
                        configs['graph_rewriter_config'],
                        is_training=is_training)
                    graph_rewriter_fn()

                class_agnostic = (
                    fields.DetectionResultFields.detection_classes
                    not in detections)
                groundtruth = _prepare_groundtruth_for_eval(
                    detection_model, class_agnostic,
                    eval_input_config.max_number_of_boxes)
                use_original_images = fields.InputDataFields.original_image in features
                if use_original_images:
                    eval_images = features[
                        fields.InputDataFields.original_image]
                    true_image_shapes = tf.slice(
                        features[fields.InputDataFields.true_image_shape],
                        [0, 0], [-1, 3])
                    original_image_spatial_shapes = features[
                        fields.InputDataFields.original_image_spatial_shape]
                else:
                    eval_images = features[fields.InputDataFields.image]
                    true_image_shapes = None
                    original_image_spatial_shapes = None

                eval_dict = eval_util.result_dict_for_batched_example(
                    eval_images,
                    features[inputs.HASH_KEY],
                    detections,
                    groundtruth,
                    class_agnostic=class_agnostic,
                    scale_to_absolute=True,
                    original_image_spatial_shapes=original_image_spatial_shapes,
                    true_image_shapes=true_image_shapes)

                if class_agnostic:
                    category_index = label_map_util.create_class_agnostic_category_index(
                    )
                else:
                    category_index = label_map_util.create_category_index_from_labelmap(
                        eval_input_config.label_map_path)
                vis_metric_ops = None
                if not use_tpu and use_original_images:
                    eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                        category_index,
                        max_examples_to_draw=eval_config.num_visualizations,
                        max_boxes_to_draw=eval_config.
                        max_num_boxes_to_visualize,
                        min_score_thresh=eval_config.min_score_threshold,
                        use_normalized_coordinates=False)
                    vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                        eval_dict)

                # Eval metrics on a single example.
                eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                    eval_config, category_index.values(), eval_dict)
                for loss_key, loss_tensor in iter(losses_dict.items()):
                    eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
                for var in optimizer_summary_vars:
                    eval_metric_ops[var.op.name] = (var, tf.no_op())
                if vis_metric_ops is not None:
                    eval_metric_ops.update(vis_metric_ops)
                eval_metric_ops = {
                    str(k): v
                    for k, v in eval_metric_ops.items()
                }

                if eval_config.use_moving_averages:
                    variable_averages = tf.train.ExponentialMovingAverage(0.0)
                    variables_to_restore = variable_averages.variables_to_restore(
                    )
                    keep_checkpoint_every_n_hours = (
                        train_config.keep_checkpoint_every_n_hours)
                    saver = tf.train.Saver(variables_to_restore,
                                           keep_checkpoint_every_n_hours=
                                           keep_checkpoint_every_n_hours)
                    scaffold = tf.train.Scaffold(saver=saver)

        elif mode == tf.estimator.ModeKeys.PREDICT:
            detection_model = detection_model_fn(is_training=is_training,
                                                 add_summaries=(not use_tpu))
            #similar to EVAL mode, I run PREDICT on CPU too.
            with tf.device(':/cpu:1'):
                preprocessed_images = features[fields.InputDataFields.image]

                if use_tpu and train_config.use_bfloat16:
                    with tf.contrib.tpu.bfloat16_scope():
                        prediction_dict = detection_model.predict(
                            preprocessed_images,
                            features[fields.InputDataFields.true_image_shape])
                        for k, v in prediction_dict.items():
                            if v.dtype == tf.bfloat16:
                                prediction_dict[k] = tf.cast(v, tf.float32)
                else:
                    prediction_dict = detection_model.predict(
                        preprocessed_images,
                        features[fields.InputDataFields.true_image_shape])

                detections = detection_model.postprocess(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])

                exported_output = exporter_lib.add_output_tensor_nodes(
                    detections)
                export_outputs = {
                    tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                    tf.estimator.export.PredictOutput(exported_output)
                }

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            #scafold here only contains Saver
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)

            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
示例#27
0
  def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Make sure to set the Keras learning phase. True during training,
    # False for inference.
    tf.keras.backend.set_learning_phase(is_training)
    detection_model = detection_model_fn(
        is_training=is_training, add_summaries=(not use_tpu))
    scaffold_fn = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      labels = unstack_batch(
          labels,
          unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
      # For evaling on train data, it is necessary to check whether groundtruth
      # must be unpadded.
      boxes_shape = (
          labels[fields.InputDataFields.groundtruth_boxes].get_shape()
          .as_list())
      unpad_groundtruth_tensors = boxes_shape[1] is not None and not use_tpu
      labels = unstack_batch(
          labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
      gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
      gt_masks_list = None
      if fields.InputDataFields.groundtruth_instance_masks in labels:
        gt_masks_list = labels[
            fields.InputDataFields.groundtruth_instance_masks]
      gt_keypoints_list = None
      if fields.InputDataFields.groundtruth_keypoints in labels:
        gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
      gt_weights_list = None
      if fields.InputDataFields.groundtruth_weights in labels:
        gt_weights_list = labels[fields.InputDataFields.groundtruth_weights]
      gt_confidences_list = None
      if fields.InputDataFields.groundtruth_confidences in labels:
        gt_confidences_list = labels[
            fields.InputDataFields.groundtruth_confidences]
      gt_is_crowd_list = None
      if fields.InputDataFields.groundtruth_is_crowd in labels:
        gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
      detection_model.provide_groundtruth(
          groundtruth_boxes_list=gt_boxes_list,
          groundtruth_classes_list=gt_classes_list,
          groundtruth_confidences_list=gt_confidences_list,
          groundtruth_masks_list=gt_masks_list,
          groundtruth_keypoints_list=gt_keypoints_list,
          groundtruth_weights_list=gt_weights_list,
          groundtruth_is_crowd_list=gt_is_crowd_list)

    preprocessed_images = features[fields.InputDataFields.image]
    if use_tpu and train_config.use_bfloat16:
      with tf.contrib.tpu.bfloat16_scope():
        prediction_dict = detection_model.predict(
            preprocessed_images,
            features[fields.InputDataFields.true_image_shape])
        for k, v in prediction_dict.items():
          if v.dtype == tf.bfloat16:
            prediction_dict[k] = tf.cast(v, tf.float32)
    else:
      prediction_dict = detection_model.predict(
          preprocessed_images,
          features[fields.InputDataFields.true_image_shape])

    def postprocess_wrapper(args):
      return detection_model.postprocess(args[0], args[1])

    if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
      if use_tpu and postprocess_on_cpu:
        detections = tf.contrib.tpu.outside_compilation(
            postprocess_wrapper,
            (prediction_dict,
             features[fields.InputDataFields.true_image_shape]))
      else:
        detections = postprocess_wrapper((
            prediction_dict,
            features[fields.InputDataFields.true_image_shape]))

    if mode == tf.estimator.ModeKeys.TRAIN:
      if train_config.fine_tune_checkpoint and hparams.load_pretrained:
        if not train_config.fine_tune_checkpoint_type:
          # train_config.from_detection_checkpoint field is deprecated. For
          # backward compatibility, set train_config.fine_tune_checkpoint_type
          # based on train_config.from_detection_checkpoint.
          if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
          else:
            train_config.fine_tune_checkpoint_type = 'classification'
        asg_map = detection_model.restore_map(
            fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map,
                train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:

          def tpu_scaffold():
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)
            return tf.train.Scaffold()

          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                        available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      losses_dict = detection_model.loss(
          prediction_dict, features[fields.InputDataFields.true_image_shape])
      losses = [loss_tensor for loss_tensor in losses_dict.values()]
      if train_config.add_regularization_loss:
        regularization_losses = detection_model.regularization_losses()
        if regularization_losses:
          regularization_loss = tf.add_n(
              regularization_losses, name='regularization_loss')
          losses.append(regularization_loss)
          losses_dict['Loss/regularization_loss'] = regularization_loss
      total_loss = tf.add_n(losses, name='total_loss')
      losses_dict['Loss/total_loss'] = total_loss

      if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=is_training)
        graph_rewriter_fn()

      # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
      # can write learning rate summaries on TPU without host calls.
      global_step = tf.train.get_or_create_global_step()
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)

    if mode == tf.estimator.ModeKeys.TRAIN:
      if use_tpu:
        training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
            training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      trainable_variables = None
      include_variables = (
          train_config.update_trainable_variables
          if train_config.update_trainable_variables else None)
      exclude_variables = (
          train_config.freeze_variables
          if train_config.freeze_variables else None)
      trainable_variables = tf.contrib.framework.filter_variables(
          tf.trainable_variables(),
          include_patterns=include_variables,
          exclude_patterns=exclude_variables)

      clip_gradients_value = None
      if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

      if not use_tpu:
        for var in optimizer_summary_vars:
          tf.summary.scalar(var.op.name, var)
      summaries = [] if use_tpu else None
      if train_config.summarize_gradients:
        summaries = ['gradients', 'gradient_norm', 'global_gradient_norm']
      train_op = tf.contrib.layers.optimize_loss(
          loss=total_loss,
          global_step=global_step,
          learning_rate=None,
          clip_gradients=clip_gradients_value,
          optimizer=training_optimizer,
          update_ops=detection_model.updates(),
          variables=trainable_variables,
          summaries=summaries,
          name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
      exported_output = exporter_lib.add_output_tensor_nodes(detections)
      export_outputs = {
          tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
              tf.estimator.export.PredictOutput(exported_output)
      }

    eval_metric_ops = None
    scaffold = None
    if mode == tf.estimator.ModeKeys.EVAL:
      class_agnostic = (
          fields.DetectionResultFields.detection_classes not in detections)
      groundtruth = _prepare_groundtruth_for_eval(
          detection_model, class_agnostic,
          eval_input_config.max_number_of_boxes)
      use_original_images = fields.InputDataFields.original_image in features
      if use_original_images:
        eval_images = features[fields.InputDataFields.original_image]
        true_image_shapes = tf.slice(
            features[fields.InputDataFields.true_image_shape], [0, 0], [-1, 3])
        original_image_spatial_shapes = features[fields.InputDataFields
                                                 .original_image_spatial_shape]
      else:
        eval_images = features[fields.InputDataFields.image]
        true_image_shapes = None
        original_image_spatial_shapes = None

      eval_dict = eval_util.result_dict_for_batched_example(
          eval_images,
          features[inputs.HASH_KEY],
          detections,
          groundtruth,
          class_agnostic=class_agnostic,
          scale_to_absolute=True,
          original_image_spatial_shapes=original_image_spatial_shapes,
          true_image_shapes=true_image_shapes)

      if class_agnostic:
        category_index = label_map_util.create_class_agnostic_category_index()
      else:
        category_index = label_map_util.create_category_index_from_labelmap(
            eval_input_config.label_map_path)
      vis_metric_ops = None
      if not use_tpu and use_original_images:
        eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
            category_index,
            max_examples_to_draw=eval_config.num_visualizations,
            max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
            min_score_thresh=eval_config.min_score_threshold,
            use_normalized_coordinates=False)
        vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
            eval_dict)

      # Eval metrics on a single example.
      eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
          eval_config, list(category_index.values()), eval_dict)
      for loss_key, loss_tensor in iter(losses_dict.items()):
        eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
      for var in optimizer_summary_vars:
        eval_metric_ops[var.op.name] = (var, tf.no_op())
      if vis_metric_ops is not None:
        eval_metric_ops.update(vis_metric_ops)
      eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

      if eval_config.use_moving_averages:
        variable_averages = tf.train.ExponentialMovingAverage(0.0)
        variables_to_restore = variable_averages.variables_to_restore()
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            variables_to_restore,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)
        scaffold = tf.train.Scaffold(saver=saver)

    # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
    if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metrics=eval_metric_ops,
          export_outputs=export_outputs)
    else:
      if scaffold is None:
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            sharded=True,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
            save_relative_paths=True)
        tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
        scaffold = tf.train.Scaffold(saver=saver)
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metric_ops=eval_metric_ops,
          export_outputs=export_outputs,
          scaffold=scaffold)
示例#28
0
    def start_training(self):
        """Start training for the model"""
        worker_replicas = 1
        ps_tasks = 0
        clone_on_cpu = False
        num_clones = 1

        ensure_path(config.BASE_MODELS_PATH)
        train_dir = self.train_dir
        model_json_path = os.path.join(train_dir, 'job.json')

        job = self.job
        num_steps = int(job['steps'])

        try:
            if config.DEBUG:
                num_steps = 50
        except AttributeError:
            pass
        except Exception as e:
            _LOGGER.error(e)

        job = api.update_job_state(job, 'training', 'Start training for {} steps'.format(num_steps))

        model = self.model
        ensure_path(config.EXPORTED_MODELS)
        model_graph = os.path.join(config.EXPORTED_MODELS, '{}.pb'.format(model['file_name']))

        if not os.path.exists(os.path.join(train_dir, 'checkpoint')):  # New training started
            _LOGGER.debug("Checkpoints doesn't exists")

            base_checkpoints_path = os.path.join(config.BASE_MODELS_PATH, model['architecture'])
            _tmf = os.path.join(config.TRAINED_MODELS_DATA, model['file_name'])
            if os.path.isdir(_tmf):
                _LOGGER.debug("Model already exists as %s" % model_graph)
                base_checkpoints_path = _tmf
            elif model['type'] == 'new':
                _LOGGER.debug("model type new")
            else:
                _LOGGER.debug("New model from parent model")
                parent_model = api.get_model(model['parent'])
                if not parent_model:
                    raise Exception('Parent model not found on server')

                parent_tmf = os.path.join(config.TRAINED_MODELS_DATA, parent_model['file_name'])
                if os.path.isdir(parent_tmf):
                    base_checkpoints_path = parent_tmf
                else:
                    _LOGGER.error("Parent model not found. please train it first")
                    return False

            if not os.path.exists(os.path.join(base_checkpoints_path, 'model.ckpt.meta')):
                _LOGGER.debug("Base model not found for %s, Downloading now." % model['architecture'])
                _f = api.download_model_files(model['architecture'])

                tmp_model_data = os.path.join(config.DATA_DIR, 'tmp_model_data')
                if tarfile.is_tarfile(_f):
                    if os.path.exists(tmp_model_data):
                        shutil.rmtree(tmp_model_data)
                    ensure_path(tmp_model_data)
                    print("Tar file found")
                    shutil.unpack_archive(_f, tmp_model_data)
                    for root, dirs, files in os.walk(tmp_model_data):
                        for file in files:
                            if 'model.ckpt' in file:
                                path = os.path.join(root, file)
                                # print(path)
                                ensure_path(base_checkpoints_path)
                                shutil.copy(path, os.path.join(base_checkpoints_path, file))
                else:
                    _LOGGER.error("Invalid file")
                    return False
            if os.path.exists(train_dir):
                shutil.rmtree(train_dir)
            shutil.copytree(base_checkpoints_path, train_dir)
            if os.path.exists(os.path.join(train_dir, 'checkpoint')):
                os.remove(os.path.join(train_dir, 'checkpoint'))

        if os.path.exists(os.path.join(train_dir, 'data')):
            shutil.rmtree(os.path.join(train_dir, 'data'))
        shutil.copytree(self.data_dir, os.path.join(train_dir, 'data'))

        counts = {'train': 0, 'test': 1000, 'classes': 1}
        stats_file = os.path.join(train_dir, "data", "stats.json")
        try:
            with open(stats_file) as _f:
                counts = json.load(_f)
        except:
            pass

        pipeline_config_path = os.path.join(train_dir, 'pipeline.config')
        if not os.path.exists(pipeline_config_path):
            pipeline_config_path = os.path.join(self.configs_dir, "{}.config".format(model['architecture']))
        task = '0'
        if task == '0':
            tf.gfile.MakeDirs(train_dir)
        if pipeline_config_path:
            _LOGGER.info("Pipeline config file : {}".format(pipeline_config_path))
            configs = config_util.get_configs_from_pipeline_file(
                pipeline_config_path)
            if task == '0':
                tf.gfile.Copy(pipeline_config_path,
                              os.path.join(train_dir, 'pipeline.config'),
                              overwrite=True)
        else:
            _LOGGER.error("No config found")
            return False

        pipeline_config_path = os.path.join(train_dir, 'pipeline.config')

        # with open(model_json_path, 'w') as mf:
        #     json.dump(job, mf)

        model_config = configs['model']
        train_config = configs['train_config']
        input_config = configs['train_input_config']


        if model_config.HasField('faster_rcnn'):
            model_config.faster_rcnn.num_classes = counts['classes']

        if model_config.HasField('ssd'):
            model_config.ssd.num_classes = counts['classes']

        # Set num_steps
        train_config.num_steps = num_steps
        train_config.fine_tune_checkpoint = os.path.join(train_dir, 'model.ckpt')

        # Update input config to use updated list of input
        input_config.tf_record_input_reader.ClearField('input_path')
        input_config.tf_record_input_reader.input_path.append(os.path.join(train_dir, 'data', "train_baheads.tfrecord-??????"))
        input_config.label_map_path = os.path.join(train_dir, 'data', "labels.pbtxt")

        eval_config = configs['eval_config']
        eval_input_config = configs['eval_input_config']

        eval_config.num_examples = counts['test']
        eval_config.max_evals = 1

        # Update input config to use updated list of input
        eval_input_config.tf_record_input_reader.ClearField('input_path')
        eval_input_config.tf_record_input_reader.input_path.append(os.path.join(train_dir, 'data', "test_baheads.tfrecord-??????"))
        eval_input_config.label_map_path = os.path.join(train_dir, 'data', "labels.pbtxt")

        # Save the updated config to pipeline file
        config_util.save_pipeline_config(config_util.create_pipeline_proto_from_configs({
            'model': model_config,
            'train_config': train_config,
            'train_input_config': input_config,
            'eval_config': eval_config,
            'eval_input_config': eval_input_config

        }), train_dir)

        model_fn = functools.partial(
            model_builder.build,
            model_config=model_config,
            is_training=True)

        def get_next(config):
            return dataset_builder.make_initializable_iterator(
                dataset_builder.build(config)).get_next()

        create_input_dict_fn = functools.partial(get_next, input_config)

        env = json.loads(os.environ.get('TF_CONFIG', '{}'))
        cluster_data = env.get('cluster', None)
        cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
        task_data = env.get('task', None) or {'type': 'master', 'index': 0}
        task_info = type('TaskSpec', (object,), task_data)

        # Parameters for a single worker.
        ps_tasks = 0
        worker_replicas = 1
        worker_job_name = 'lonely_worker'
        task = 0
        is_chief = True
        master = ''

        if cluster_data and 'worker' in cluster_data:
            # Number of total worker replicas include "worker"s and the "master".
            worker_replicas = len(cluster_data['worker']) + 1
        if cluster_data and 'ps' in cluster_data:
            ps_tasks = len(cluster_data['ps'])

        if worker_replicas > 1 and ps_tasks < 1:
            raise ValueError('At least 1 ps task is needed for distributed training.')

        if worker_replicas >= 1 and ps_tasks > 0:
            # Set up distributed training.
            server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                     job_name=task_info.type,
                                     task_index=task_info.index)
            if task_info.type == 'ps':
                server.join()
                return

            worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
            task = task_info.index
            is_chief = (task_info.type == 'master')
            master = server.target

        graph_rewriter_fn = None
        if 'graph_rewriter_config' in configs:
            graph_rewriter_fn = graph_rewriter_builder.build(
                configs['graph_rewriter_config'], is_training=True)

        if not os.path.exists(os.path.join(train_dir, 'model.ckpt-{}.meta'.format(num_steps))):
            status_timer = StatusThread(tfh, num_steps, job)
            status_timer.start()
            try:
                trainer.train(
                    create_input_dict_fn,
                    model_fn,
                    train_config,
                    master,
                    task,
                    num_clones,
                    worker_replicas,
                    clone_on_cpu,
                    ps_tasks,
                    worker_job_name,
                    is_chief,
                    train_dir,
                    graph_hook_fn=graph_rewriter_fn)
            except KeyboardInterrupt:
                raise
            finally:
                status_timer.stop()
                if status_timer.is_alive():
                    _LOGGER.info("Waiting for status thread to close")
                    status_timer.join()

        if os.path.exists(os.path.join(train_dir, 'model.ckpt-{}.meta'.format(num_steps))):
            # Training complete. Export model
            _LOGGER.debug("Training complete for %d steps" % num_steps)
            job = api.update_job_state(job, 'training', 'Training complete')
            export_path = os.path.join(config.TRAINED_MODELS_DATA, model['file_name'])
            if os.path.exists(export_path):
                shutil.rmtree(export_path)
            ckpt_path = os.path.join(train_dir, 'model.ckpt-{}'.format(num_steps))
            exporter.export(pipeline_config_path, export_path, ckpt_path)

            frozen_graph = os.path.join(export_path, 'frozen_inference_graph.pb')

            if os.path.exists(frozen_graph):  # Successfully exported
                shutil.copy(frozen_graph, model_graph)
                shutil.copy(
                    os.path.join(train_dir, 'data', "labels.pbtxt"),
                    os.path.join(config.EXPORTED_MODELS, '{}.pbtxt'.format(model['file_name']))
                )
                # TODO: Eval the trained graph, Push the result to server.
                eval_dir = 'eval_dir'
                tf.reset_default_graph()
                eval_result = run_eval(train_dir, eval_dir, pipeline_config_path, counts['test'])
                if 'PascalBoxes_Precision/[email protected]' in eval_result:
                    acc = eval_result['PascalBoxes_Precision/[email protected]'] * 100
                    _LOGGER.info("PascalBoxes_Precision/[email protected] : %d %%" % (acc))
                    job = api.update_job_state(job, 'complete', 'PascalBoxes_Precision %d %%' % (acc))
                _LOGGER.info(eval_result)
                if os.path.exists(train_dir):
                    shutil.rmtree(train_dir)
                return True

        return False
示例#29
0
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(labels,
                                   unpad_groundtruth_tensors=train_config.
                                   unpad_groundtruth_tensors)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # For evaling on train data, it is necessary to check whether groundtruth
            # must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = True if boxes_shape[
                1] is not None else False
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
            gt_classes_list = labels[
                fields.InputDataFields.groundtruth_classes]
            gt_masks_list = None
            if fields.InputDataFields.groundtruth_instance_masks in labels:
                gt_masks_list = labels[
                    fields.InputDataFields.groundtruth_instance_masks]
            gt_keypoints_list = None
            if fields.InputDataFields.groundtruth_keypoints in labels:
                gt_keypoints_list = labels[
                    fields.InputDataFields.groundtruth_keypoints]
            gt_weights_list = None
            if fields.InputDataFields.groundtruth_weights in labels:
                gt_weights_list = labels[
                    fields.InputDataFields.groundtruth_weights]
            if fields.InputDataFields.groundtruth_is_crowd in labels:
                gt_is_crowd_list = labels[
                    fields.InputDataFields.groundtruth_is_crowd]
            detection_model.provide_groundtruth(
                groundtruth_boxes_list=gt_boxes_list,
                groundtruth_classes_list=gt_classes_list,
                groundtruth_masks_list=gt_masks_list,
                groundtruth_keypoints_list=gt_keypoints_list,
                groundtruth_weights_list=gt_weights_list,
                groundtruth_is_crowd_list=gt_is_crowd_list)

        preprocessed_images = features[fields.InputDataFields.image]
        prediction_dict = detection_model.predict(
            preprocessed_images,
            features[fields.InputDataFields.true_image_shape])
        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            detections = detection_model.postprocess(
                prediction_dict,
                features[fields.InputDataFields.true_image_shape])

        if mode == tf.estimator.ModeKeys.TRAIN:
            if train_config.fine_tune_checkpoint and hparams.load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            losses_dict = detection_model.loss(
                prediction_dict,
                features[fields.InputDataFields.true_image_shape])
            losses = [loss_tensor for loss_tensor in losses_dict.values()]
            if train_config.add_regularization_loss:
                regularization_losses = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)
                if regularization_losses:
                    regularization_loss = tf.add_n(regularization_losses,
                                                   name='regularization_loss')
                    losses.append(regularization_loss)
                    losses_dict[
                        'Loss/regularization_loss'] = regularization_loss
            total_loss = tf.add_n(losses, name='total_loss')
            losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
                    training_optimizer)

            # Optionally freeze some layers by setting their gradients to be zero.
            trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = tf.contrib.framework.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
            summaries = [] if use_tpu else None
            train_op = tf.contrib.layers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(detections)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic)
            use_original_images = fields.InputDataFields.original_image in features
            eval_images = (features[fields.InputDataFields.original_image]
                           if use_original_images else
                           features[fields.InputDataFields.image])
            eval_dict = eval_util.result_dict_for_single_example(
                eval_images[0:1],
                features[inputs.HASH_KEY][0],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True)

            if class_agnostic:
                category_index = label_map_util.create_class_agnostic_category_index(
                )
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            img_summary = None
            if not use_tpu and use_original_images:
                detection_and_groundtruth = (
                    vis_utils.draw_side_by_side_evaluation_image(
                        eval_dict,
                        category_index,
                        max_boxes_to_draw=20,
                        min_score_thresh=0.2,
                        use_normalized_coordinates=False))
                img_summary = tf.summary.image(
                    'Detections_Left_Groundtruth_Right',
                    detection_and_groundtruth)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, category_index.values(), eval_dict)
            for loss_key, loss_tensor in iter(losses_dict.items()):
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if img_summary is not None:
                eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
                    img_summary, tf.no_op())
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
示例#30
0
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                print(FLAGS.train_dir, name)
                tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name), overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(
        model_builder.build,
        model_config=model_config,
        is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(
        create_input_dict_fn,
        model_fn,
        train_config,
        master,
        task,
        FLAGS.num_clones,
        worker_replicas,
        FLAGS.clone_on_cpu,
        ps_tasks,
        worker_job_name,
        is_chief,
        FLAGS.train_dir,
        graph_hook_fn=graph_rewriter_fn)
示例#31
0
def main(train_dir, pipeline_config_path, train_config_path="", input_config_path="",
         model_config_path="", master="", task=0, num_clones=1, 
         clone_on_cpu=False, worker_replicas=1, ps_tasks=0):
  """
  DEFINE_string('master', '', 'Name of the TensorFlow master to use.')
  DEFINE_integer('task', 0, 'task id')
  DEFINE_integer('num_clones', 1, 'Number of clones to deploy per worker.')
  DEFINE_boolean('clone_on_cpu', False,
                       'Force clones to be deployed on CPU.  Note that even if '
                       'set to False (allowing ops to run on gpu), some ops may '
                       'still be run on the CPU if they have no GPU kernel.')
  DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer '
                       'replicas.')
  DEFINE_integer('ps_tasks', 0,
                       'Number of parameter server tasks. If None, does not use '
                       'a parameter server.')
  DEFINE_string('train_dir', '',
                      'Directory to save the checkpoints and training summaries.')

  DEFINE_string('pipeline_config_path', '',
                      'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
                      'file. If provided, other configs are ignored')

  DEFINE_string('train_config_path', '',
                      'Path to a train_pb2.TrainConfig config file.')
  DEFINE_string('input_config_path', '',
                      'Path to an input_reader_pb2.InputReader config file.')
  DEFINE_string('model_config_path', '',
                      'Path to a model_pb2.DetectionModel config file.')
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  if task == 0: tf.gfile.MakeDirs(train_dir)
  if pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        pipeline_config_path)
    if task == 0:
      tf.gfile.Copy(pipeline_config_path,
                    os.path.join(train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=model_config_path,
        train_config_path=train_config_path,
        train_input_config_path=input_config_path)
    if task == 0:
      for name, config in [('model.config', model_config_path),
                           ('train.config', train_config_path),
                           ('input.config', input_config_path)]:
        tf.gfile.Copy(config, os.path.join(train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  print("\n\n\n\n\nMADE IT HERE\n\n\n\n\n\n")

  trainer.train(
      create_input_dict_fn,
      model_fn,
      train_config,
      master,
      task,
      num_clones,
      worker_replicas,
      clone_on_cpu,
      ps_tasks,
      worker_job_name,
      is_chief,
      train_dir,
      graph_hook_fn=graph_rewriter_fn)
  print("MADE IT THERE")
示例#32
0
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)  # tf.gfile模块创建一个目录
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path
        )  #读取pipeline_config_path配置文件,返回一个dict,保存配置文件中`model`, `train_config`,
        #`train_input_config`, `eval_config`, `eval_input_config`信息
        if FLAGS.task == 0:
            tf.gfile.Copy(
                FLAGS.pipeline_config_path,
                os.path.join(FLAGS.train_dir, 'pipeline.config'),
                overwrite=True
            )  #把pipeline_config_path配置文件复制到train_dir目录下,命名为pipeline.config
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path
        )  #读取model_config_path、train_config_path、train_input_config_path的路径
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config,
                              os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']
    """"
  以下这行代码为核心代码,通过传入部分所需要的参数并且 “重新定义” 函数名称。这样简化函数,更少更灵活的函数参数调用。 
  通过functools.partial函数对model_builder.build函数赋予默认值,该目录下有一个model_builder模块,包含了生成网络模型的代码,
  包含ssd,fast_rcnn等众多模型代码,部分代码如下所示
  def build(model_config, is_training):
      if not isinstance(model_config, model_pb2.DetectionModel):
          raise ValueError('model_config not of type model_pb2.DetectionModel.')
      # 获取配置中的模型种类
      meta_architecture = model_config.WhichOneof('model')
      # 进行具体加载
      if meta_architecture == 'ssd':
          return _build_ssd_model(model_config.ssd, is_training)
      if meta_architecture == 'faster_rcnn':
          return _build_faster_rcnn_model(model_config.faster_rcnn, is_training)
      raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))
      以'faster_rcnn模型为例子,进入_build_faster_rcnn_model(仍在model_builder.py文件中),该类中定义了fast_rcnn所有的参数
      之后说明每一个子模型的构建,比如image_resizer_builder的构建
      """

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    #第二阶段中的参数配置
    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)
    #python解码JSON对象
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  FLAGS.num_clones,
                  worker_replicas,
                  FLAGS.clone_on_cpu,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)
示例#33
0
def export_tflite_graph(pipeline_config,
                        trained_checkpoint_prefix,
                        output_dir,
                        add_postprocessing_op,
                        max_detections,
                        max_classes_per_detection,
                        detections_per_class=100,
                        use_regular_nms=False):
  """Exports a tflite compatible graph and anchors for ssd detection model.

  Anchors are written to a tensor and tflite compatible graph
  is written to output_dir/tflite_graph.pb.

  Args:
    pipeline_config: a pipeline.proto object containing the configuration for
      SSD model to export.
    trained_checkpoint_prefix: a file prefix for the checkpoint containing the
      trained parameters of the SSD model.
    output_dir: A directory to write the tflite graph and anchor file to.
    add_postprocessing_op: If add_postprocessing_op is true: frozen graph adds a
      TFLite_Detection_PostProcess custom op
    max_detections: Maximum number of detections (boxes) to show
    max_classes_per_detection: Number of classes to display per detection
    detections_per_class: In regular NonMaxSuppression, number of anchors used
    for NonMaxSuppression per class
    use_regular_nms: Flag to set postprocessing op to use Regular NMS instead
      of Fast NMS.

  Raises:
    ValueError: if the pipeline config contains models other than ssd or uses an
      fixed_shape_resizer and provides a shape as well.
  """
  tf.gfile.MakeDirs(output_dir)
  if pipeline_config.model.WhichOneof('model') != 'ssd':
    raise ValueError('Only ssd models are supported in tflite. '
                     'Found {} in config'.format(
                         pipeline_config.model.WhichOneof('model')))

  num_classes = pipeline_config.model.ssd.num_classes
  nms_score_threshold = {
      pipeline_config.model.ssd.post_processing.batch_non_max_suppression.
      score_threshold
  }
  nms_iou_threshold = {
      pipeline_config.model.ssd.post_processing.batch_non_max_suppression.
      iou_threshold
  }
  scale_values = {}
  scale_values['y_scale'] = {
      pipeline_config.model.ssd.box_coder.faster_rcnn_box_coder.y_scale
  }
  scale_values['x_scale'] = {
      pipeline_config.model.ssd.box_coder.faster_rcnn_box_coder.x_scale
  }
  scale_values['h_scale'] = {
      pipeline_config.model.ssd.box_coder.faster_rcnn_box_coder.height_scale
  }
  scale_values['w_scale'] = {
      pipeline_config.model.ssd.box_coder.faster_rcnn_box_coder.width_scale
  }

  image_resizer_config = pipeline_config.model.ssd.image_resizer
  image_resizer = image_resizer_config.WhichOneof('image_resizer_oneof')
  num_channels = _DEFAULT_NUM_CHANNELS
  if image_resizer == 'fixed_shape_resizer':
    height = image_resizer_config.fixed_shape_resizer.height
    width = image_resizer_config.fixed_shape_resizer.width
    if image_resizer_config.fixed_shape_resizer.convert_to_grayscale:
      num_channels = 1
    shape = [1, height, width, num_channels]
  else:
    raise ValueError(
        'Only fixed_shape_resizer'
        'is supported with tflite. Found {}'.format(
            image_resizer_config.WhichOneof('image_resizer_oneof')))

  image = tf.placeholder(
      tf.float32, shape=shape, name='normalized_input_image_tensor')

  detection_model = model_builder.build(
      pipeline_config.model, is_training=False)
  predicted_tensors = detection_model.predict(image, true_image_shapes=None)
  # The score conversion occurs before the post-processing custom op
  _, score_conversion_fn = post_processing_builder.build(
      pipeline_config.model.ssd.post_processing)
  class_predictions = score_conversion_fn(
      predicted_tensors['class_predictions_with_background'])

  with tf.name_scope('raw_outputs'):
    # 'raw_outputs/box_encodings': a float32 tensor of shape [1, num_anchors, 4]
    #  containing the encoded box predictions. Note that these are raw
    #  predictions and no Non-Max suppression is applied on them and
    #  no decode center size boxes is applied to them.
    tf.identity(predicted_tensors['box_encodings'], name='box_encodings')
    # 'raw_outputs/class_predictions': a float32 tensor of shape
    #  [1, num_anchors, num_classes] containing the class scores for each anchor
    #  after applying score conversion.
    tf.identity(class_predictions, name='class_predictions')
  # 'anchors': a float32 tensor of shape
  #   [4, num_anchors] containing the anchors as a constant node.
  tf.identity(
      get_const_center_size_encoded_anchors(predicted_tensors['anchors']),
      name='anchors')

  # Add global step to the graph, so we know the training step number when we
  # evaluate the model.
  tf.train.get_or_create_global_step()

  # graph rewriter
  is_quantized = pipeline_config.HasField('graph_rewriter')
  if is_quantized:
    graph_rewriter_config = pipeline_config.graph_rewriter
    graph_rewriter_fn = graph_rewriter_builder.build(
        graph_rewriter_config, is_training=False)
    graph_rewriter_fn()

  if pipeline_config.model.ssd.feature_extractor.HasField('fpn'):
    exporter.rewrite_nn_resize_op(is_quantized)

  # freeze the graph
  saver_kwargs = {}
  if pipeline_config.eval_config.use_moving_averages:
    saver_kwargs['write_version'] = saver_pb2.SaverDef.V1
    moving_average_checkpoint = tempfile.NamedTemporaryFile()
    exporter.replace_variable_values_with_moving_averages(
        tf.get_default_graph(), trained_checkpoint_prefix,
        moving_average_checkpoint.name)
    checkpoint_to_use = moving_average_checkpoint.name
  else:
    checkpoint_to_use = trained_checkpoint_prefix

  saver = tf.train.Saver(**saver_kwargs)
  input_saver_def = saver.as_saver_def()
  frozen_graph_def = exporter.freeze_graph_with_def_protos(
      input_graph_def=tf.get_default_graph().as_graph_def(),
      input_saver_def=input_saver_def,
      input_checkpoint=checkpoint_to_use,
      output_node_names=','.join([
          'raw_outputs/box_encodings', 'raw_outputs/class_predictions',
          'anchors'
      ]),
      restore_op_name='save/restore_all',
      filename_tensor_name='save/Const:0',
      clear_devices=True,
      output_graph='',
      initializer_nodes='')

  # Add new operation to do post processing in a custom op (TF Lite only)
  if add_postprocessing_op:
    transformed_graph_def = append_postprocessing_op(
        frozen_graph_def, max_detections, max_classes_per_detection,
        nms_score_threshold, nms_iou_threshold, num_classes, scale_values,
        detections_per_class, use_regular_nms)
  else:
    # Return frozen without adding post-processing custom op
    transformed_graph_def = frozen_graph_def

  binary_graph = os.path.join(output_dir, 'tflite_graph.pb')
  with tf.gfile.GFile(binary_graph, 'wb') as f:
    f.write(transformed_graph_def.SerializeToString())
  txt_graph = os.path.join(output_dir, 'tflite_graph.pbtxt')
  with tf.gfile.GFile(txt_graph, 'w') as f:
    f.write(str(transformed_graph_def))
示例#34
0
  def model_fn(features, labels, mode, params=None):
    """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
    params = params or {}
    total_loss, train_op, detections, export_outputs = None, None, None, None
    is_training = mode == tf.estimator.ModeKeys.TRAIN
    detection_model = detection_model_fn(is_training=is_training,
                                         add_summaries=(not use_tpu))
    scaffold_fn = None

    if mode == tf.estimator.ModeKeys.TRAIN:
      labels = unstack_batch(
          labels,
          unpad_groundtruth_tensors=train_config.unpad_groundtruth_tensors)
    elif mode == tf.estimator.ModeKeys.EVAL:
      # For evaling on train data, it is necessary to check whether groundtruth
      # must be unpadded.
      boxes_shape = (
          labels[fields.InputDataFields.groundtruth_boxes].get_shape()
          .as_list())
      unpad_groundtruth_tensors = True if boxes_shape[1] is not None else False
      labels = unstack_batch(
          labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      gt_boxes_list = labels[fields.InputDataFields.groundtruth_boxes]
      gt_classes_list = labels[fields.InputDataFields.groundtruth_classes]
      gt_masks_list = None
      if fields.InputDataFields.groundtruth_instance_masks in labels:
        gt_masks_list = labels[
            fields.InputDataFields.groundtruth_instance_masks]
      gt_keypoints_list = None
      if fields.InputDataFields.groundtruth_keypoints in labels:
        gt_keypoints_list = labels[fields.InputDataFields.groundtruth_keypoints]
      if fields.InputDataFields.groundtruth_is_crowd in labels:
        gt_is_crowd_list = labels[fields.InputDataFields.groundtruth_is_crowd]
      detection_model.provide_groundtruth(
          groundtruth_boxes_list=gt_boxes_list,
          groundtruth_classes_list=gt_classes_list,
          groundtruth_masks_list=gt_masks_list,
          groundtruth_keypoints_list=gt_keypoints_list,
          groundtruth_weights_list=labels[
              fields.InputDataFields.groundtruth_weights],
          groundtruth_is_crowd_list=gt_is_crowd_list)

    preprocessed_images = features[fields.InputDataFields.image]
    prediction_dict = detection_model.predict(
        preprocessed_images, features[fields.InputDataFields.true_image_shape])
    detections = detection_model.postprocess(
        prediction_dict, features[fields.InputDataFields.true_image_shape])

    if mode == tf.estimator.ModeKeys.TRAIN:
      if train_config.fine_tune_checkpoint and hparams.load_pretrained:
        if not train_config.fine_tune_checkpoint_type:
          # train_config.from_detection_checkpoint field is deprecated. For
          # backward compatibility, set train_config.fine_tune_checkpoint_type
          # based on train_config.from_detection_checkpoint.
          if train_config.from_detection_checkpoint:
            train_config.fine_tune_checkpoint_type = 'detection'
          else:
            train_config.fine_tune_checkpoint_type = 'classification'
        asg_map = detection_model.restore_map(
            fine_tune_checkpoint_type=train_config.fine_tune_checkpoint_type,
            load_all_detection_checkpoint_vars=(
                train_config.load_all_detection_checkpoint_vars))
        available_var_map = (
            variables_helper.get_variables_available_in_checkpoint(
                asg_map, train_config.fine_tune_checkpoint,
                include_global_step=False))
        if use_tpu:
          def tpu_scaffold():
            tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                          available_var_map)
            return tf.train.Scaffold()
          scaffold_fn = tpu_scaffold
        else:
          tf.train.init_from_checkpoint(train_config.fine_tune_checkpoint,
                                        available_var_map)

    if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
      losses_dict = detection_model.loss(
          prediction_dict, features[fields.InputDataFields.true_image_shape])
      losses = [loss_tensor for loss_tensor in losses_dict.itervalues()]
      if train_config.add_regularization_loss:
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        if regularization_losses:
          regularization_loss = tf.add_n(regularization_losses,
                                         name='regularization_loss')
          losses.append(regularization_loss)
          losses_dict['Loss/regularization_loss'] = regularization_loss
      total_loss = tf.add_n(losses, name='total_loss')
      losses_dict['Loss/total_loss'] = total_loss

      if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=is_training)
        graph_rewriter_fn()

      # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
      # can write learning rate summaries on TPU without host calls.
      global_step = tf.train.get_or_create_global_step()
      training_optimizer, optimizer_summary_vars = optimizer_builder.build(
          train_config.optimizer)

    if mode == tf.estimator.ModeKeys.TRAIN:
      if use_tpu:
        training_optimizer = tf.contrib.tpu.CrossShardOptimizer(
            training_optimizer)

      # Optionally freeze some layers by setting their gradients to be zero.
      trainable_variables = None
      if train_config.freeze_variables:
        trainable_variables = tf.contrib.framework.filter_variables(
            tf.trainable_variables(),
            exclude_patterns=train_config.freeze_variables)

      clip_gradients_value = None
      if train_config.gradient_clipping_by_norm > 0:
        clip_gradients_value = train_config.gradient_clipping_by_norm

      if not use_tpu:
        for var in optimizer_summary_vars:
          tf.summary.scalar(var.op.name, var)
      summaries = [] if use_tpu else None
      train_op = tf.contrib.layers.optimize_loss(
          loss=total_loss,
          global_step=global_step,
          learning_rate=None,
          clip_gradients=clip_gradients_value,
          optimizer=training_optimizer,
          variables=trainable_variables,
          summaries=summaries,
          name='')  # Preventing scope prefix on all variables.

    if mode == tf.estimator.ModeKeys.PREDICT:
      export_outputs = {
          tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
              tf.estimator.export.PredictOutput(detections)
      }

    eval_metric_ops = None
    scaffold = None
    if mode == tf.estimator.ModeKeys.EVAL:
      class_agnostic = (fields.DetectionResultFields.detection_classes
                        not in detections)
      groundtruth = _prepare_groundtruth_for_eval(
          detection_model, class_agnostic)
      use_original_images = fields.InputDataFields.original_image in features
      eval_images = (
          features[fields.InputDataFields.original_image] if use_original_images
          else features[fields.InputDataFields.image])
      eval_dict = eval_util.result_dict_for_single_example(
          eval_images[0:1],
          features[inputs.HASH_KEY][0],
          detections,
          groundtruth,
          class_agnostic=class_agnostic,
          scale_to_absolute=True)

      if class_agnostic:
        category_index = label_map_util.create_class_agnostic_category_index()
      else:
        category_index = label_map_util.create_category_index_from_labelmap(
            eval_input_config.label_map_path)
      img_summary = None
      if not use_tpu and use_original_images:
        detection_and_groundtruth = (
            vis_utils.draw_side_by_side_evaluation_image(
                eval_dict, category_index, max_boxes_to_draw=20,
                min_score_thresh=0.2,
                use_normalized_coordinates=False))
        img_summary = tf.summary.image('Detections_Left_Groundtruth_Right',
                                       detection_and_groundtruth)

      # Eval metrics on a single example.
      eval_metrics = eval_config.metrics_set
      if not eval_metrics:
        eval_metrics = ['coco_detection_metrics']
      eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
          eval_metrics,
          category_index.values(),
          eval_dict,
          include_metrics_per_category=eval_config.include_metrics_per_category)
      for loss_key, loss_tensor in iter(losses_dict.items()):
        eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
      for var in optimizer_summary_vars:
        eval_metric_ops[var.op.name] = (var, tf.no_op())
      if img_summary is not None:
        eval_metric_ops['Detections_Left_Groundtruth_Right'] = (
            img_summary, tf.no_op())
      eval_metric_ops = {str(k): v for k, v in eval_metric_ops.iteritems()}

      if eval_config.use_moving_averages:
        variable_averages = tf.train.ExponentialMovingAverage(0.0)
        variables_to_restore = variable_averages.variables_to_restore()
        keep_checkpoint_every_n_hours = (
            train_config.keep_checkpoint_every_n_hours)
        saver = tf.train.Saver(
            variables_to_restore,
            keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours)
        scaffold = tf.train.Scaffold(saver=saver)

    if use_tpu:
      return tf.contrib.tpu.TPUEstimatorSpec(
          mode=mode,
          scaffold_fn=scaffold_fn,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metrics=eval_metric_ops,
          export_outputs=export_outputs)
    else:
      return tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=detections,
          loss=total_loss,
          train_op=train_op,
          eval_metric_ops=eval_metric_ops,
          export_outputs=export_outputs,
          scaffold=scaffold)
示例#35
0
    def model_fn(features, labels, mode, params=None):
        """Constructs the object detection model.

    Args:
      features: Dictionary of feature tensors, returned from `input_fn`.
      labels: Dictionary of groundtruth tensors if mode is TRAIN or EVAL,
        otherwise None.
      mode: Mode key from tf.estimator.ModeKeys.
      params: Parameter dictionary passed from the estimator.

    Returns:
      An `EstimatorSpec` that encapsulates the model and its serving
        configurations.
    """
        params = params or {}
        total_loss, train_op, detections, export_outputs = None, None, None, None
        is_training = mode == tf.estimator.ModeKeys.TRAIN

        # Make sure to set the Keras learning phase. True during training,
        # False for inference.
        tf.keras.backend.set_learning_phase(is_training)
        # Set policy for mixed-precision training with Keras-based models.
        if use_tpu and train_config.use_bfloat16:
            from tensorflow.python.keras.engine import base_layer_utils  # pylint: disable=g-import-not-at-top
            # Enable v2 behavior, as `mixed_bfloat16` is only supported in TF 2.0.
            base_layer_utils.enable_v2_dtype_behavior()
            tf2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')
        detection_model = detection_model_fn(is_training=is_training,
                                             add_summaries=(not use_tpu))
        scaffold_fn = None

        if mode == tf.estimator.ModeKeys.TRAIN:
            labels = unstack_batch(labels,
                                   unpad_groundtruth_tensors=train_config.
                                   unpad_groundtruth_tensors)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # For evaling on train data, it is necessary to check whether groundtruth
            # must be unpadded.
            boxes_shape = (labels[fields.InputDataFields.groundtruth_boxes].
                           get_shape().as_list())
            unpad_groundtruth_tensors = boxes_shape[
                1] is not None and not use_tpu
            labels = unstack_batch(
                labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            provide_groundtruth(detection_model, labels)

        preprocessed_images = features[fields.InputDataFields.image]

        side_inputs = detection_model.get_side_inputs(features)

        if use_tpu and train_config.use_bfloat16:
            with tf.tpu.bfloat16_scope():
                prediction_dict = detection_model.predict(
                    preprocessed_images,
                    features[fields.InputDataFields.true_image_shape],
                    **side_inputs)
                prediction_dict = ops.bfloat16_to_float32_nested(
                    prediction_dict)
        else:
            prediction_dict = detection_model.predict(
                preprocessed_images,
                features[fields.InputDataFields.true_image_shape],
                **side_inputs)

        def postprocess_wrapper(args):
            return detection_model.postprocess(args[0], args[1])

        if mode in (tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT):
            if use_tpu and postprocess_on_cpu:
                detections = tf.tpu.outside_compilation(
                    postprocess_wrapper,
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))
            else:
                detections = postprocess_wrapper(
                    (prediction_dict,
                     features[fields.InputDataFields.true_image_shape]))

        if mode == tf.estimator.ModeKeys.TRAIN:
            load_pretrained = hparams.load_pretrained if hparams else False
            if train_config.fine_tune_checkpoint and load_pretrained:
                if not train_config.fine_tune_checkpoint_type:
                    # train_config.from_detection_checkpoint field is deprecated. For
                    # backward compatibility, set train_config.fine_tune_checkpoint_type
                    # based on train_config.from_detection_checkpoint.
                    if train_config.from_detection_checkpoint:
                        train_config.fine_tune_checkpoint_type = 'detection'
                    else:
                        train_config.fine_tune_checkpoint_type = 'classification'
                asg_map = detection_model.restore_map(
                    fine_tune_checkpoint_type=train_config.
                    fine_tune_checkpoint_type,
                    load_all_detection_checkpoint_vars=(
                        train_config.load_all_detection_checkpoint_vars))
                available_var_map = (
                    variables_helper.get_variables_available_in_checkpoint(
                        asg_map,
                        train_config.fine_tune_checkpoint,
                        include_global_step=False))
                if use_tpu:

                    def tpu_scaffold():
                        tf.train.init_from_checkpoint(
                            train_config.fine_tune_checkpoint,
                            available_var_map)
                        return tf.train.Scaffold()

                    scaffold_fn = tpu_scaffold
                else:
                    tf.train.init_from_checkpoint(
                        train_config.fine_tune_checkpoint, available_var_map)

        if mode in (tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.EVAL):
            if (mode == tf.estimator.ModeKeys.EVAL
                    and eval_config.use_dummy_loss_in_eval):
                total_loss = tf.constant(1.0)
                losses_dict = {'Loss/total_loss': total_loss}
            else:
                losses_dict = detection_model.loss(
                    prediction_dict,
                    features[fields.InputDataFields.true_image_shape])
                losses = [loss_tensor for loss_tensor in losses_dict.values()]
                if train_config.add_regularization_loss:
                    regularization_losses = detection_model.regularization_losses(
                    )
                    if use_tpu and train_config.use_bfloat16:
                        regularization_losses = ops.bfloat16_to_float32_nested(
                            regularization_losses)
                    if regularization_losses:
                        regularization_loss = tf.add_n(
                            regularization_losses, name='regularization_loss')
                        losses.append(regularization_loss)
                        losses_dict[
                            'Loss/regularization_loss'] = regularization_loss
                total_loss = tf.add_n(losses, name='total_loss')
                losses_dict['Loss/total_loss'] = total_loss

            if 'graph_rewriter_config' in configs:
                graph_rewriter_fn = graph_rewriter_builder.build(
                    configs['graph_rewriter_config'], is_training=is_training)
                graph_rewriter_fn()

            # TODO(rathodv): Stop creating optimizer summary vars in EVAL mode once we
            # can write learning rate summaries on TPU without host calls.
            global_step = tf.train.get_or_create_global_step()
            training_optimizer, optimizer_summary_vars = optimizer_builder.build(
                train_config.optimizer)

        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                training_optimizer = tf.tpu.CrossShardOptimizer(
                    training_optimizer)

            # Optionally freeze some layers by setting their gradients to be zero.
            trainable_variables = None
            include_variables = (train_config.update_trainable_variables
                                 if train_config.update_trainable_variables
                                 else None)
            exclude_variables = (train_config.freeze_variables
                                 if train_config.freeze_variables else None)
            trainable_variables = slim.filter_variables(
                tf.trainable_variables(),
                include_patterns=include_variables,
                exclude_patterns=exclude_variables)

            clip_gradients_value = None
            if train_config.gradient_clipping_by_norm > 0:
                clip_gradients_value = train_config.gradient_clipping_by_norm

            if not use_tpu:
                for var in optimizer_summary_vars:
                    tf.summary.scalar(var.op.name, var)
            summaries = [] if use_tpu else None
            if train_config.summarize_gradients:
                summaries = [
                    'gradients', 'gradient_norm', 'global_gradient_norm'
                ]
            train_op = slim.optimizers.optimize_loss(
                loss=total_loss,
                global_step=global_step,
                learning_rate=None,
                clip_gradients=clip_gradients_value,
                optimizer=training_optimizer,
                update_ops=detection_model.updates(),
                variables=trainable_variables,
                summaries=summaries,
                name='')  # Preventing scope prefix on all variables.

        if mode == tf.estimator.ModeKeys.PREDICT:
            exported_output = exporter_lib.add_output_tensor_nodes(detections)
            export_outputs = {
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME:
                tf.estimator.export.PredictOutput(exported_output)
            }

        eval_metric_ops = None
        scaffold = None
        if mode == tf.estimator.ModeKeys.EVAL:
            class_agnostic = (fields.DetectionResultFields.detection_classes
                              not in detections)
            groundtruth = _prepare_groundtruth_for_eval(
                detection_model, class_agnostic,
                eval_input_config.max_number_of_boxes)
            use_original_images = fields.InputDataFields.original_image in features
            if use_original_images:
                eval_images = features[fields.InputDataFields.original_image]
                true_image_shapes = tf.slice(
                    features[fields.InputDataFields.true_image_shape], [0, 0],
                    [-1, 3])
                original_image_spatial_shapes = features[
                    fields.InputDataFields.original_image_spatial_shape]
            else:
                eval_images = features[fields.InputDataFields.image]
                true_image_shapes = None
                original_image_spatial_shapes = None

            eval_dict = eval_util.result_dict_for_batched_example(
                eval_images,
                features[inputs.HASH_KEY],
                detections,
                groundtruth,
                class_agnostic=class_agnostic,
                scale_to_absolute=True,
                original_image_spatial_shapes=original_image_spatial_shapes,
                true_image_shapes=true_image_shapes)

            if fields.InputDataFields.image_additional_channels in features:
                eval_dict[fields.InputDataFields.
                          image_additional_channels] = features[
                              fields.InputDataFields.image_additional_channels]

            if class_agnostic:
                category_index = label_map_util.create_class_agnostic_category_index(
                )
            else:
                category_index = label_map_util.create_category_index_from_labelmap(
                    eval_input_config.label_map_path)
            vis_metric_ops = None
            if not use_tpu and use_original_images:
                keypoint_edges = [(kp.start, kp.end)
                                  for kp in eval_config.keypoint_edge]

                eval_metric_op_vis = vis_utils.VisualizeSingleFrameDetections(
                    category_index,
                    max_examples_to_draw=eval_config.num_visualizations,
                    max_boxes_to_draw=eval_config.max_num_boxes_to_visualize,
                    min_score_thresh=eval_config.min_score_threshold,
                    use_normalized_coordinates=False,
                    keypoint_edges=keypoint_edges or None)
                vis_metric_ops = eval_metric_op_vis.get_estimator_eval_metric_ops(
                    eval_dict)

            # Eval metrics on a single example.
            eval_metric_ops = eval_util.get_eval_metric_ops_for_evaluators(
                eval_config, list(category_index.values()), eval_dict)
            for loss_key, loss_tensor in iter(losses_dict.items()):
                eval_metric_ops[loss_key] = tf.metrics.mean(loss_tensor)
            for var in optimizer_summary_vars:
                eval_metric_ops[var.op.name] = (var, tf.no_op())
            if vis_metric_ops is not None:
                eval_metric_ops.update(vis_metric_ops)
            eval_metric_ops = {str(k): v for k, v in eval_metric_ops.items()}

            if eval_config.use_moving_averages:
                variable_averages = tf.train.ExponentialMovingAverage(0.0)
                variables_to_restore = variable_averages.variables_to_restore()
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    variables_to_restore,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours
                )
                scaffold = tf.train.Scaffold(saver=saver)

        # EVAL executes on CPU, so use regular non-TPU EstimatorSpec.
        if use_tpu and mode != tf.estimator.ModeKeys.EVAL:
            return tf.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                scaffold_fn=scaffold_fn,
                predictions=detections,
                loss=total_loss,
                train_op=train_op,
                eval_metrics=eval_metric_ops,
                export_outputs=export_outputs)
        else:
            if scaffold is None:
                keep_checkpoint_every_n_hours = (
                    train_config.keep_checkpoint_every_n_hours)
                saver = tf.train.Saver(
                    sharded=True,
                    keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours,
                    save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                scaffold = tf.train.Scaffold(saver=saver)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              predictions=detections,
                                              loss=total_loss,
                                              train_op=train_op,
                                              eval_metric_ops=eval_metric_ops,
                                              export_outputs=export_outputs,
                                              scaffold=scaffold)
示例#36
0
def get_train_config(task,
                     ps_tasks,
                     train_dir,
                     pipeline_config_path,
                     train_config_path,
                     model_config_path,
                     input_CONFIG_PATH,
                     worker_replicas,
                     master):
    """Set variables for training.
    """
    
    # Create the folder where to store the models
    if task == 0: tf.gfile.MakeDirs(train_dir)

    if pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
       pipeline_config_path)

        if task == 0:
            tf.gfile.Copy(pipeline_config_path,
                        os.path.join(train_dir, 'pipeline.config'),
                        overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
        model_config_path=model_config_path,
        train_config_path=train_config_path,
        train_input_config_path=input_CONFIG_PATH)

        if TASK == 0:
          for name, config in [('model.config', model_config_path),
                               ('train.config', train_config_path),
                               ('input.config', input_CONFIG_PATH)]:
            tf.gfile.Copy(config, os.path.join(train_dir, name),
                          overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)
    
    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    worker_job_name = 'lonely_worker'
    is_chief = True

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1

    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')
        
    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)
        
    return(create_input_dict_fn,
           model_fn,
           train_config,
           master,
           task,
           worker_job_name,
           ps_tasks,
           worker_replicas,
           is_chief,
           graph_rewriter_fn)