Example No. 1
  def test_save_pipeline_config(self):
    """Tests that the pipeline config is properly saved to disk."""
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    pipeline_config.model.faster_rcnn.num_classes = 10
    pipeline_config.train_config.batch_size = 32
    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
    pipeline_config.eval_config.num_examples = 20
    pipeline_config.eval_input_reader.queue_capacity = 100

    config_util.save_pipeline_config(pipeline_config, self.get_temp_dir())
    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(self.get_temp_dir(), "pipeline.config"))
    pipeline_config_reconstructed = (
        config_util.create_pipeline_proto_from_configs(configs))

    self.assertEqual(pipeline_config, pipeline_config_reconstructed)
Example No. 2
  def test_create_pipeline_proto_from_configs(self):
    """Tests that proto can be reconstructed from configs dictionary."""
    pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")

    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    pipeline_config.model.faster_rcnn.num_classes = 10
    pipeline_config.train_config.batch_size = 32
    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
    pipeline_config.eval_config.num_examples = 20
    pipeline_config.eval_input_reader.queue_capacity = 100
    _write_config(pipeline_config, pipeline_config_path)

    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    pipeline_config_reconstructed = (
        config_util.create_pipeline_proto_from_configs(configs))
    self.assertEqual(pipeline_config, pipeline_config_reconstructed)
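The `_write_config` helper used above is not shown in this example; a minimal sketch of what it plausibly does, assuming it simply serializes the proto as a text proto to the given path (the TF1-style `tf.gfile.Open` call is an assumption):

from google.protobuf import text_format
import tensorflow as tf

def _write_config(config, config_path):
  """Assumed helper: writes a config proto to disk as a text proto."""
  config_text = text_format.MessageToString(config)
  with tf.gfile.Open(config_path, "wb") as f:
    f.write(config_text)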
Example No. 3
    def test_save_pipeline_config(self):
        """Tests that the pipeline config is properly saved to disk."""
        pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
        pipeline_config.model.faster_rcnn.num_classes = 10
        pipeline_config.train_config.batch_size = 32
        pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
        pipeline_config.eval_config.num_examples = 20
        pipeline_config.eval_input_reader.add().queue_capacity = 100

        config_util.save_pipeline_config(pipeline_config, self.get_temp_dir())
        configs = config_util.get_configs_from_pipeline_file(
            os.path.join(self.get_temp_dir(), "pipeline.config"))
        pipeline_config_reconstructed = (
            config_util.create_pipeline_proto_from_configs(configs))

        self.assertEqual(pipeline_config, pipeline_config_reconstructed)
Example No. 4
  def test_create_pipeline_proto_from_configs(self):
    """Tests that proto can be reconstructed from configs dictionary."""
    pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")

    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    pipeline_config.model.faster_rcnn.num_classes = 10
    pipeline_config.train_config.batch_size = 32
    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
    pipeline_config.eval_config.num_examples = 20
    pipeline_config.eval_input_reader.add().queue_capacity = 100
    _write_config(pipeline_config, pipeline_config_path)

    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    pipeline_config_reconstructed = (
        config_util.create_pipeline_proto_from_configs(configs))
    self.assertEqual(pipeline_config, pipeline_config_reconstructed)
Example No. 5
def create_model(pipeline_config_path, output_directory, checkpoint_path):
    tf.keras.backend.clear_session()

    print('Building model and restoring weights for fine-tuning...',
          flush=True)
    num_classes = 1
    output_checkpoint_dir = os.path.join(output_directory, 'checkpoint')
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    model_config = configs['model']
    model_config.ssd.num_classes = num_classes
    model_config.ssd.freeze_batchnorm = True
    detection_model = model_builder.build(model_config=model_config,
                                          is_training=True)
    pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_proto, output_directory)

    latest_checkpoint_number = int(checkpoint_path.split('-')[-1])
    print(latest_checkpoint_number)
    if latest_checkpoint_number == 0:
        fake_box_predictor = tf.compat.v2.train.Checkpoint(
            _base_tower_layers_for_heads=detection_model._box_predictor.
            _base_tower_layers_for_heads,
            # _prediction_heads=detection_model._box_predictor._prediction_heads,
            #    (i.e., the classification head that we *will not* restore)
            _box_prediction_head=detection_model._box_predictor.
            _box_prediction_head,
        )
        fake_model = tf.compat.v2.train.Checkpoint(
            _feature_extractor=detection_model._feature_extractor,
            _box_predictor=fake_box_predictor)
        ckpt = tf.compat.v2.train.Checkpoint(model=fake_model)
        ckpt.restore(checkpoint_path).expect_partial()

    exported_ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
    ckpt_manager = tf.train.CheckpointManager(exported_ckpt,
                                              output_checkpoint_dir,
                                              max_to_keep=1)
    if latest_checkpoint_number > 0:
        status = exported_ckpt.restore(ckpt_manager.latest_checkpoint)

    image, shapes = detection_model.preprocess(tf.zeros([1, 320, 320, 3]))
    prediction_dict = detection_model.predict(image, shapes)
    _ = detection_model.postprocess(prediction_dict, shapes)
    print('Weights restored!')
    return detection_model, pipeline_proto, ckpt_manager
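A hedged usage sketch of the function above; the paths below are placeholders rather than values from the original:

# Placeholder paths; the checkpoint suffix '-0' triggers the partial-restore branch.
detection_model, pipeline_proto, ckpt_manager = create_model(
    pipeline_config_path='models/ssd/pipeline.config',
    output_directory='output_model',
    checkpoint_path='pretrained/checkpoint/ckpt-0')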
Example No. 6
    def __enter__(self):
        try:
            self._checkpoint_model_folder = ModelZoo.download_model(self._model_name)
            self._checkpoint_model_pipeline_file = ModelZoo.download_pipeline(self._model_name)
            # load pipeline
            if self._pipeline_file.exists():
                configs = get_configs_from_pipeline_file(str(self._pipeline_file))  # load config as a dict
            else:
                configs = get_configs_from_pipeline_file(
                    str(self._checkpoint_model_pipeline_file))  # load config as a dict
            self._pipeline = create_pipeline_proto_from_configs(configs)  # convert to a protobuffer
            # load dataset
            self._load_dataset()
            self._set_config_paths()

            return self
        except Exception as ex:
            raise Exception("Error loading the model : {}".format(ex)) from ex
Example No. 7
def create_pipeline_proto_from_configs(configs):
  """Creates a pipeline_pb2.TrainEvalPipelineConfig from configs dictionary.

  This function nearly performs the inverse operation of
  get_configs_from_pipeline_file(). Instead of returning a file path, it returns
  a `TrainEvalPipelineConfig` object.

  Args:
    configs: Dictionary of configs. See get_configs_from_pipeline_file().

  Returns:
    A fully populated pipeline_pb2.TrainEvalPipelineConfig.
  """
  pipeline_config = config_util.create_pipeline_proto_from_configs(configs)
  if "lstm_model" in configs:
    pipeline_config.Extensions[internal_pipeline_pb2.lstm_model].CopyFrom(
        configs["lstm_model"])
  return pipeline_config
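A short round-trip sketch of how this wrapper is typically used; it assumes the lstm_object_detection variant of get_configs_from_pipeline_file, which also populates the `lstm_model` entry:

# Assumed usage: the lstm_model extension survives the dict round trip.
configs = get_configs_from_pipeline_file('lstm_ssd_pipeline.config')
pipeline_proto = create_pipeline_proto_from_configs(configs)
assert pipeline_proto.HasExtension(internal_pipeline_pb2.lstm_model)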
Example No. 8
def create_pipeline_proto_from_configs(configs):
  """Creates a pipeline_pb2.TrainEvalPipelineConfig from configs dictionary.

  This function nearly performs the inverse operation of
  get_configs_from_pipeline_file(). Instead of returning a file path, it returns
  a `TrainEvalPipelineConfig` object.

  Args:
    configs: Dictionary of configs. See get_configs_from_pipeline_file().

  Returns:
    A fully populated pipeline_pb2.TrainEvalPipelineConfig.
  """
  pipeline_config = config_util.create_pipeline_proto_from_configs(configs)
  if "lstm_model" in configs:
    pipeline_config.Extensions[internal_pipeline_pb2.lstm_model].CopyFrom(
        configs["lstm_model"])
  return pipeline_config
Example No. 9
    def get_configuration_content(self, network_info: NetworkInformation) -> str:
        try:
            network_path: str = os.path.join(self.path.weights_dir, network_info.network_architecture,
                                             "pipeline.config")
            config_file_content: Dict[str, str] = get_configs_from_pipeline_file(network_path)
            checkpoint_path = os.path.join(self.path.weights_dir, network_info.network_architecture,
                                           'checkpoint/ckpt-0')
            content: Dict[str, str] = self._adjust_configuration_content(config_file_content=config_file_content,
                                                                         network_path=checkpoint_path)

            # returning the proto dict directly causes an error, so we save the file and read it back with a Python reader
            pipeline_config = create_pipeline_proto_from_configs(content)
            save_pipeline_config(pipeline_config, "/tmp/")
            content_str: str = open("/tmp/pipeline.config", "r").read()

            return content_str

        except Exception as e:
            raise ConfigurationPipelineNotFound(additional_message=e.__str__(), pipeline_path=network_path)
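If only the serialized text is needed, the temporary-file round trip above can be avoided; a minimal alternative sketch, assuming `from google.protobuf import text_format` is available:

from google.protobuf import text_format

# Serialize the reconstructed proto directly to a string instead of
# saving it under /tmp and reading the file back.
pipeline_config = create_pipeline_proto_from_configs(content)
content_str: str = text_format.MessageToString(pipeline_config)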
Example No. 10
 def _set_config_paths(self):
     configs = create_configs_from_pipeline_proto(self.pipeline)
     update_input_reader_config(configs,
                                key_name="train_input_config",
                                input_name=None,
                                field_name="input_path",
                                value=str(self._train_record_file),
                                path_updater=_update_tf_record_input_path)
     update_input_reader_config(configs,
                                key_name="eval_input_configs",
                                input_name=None,
                                field_name="input_path",
                                value=str(self._val_record_file),
                                path_updater=_update_tf_record_input_path)
     update_dict = {
         "label_map_path": str(self._labels_map_file),
         "train_config.fine_tune_checkpoint": str(self._checkpoint_model_folder.joinpath("model.ckpt"))
     }
     configs = merge_external_params_with_configs(configs, kwargs_dict=update_dict)
     self._pipeline = create_pipeline_proto_from_configs(configs)
Example No. 11
def override_pipeline_configs(config_file, overrides, out_dir=""):
    configs = config_util.get_configs_from_pipeline_file(config_file)

    configs['train_config'].from_detection_checkpoint = True
    configs['eval_config'].num_examples = 25000

    for field, value in overrides.items():
        if field == "num_classes":
            set_number_of_classes(configs['model'], value)
        elif field == "width_height":
            set_resizer_width_height(configs['model'], value[0], value[1])
        elif not config_util._maybe_update_config_with_key_value(
                configs, field, value):
            try:
                config_util._update_generic(configs, field, value)
            except ValueError as ex:
                if field == "train_config.fine_tune_checkpoint":
                    configs['train_config'].fine_tune_checkpoint = value
                else:
                    raise

    config_util.save_pipeline_config(
        config_util.create_pipeline_proto_from_configs(configs), out_dir)
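The helpers `set_number_of_classes` and `set_resizer_width_height` are referenced but not shown; a hypothetical sketch of the first, assuming it dispatches on whichever meta-architecture is set in the model config:

def set_number_of_classes(model_config, num_classes):
    # Hypothetical helper: num_classes lives under the meta-architecture
    # that is actually populated in the DetectionModel oneof.
    meta_architecture = model_config.WhichOneof("model")
    if meta_architecture == "ssd":
        model_config.ssd.num_classes = num_classes
    elif meta_architecture == "faster_rcnn":
        model_config.faster_rcnn.num_classes = num_classes
    else:
        raise ValueError("Unsupported meta-architecture: {}".format(meta_architecture))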
Example No. 12
    def build_config_str(self):
        if self.config.nclasses is None:
            raise RequiredConfigMissingError('nclasses must be configured')
        if self.config.record_train_path is None:
            raise RequiredConfigMissingError(
                'record_train_path must be configured')
        if self.config.record_eval_path is None:
            raise RequiredConfigMissingError(
                'record_eval_path must be configured')
        if self.config.labelmap_path is None:
            raise RequiredConfigMissingError(
                'labelmap_path must be configured')
        if self.config.checkpoint_path is None:
            raise RequiredConfigMissingError(
                'checkpoint_path must be configured')
        if self.config.use_checkpoint is None:
            raise RequiredConfigMissingError(
                'use_checkpoint must be configured')

        # TODO: implement augmentation options

        proto = config_util.create_pipeline_proto_from_configs(
            self.pipeline_config)
        return text_format.MessageToString(proto)
Example No. 13
def main(_):
    arch_details = arch_map[FLAGS.architecture]
    # check graph type, download graph
    graph_url = arch_details['url']
    graph_path = '/models/research/object_detection/data/'
    maybe_download_and_extract(graph_url, graph_path)
    # Open config file
    config_path = os.path.join(
        '/models/research/object_detection/samples/configs',
        arch_details['config'])
    configs = config_util.get_configs_from_pipeline_file(config_path)
    # Update paths in config
    hparams = tf.contrib.training.HParams(
        label_map_path=FLAGS.label_map_path,
        train_input_path=os.path.join(FLAGS.data_dir, 'train.record'),
        eval_input_path=os.path.join(FLAGS.data_dir, 'val.record'))

    if FLAGS.hparams:
        for key, val in json.loads(FLAGS.hparams).iteritems():
            hparams.add_hparam(key, val)

    config_util.merge_external_params_with_configs(configs, hparams)
    # Save config inside dataset

    configs["train_config"].fine_tune_checkpoint = os.path.join(
        graph_path, arch_details['checkpoint'], 'model.ckpt')

    config_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_str = text_format.MessageToString(config_proto)

    experiment_path = os.path.join(FLAGS.data_dir, FLAGS.experiment_id)
    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    with open(os.path.join(experiment_path, 'pipeline.config'),
              'w') as config_file:
        config_file.write(config_str)
Example No. 14
    def patch_pipeline_config(self, model_base_name):
        self.label_map_path = os.path.join(self.src_train_path,
                                           "label_map.pbtxt")

        model_base_dir_path = os.path.join(self.path_perm_storage,
                                           "model_base_checkpoints",
                                           model_base_name)
        config_path = os.path.join(model_base_dir_path, "pipeline.config")

        cf_dict = config_util.get_configs_from_pipeline_file(config_path)

        cf_dict["model"].ssd.num_classes = self.num_classes

        cf_dict["train_config"].fine_tune_checkpoint = os.path.join(
            model_base_dir_path, "ckpt-0")
        cf_dict["train_config"].batch_size = self.batch_size
        cf_dict["train_config"].use_bfloat16 = False

        cf_dict["train_input_config"].label_map_path = self.label_map_path
        cf_dict[
            "train_input_config"].tf_record_input_reader.input_path[:] = self.scan_dir_for_records(
                DatasetType.training.name)

        cf_dict["eval_input_config"].label_map_path = self.label_map_path
        cf_dict[
            "eval_input_config"].tf_record_input_reader.input_path[:] = self.scan_dir_for_records(
                DatasetType.evaluation.name)

        cf_obj = config_util.create_pipeline_proto_from_configs(cf_dict)
        tmp_config_path = os.path.join(self.path_perm_storage,
                                       "patched_config")
        config_util.save_pipeline_config(cf_obj, tmp_config_path)
        self.patched_config_path = os.path.join(tmp_config_path,
                                                "pipeline.config")
        print("Source configuration was patched: {0}".format(
            self.patched_config_path))
Example No. 15
def edit_config(model_selected,
                config_output_dir,
                num_steps,
                label_map_path,
                record_dir,
                eval_number,
                annotation_type,
                batch_size=None,
                learning_rate=None,
                resizer_size=None):
    '''
        Wrapper to edit the essential values inside the base configuration protobuf file provided with an object-detection/segmentation checkpoint.
        This configuration file entirely defines your model, pre-processing, training, evaluation, etc. Together with the checkpoint file it is the most important file of a model and should never be deleted.
        This is why it is saved in almost every working directory, both for redundancy and to make sure the exact config used at each step is preserved.
        Advanced users who want to dig deeper into the configuration file should read the proto definitions inside the proto directory of the object-detection API.

        Args: 
            Required:
                model_selected: The checkpoint you want to resume from.
                config_output_dir: The path where you want to save your edited protobuf configuration file.
                num_steps: The number of steps you want to train on.
                label_map_path: The path to your label_map.pbtxt file.
                record_dir: The path to the directory where your TFRecord files are saved.
                eval_number: The number of images you want to evaluate on.
                annotation_type: Should be either "rectangle" or "polygon", depending on how you annotated your images.

            Optional:
                batch_size: The batch size you want to use. If not provided, the previous one is kept.
                learning_rate: The learning rate you want to use for the training. If not provided, the previous one is kept.
                                Please see config_util._update_initial_learning_rate() inside the object_detection folder for in-depth details on what happens when updating it.
                resizer_size: The shape used to update your image resizer. Please see set_image_resizer() for more details on this. If not provided, the previous one is kept.

    '''

    file_list = os.listdir(model_selected)
    ckpt_ids = []
    for p in file_list:
        if "index" in p:
            if "-" in p:
                ckpt_ids.append(int(p.split('-')[1].split('.')[0]))
    if len(ckpt_ids) > 0:
        ckpt_path = os.path.join(model_selected,
                                 "model.ckpt-{}".format(str(max(ckpt_ids))))

    else:
        ckpt_path = os.path.join(model_selected, "model.ckpt")

    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(model_selected, 'pipeline.config'))
    label_map = label_map_util.load_labelmap(label_map_path)

    config_util._update_train_steps(configs, num_steps)
    update_different_paths(
        configs,
        ckpt_path=ckpt_path,
        label_map_path=label_map_path,
        train_record_path=os.path.join(record_dir, "train.record"),
        eval_record_path=os.path.join(record_dir, "eval.record"))

    if learning_rate is not None:
        config_util._update_initial_learning_rate(configs, learning_rate)

    if batch_size is not None:
        config_util._update_batch_size(configs, batch_size)

    if annotation_type == "polygon":
        edit_masks(configs, mask_type="PNG_MASKS")

    if resizer_size is not None:
        set_image_resizer(configs, resizer_size)

    edit_eval_config(configs, annotation_type, eval_number)
    update_num_classes(configs, label_map)
    config_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(config_proto, directory=config_output_dir)
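A hedged example of calling the wrapper above; all paths and values are placeholders:

edit_config(model_selected='checkpoints/faster_rcnn_resnet50',
            config_output_dir='training/config',
            num_steps=50000,
            label_map_path='data/label_map.pbtxt',
            record_dir='data/records',
            eval_number=200,
            annotation_type='rectangle',
            batch_size=8)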
Example No. 16
def populate_experiment(run_config,
                        hparams,
                        pipeline_config_path,
                        train_steps=None,
                        eval_steps=None,
                        model_fn_creator=create_model_fn,
                        **kwargs):
  """Populates an `Experiment` object.

  Args:
    run_config: A `RunConfig`.
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    train_steps: Number of training steps. If None, the number of training steps
      is set from the `TrainConfig` proto.
    eval_steps: Number of evaluation steps per evaluation cycle. If None, the
      number of evaluation steps is set from the `EvalConfig` proto.
    model_fn_creator: A function that creates a `model_fn` for `Estimator`.
      Follows the signature:

      * Args:
        * `detection_model_fn`: Function that returns `DetectionModel` instance.
        * `configs`: Dictionary of pipeline config objects.
        * `hparams`: `HParams` object.
      * Returns:
        `model_fn` for `Estimator`.

    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    An `Experiment` that defines all aspects of training, evaluation, and
    export.
  """
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs,
      hparams,
      train_steps=train_steps,
      eval_steps=eval_steps,
      **kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_config = configs['eval_input_config']

  if train_steps is None:
    train_steps = train_config.num_steps if train_config.num_steps else None

  if eval_steps is None:
    eval_steps = eval_config.num_examples if eval_config.num_examples else None

  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)

  # Create the input functions for TRAIN/EVAL.
  train_input_fn = inputs.create_train_input_fn(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config)
  eval_input_fn = inputs.create_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=eval_input_config,
      model_config=model_config)

  export_strategies = [
      tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
          serving_input_fn=inputs.create_predict_input_fn(
              model_config=model_config))
  ]

  estimator = tf.estimator.Estimator(
      model_fn=model_fn_creator(detection_model_fn, configs, hparams),
      config=run_config)

  if run_config.is_chief:
    # Store the final pipeline config for traceability.
    pipeline_config_final = config_util.create_pipeline_proto_from_configs(
        configs)
    pipeline_config_final_path = os.path.join(estimator.model_dir,
                                              'pipeline.config')
    config_text = text_format.MessageToString(pipeline_config_final)
    with tf.gfile.Open(pipeline_config_final_path, 'wb') as f:
      tf.logging.info('Writing as-run pipeline config file to %s',
                      pipeline_config_final_path)
      f.write(config_text)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      train_steps=train_steps,
      eval_steps=eval_steps,
      export_strategies=export_strategies,
      eval_delay_secs=120,)
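A hedged TF1-era usage sketch of the function above; the values are placeholders and `model_hparams.create_hparams` is assumed to be importable from the Object Detection API:

run_config = tf.estimator.RunConfig(model_dir='training/')
hparams = model_hparams.create_hparams(None)
experiment = populate_experiment(run_config, hparams,
                                 pipeline_config_path='pipeline.config',
                                 train_steps=20000, eval_steps=100)
experiment.train_and_evaluate()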
Example No. 17
import tensorflow as tf

# Assuming object detection API is available for use
from object_detection.utils.config_util import create_pipeline_proto_from_configs
from object_detection.utils.config_util import get_configs_from_pipeline_file
import export

# Configuration for model to be exported
config_pathname = 'training/faster_rcnn_inception_v2_pets.config'

# Input checkpoint for the model to be exported
# Path to the directory which consists of the saved model on disk (see above)
trained_model_dir = '/home/rice/PycharmProjects/mytensorflow_sample/MODEL/'

# Create proto from model configuration
configs = get_configs_from_pipeline_file(config_pathname)
pipeline_proto = create_pipeline_proto_from_configs(configs=configs)

# Read .ckpt and .meta files from model directory
checkpoint = tf.train.get_checkpoint_state(trained_model_dir)
input_checkpoint = checkpoint.model_checkpoint_path

# Model Version
model_version_id = '1'

# Output Directory
output_directory = '/home/rice/tensorflow1/models/research/object_detection/' + str(model_version_id)

# Export model for serving
export.export_inference_graph(input_type='image_tensor',pipeline_config=pipeline_proto,trained_checkpoint_prefix=input_checkpoint,output_directory=output_directory)
Example No. 18
    def start_training(self):
        """Start training for the model"""
        worker_replicas = 1
        ps_tasks = 0
        clone_on_cpu = False
        num_clones = 1

        ensure_path(config.BASE_MODELS_PATH)
        train_dir = self.train_dir
        model_json_path = os.path.join(train_dir, 'job.json')

        job = self.job
        num_steps = int(job['steps'])

        try:
            if config.DEBUG:
                num_steps = 50
        except AttributeError:
            pass
        except Exception as e:
            _LOGGER.error(e)

        job = api.update_job_state(job, 'training', 'Start training for {} steps'.format(num_steps))

        model = self.model
        ensure_path(config.EXPORTED_MODELS)
        model_graph = os.path.join(config.EXPORTED_MODELS, '{}.pb'.format(model['file_name']))

        if not os.path.exists(os.path.join(train_dir, 'checkpoint')):  # New training started
            _LOGGER.debug("Checkpoints don't exist")

            base_checkpoints_path = os.path.join(config.BASE_MODELS_PATH, model['architecture'])
            _tmf = os.path.join(config.TRAINED_MODELS_DATA, model['file_name'])
            if os.path.isdir(_tmf):
                _LOGGER.debug("Model already exists as %s" % model_graph)
                base_checkpoints_path = _tmf
            elif model['type'] == 'new':
                _LOGGER.debug("model type new")
            else:
                _LOGGER.debug("New model from parent model")
                parent_model = api.get_model(model['parent'])
                if not parent_model:
                    raise Exception('Parent model not found on server')

                parent_tmf = os.path.join(config.TRAINED_MODELS_DATA, parent_model['file_name'])
                if os.path.isdir(parent_tmf):
                    base_checkpoints_path = parent_tmf
                else:
                    _LOGGER.error("Parent model not found. please train it first")
                    return False

            if not os.path.exists(os.path.join(base_checkpoints_path, 'model.ckpt.meta')):
                _LOGGER.debug("Base model not found for %s, Downloading now." % model['architecture'])
                _f = api.download_model_files(model['architecture'])

                tmp_model_data = os.path.join(config.DATA_DIR, 'tmp_model_data')
                if tarfile.is_tarfile(_f):
                    if os.path.exists(tmp_model_data):
                        shutil.rmtree(tmp_model_data)
                    ensure_path(tmp_model_data)
                    print("Tar file found")
                    shutil.unpack_archive(_f, tmp_model_data)
                    for root, dirs, files in os.walk(tmp_model_data):
                        for file in files:
                            if 'model.ckpt' in file:
                                path = os.path.join(root, file)
                                # print(path)
                                ensure_path(base_checkpoints_path)
                                shutil.copy(path, os.path.join(base_checkpoints_path, file))
                else:
                    _LOGGER.error("Invalid file")
                    return False
            if os.path.exists(train_dir):
                shutil.rmtree(train_dir)
            shutil.copytree(base_checkpoints_path, train_dir)
            if os.path.exists(os.path.join(train_dir, 'checkpoint')):
                os.remove(os.path.join(train_dir, 'checkpoint'))

        if os.path.exists(os.path.join(train_dir, 'data')):
            shutil.rmtree(os.path.join(train_dir, 'data'))
        shutil.copytree(self.data_dir, os.path.join(train_dir, 'data'))

        counts = {'train': 0, 'test': 1000, 'classes': 1}
        stats_file = os.path.join(train_dir, "data", "stats.json")
        try:
            with open(stats_file) as _f:
                counts = json.load(_f)
        except:
            pass

        pipeline_config_path = os.path.join(train_dir, 'pipeline.config')
        if not os.path.exists(pipeline_config_path):
            pipeline_config_path = os.path.join(self.configs_dir, "{}.config".format(model['architecture']))
        task = '0'
        if task == '0':
            tf.gfile.MakeDirs(train_dir)
        if pipeline_config_path:
            _LOGGER.info("Pipeline config file : {}".format(pipeline_config_path))
            configs = config_util.get_configs_from_pipeline_file(
                pipeline_config_path)
            if task == '0':
                tf.gfile.Copy(pipeline_config_path,
                              os.path.join(train_dir, 'pipeline.config'),
                              overwrite=True)
        else:
            _LOGGER.error("No config found")
            return False

        pipeline_config_path = os.path.join(train_dir, 'pipeline.config')

        # with open(model_json_path, 'w') as mf:
        #     json.dump(job, mf)

        model_config = configs['model']
        train_config = configs['train_config']
        input_config = configs['train_input_config']


        if model_config.HasField('faster_rcnn'):
            model_config.faster_rcnn.num_classes = counts['classes']

        if model_config.HasField('ssd'):
            model_config.ssd.num_classes = counts['classes']

        # Set num_steps
        train_config.num_steps = num_steps
        train_config.fine_tune_checkpoint = os.path.join(train_dir, 'model.ckpt')

        # Update input config to use updated list of input
        input_config.tf_record_input_reader.ClearField('input_path')
        input_config.tf_record_input_reader.input_path.append(os.path.join(train_dir, 'data', "train_baheads.tfrecord-??????"))
        input_config.label_map_path = os.path.join(train_dir, 'data', "labels.pbtxt")

        eval_config = configs['eval_config']
        eval_input_config = configs['eval_input_config']

        eval_config.num_examples = counts['test']
        eval_config.max_evals = 1

        # Update input config to use updated list of input
        eval_input_config.tf_record_input_reader.ClearField('input_path')
        eval_input_config.tf_record_input_reader.input_path.append(os.path.join(train_dir, 'data', "test_baheads.tfrecord-??????"))
        eval_input_config.label_map_path = os.path.join(train_dir, 'data', "labels.pbtxt")

        # Save the updated config to pipeline file
        config_util.save_pipeline_config(config_util.create_pipeline_proto_from_configs({
            'model': model_config,
            'train_config': train_config,
            'train_input_config': input_config,
            'eval_config': eval_config,
            'eval_input_config': eval_input_config

        }), train_dir)

        model_fn = functools.partial(
            model_builder.build,
            model_config=model_config,
            is_training=True)

        def get_next(config):
            return dataset_builder.make_initializable_iterator(
                dataset_builder.build(config)).get_next()

        create_input_dict_fn = functools.partial(get_next, input_config)

        env = json.loads(os.environ.get('TF_CONFIG', '{}'))
        cluster_data = env.get('cluster', None)
        cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
        task_data = env.get('task', None) or {'type': 'master', 'index': 0}
        task_info = type('TaskSpec', (object,), task_data)

        # Parameters for a single worker.
        ps_tasks = 0
        worker_replicas = 1
        worker_job_name = 'lonely_worker'
        task = 0
        is_chief = True
        master = ''

        if cluster_data and 'worker' in cluster_data:
            # Number of total worker replicas include "worker"s and the "master".
            worker_replicas = len(cluster_data['worker']) + 1
        if cluster_data and 'ps' in cluster_data:
            ps_tasks = len(cluster_data['ps'])

        if worker_replicas > 1 and ps_tasks < 1:
            raise ValueError('At least 1 ps task is needed for distributed training.')

        if worker_replicas >= 1 and ps_tasks > 0:
            # Set up distributed training.
            server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                     job_name=task_info.type,
                                     task_index=task_info.index)
            if task_info.type == 'ps':
                server.join()
                return

            worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
            task = task_info.index
            is_chief = (task_info.type == 'master')
            master = server.target

        graph_rewriter_fn = None
        if 'graph_rewriter_config' in configs:
            graph_rewriter_fn = graph_rewriter_builder.build(
                configs['graph_rewriter_config'], is_training=True)

        if not os.path.exists(os.path.join(train_dir, 'model.ckpt-{}.meta'.format(num_steps))):
            status_timer = StatusThread(tfh, num_steps, job)
            status_timer.start()
            try:
                trainer.train(
                    create_input_dict_fn,
                    model_fn,
                    train_config,
                    master,
                    task,
                    num_clones,
                    worker_replicas,
                    clone_on_cpu,
                    ps_tasks,
                    worker_job_name,
                    is_chief,
                    train_dir,
                    graph_hook_fn=graph_rewriter_fn)
            except KeyboardInterrupt:
                raise
            finally:
                status_timer.stop()
                if status_timer.is_alive():
                    _LOGGER.info("Waiting for status thread to close")
                    status_timer.join()

        if os.path.exists(os.path.join(train_dir, 'model.ckpt-{}.meta'.format(num_steps))):
            # Training complete. Export model
            _LOGGER.debug("Training complete for %d steps" % num_steps)
            job = api.update_job_state(job, 'training', 'Training complete')
            export_path = os.path.join(config.TRAINED_MODELS_DATA, model['file_name'])
            if os.path.exists(export_path):
                shutil.rmtree(export_path)
            ckpt_path = os.path.join(train_dir, 'model.ckpt-{}'.format(num_steps))
            exporter.export(pipeline_config_path, export_path, ckpt_path)

            frozen_graph = os.path.join(export_path, 'frozen_inference_graph.pb')

            if os.path.exists(frozen_graph):  # Successfully exported
                shutil.copy(frozen_graph, model_graph)
                shutil.copy(
                    os.path.join(train_dir, 'data', "labels.pbtxt"),
                    os.path.join(config.EXPORTED_MODELS, '{}.pbtxt'.format(model['file_name']))
                )
                # TODO: Eval the trained graph, Push the result to server.
                eval_dir = 'eval_dir'
                tf.reset_default_graph()
                eval_result = run_eval(train_dir, eval_dir, pipeline_config_path, counts['test'])
                if 'PascalBoxes_Precision/mAP@0.5IOU' in eval_result:
                    acc = eval_result['PascalBoxes_Precision/mAP@0.5IOU'] * 100
                    _LOGGER.info("PascalBoxes_Precision/mAP@0.5IOU : %d %%" % (acc))
                    job = api.update_job_state(job, 'complete', 'PascalBoxes_Precision %d %%' % (acc))
                _LOGGER.info(eval_result)
                if os.path.exists(train_dir):
                    shutil.rmtree(train_dir)
                return True

        return False
Example No. 19
 def save_pipeline(pipeline_dict, out_folder):
     pipeline_proto = create_pipeline_proto_from_configs(pipeline_dict)
     save_pipeline_config(pipeline_proto, out_folder)
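A minimal usage sketch, assuming the function is callable directly and that get_configs_from_pipeline_file is imported as in the other examples; it round-trips a pipeline file through the configs dictionary and writes it back out as <out_folder>/pipeline.config:

pipeline_dict = get_configs_from_pipeline_file('pipeline.config')
save_pipeline(pipeline_dict, 'exported_configs')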
Example No. 20
        num_classes,
        'train_config.fine_tune_checkpoint':
        os.path.join(os.getcwd(), 'pretrained_models', model_name,
                     'model.ckpt'),
        'label_map_path':
        label_pbtxt,
        'train_input_path':
        train_record,
        'eval_input_path':
        eval_record,
    }
    # Update the config file
    tf.logging.info("Updating config file {}".format(pipeline_config))
    config = config_util.merge_external_params_with_configs(
        config, kwargs_dict=config_updata)
    config = config_util.create_pipeline_proto_from_configs(config)
    with tf.gfile.Open(pipeline_config, "wb") as f:
        f.write(text_format.MessageToString(config))

# Directory where training results are saved
save_path = os.path.join(os.getcwd(), 'weights', model_name)
if not os.path.exists(save_path):
    os.mkdir(save_path)


def main(unused_argv):
    train_and_eval_dict = model_lib.create_estimator_and_inputs(
        run_config=tf.estimator.RunConfig(model_dir=save_path),
        hparams=model_hparams.create_hparams(None),
        pipeline_config_path=pipeline_config,
        train_steps=None,
Example No. 21
def update_pipeline_config(params, eval_type):
    cfg = config_util.get_configs_from_pipeline_file(
        os.path.join(params.config_mnt, params.config_dir))
    # update num_of_classes
    model_name = os.path.basename(
        os.path.normpath(os.path.join(params.config_mnt,
                                      params.config_dir))).lower()
    print("model name: ", model_name)
    if model_name.startswith("ssd"):
        model_cfg = cfg['model'].ssd
    elif model_name.startswith("faster_rcnn"):
        model_cfg = cfg['model'].faster_rcnn
    else:
        raise ValueError(
            'unknown base model {}, we can only handle ssd or faster_rcnn'.
            format(model_name))

    label_map = os.path.join(params.config_mnt, params.label_dir)
    label_map_dict = label_map_util.get_label_map_dict(label_map)
    num_classes = len(label_map_dict)
    model_cfg.num_classes = num_classes

    # update base_model_dir
    train_cfg = cfg['train_config']
    train_cfg.fine_tune_checkpoint = os.path.join(params.config_mnt,
                                                  params.transfer_learning_dir,
                                                  'model.ckpt')
    eval_cfg = cfg['eval_config']
    eval_cfg.max_evals = 1
    eval_cfg.num_examples = int(params.eval_num_examples)

    # update num_train_steps, label_map_path, train_tfrecords, val_tfrecords, batch size
    print(
        os.path.join(os.path.sep, params.base_mnt, params.source_data_name,
                     'tf_records', 'train.record'))
    hparams = tf.contrib.training.HParams(
        batch_size=int(params.batch_size),
        train_steps=int(params.num_steps),
        label_map_path=label_map,
        train_input_path=os.path.join(os.path.sep, params.base_mnt,
                                      params.source_data_name, 'tf_records',
                                      'train.record'),
        eval_input_path=os.path.join(os.path.sep, params.base_mnt,
                                     params.source_data_name, 'tf_records',
                                     eval_type + '.record'),
    )
    cfg = config_util.merge_external_params_with_configs(cfg, hparams)
    # log metrics
    run_context = Run.get_context()
    run_context.log("Batch Size", int(params.batch_size))
    run_context.log("Training Steps", int(params.num_steps))
    # run.log("Maximum Evaluations",max_evals)

    updated_pipeline_config = config_util.create_pipeline_proto_from_configs(
        cfg)
    print("updated_pipeline_config: ", updated_pipeline_config)
    updated_pipeline_config_file = os.path.join(params.config_mnt,
                                                params.config_dir)
    print("updated_pipeline_config_file: ", updated_pipeline_config_file)
    print("dir name: ",
          os.path.dirname(os.path.join(params.config_mnt, params.config_dir)))
    config_util.save_pipeline_config(
        updated_pipeline_config,
        os.path.join(params.base_mnt, params.source_data_name, 'model_config'))
    return updated_pipeline_config, updated_pipeline_config_file
Example No. 22
import tensorflow as tf

# Assuming object detection API is available for use
from object_detection.utils.config_util import create_pipeline_proto_from_configs
from object_detection.utils.config_util import get_configs_from_pipeline_file
import object_detection.exporter

# Configuration for model to be exported
config_pathname = '/home/stash/projects/aadhar_identification/models/research/object_detection/samples/configs/faster_rcnn_resnet101_pets.config'

# Input checkpoint for the model to be exported
# Path to the directory which consists of the saved model on disk (see above)
trained_model_dir = '/home/stash/projects/aadhar_identification/models/research/exported_graphs_171020181048'
# Create proto from model configuration
configs = get_configs_from_pipeline_file(config_pathname)
pipeline_proto = create_pipeline_proto_from_configs(configs=configs)

# Read .ckpt and .meta files from model directory
checkpoint = tf.train.get_checkpoint_state(trained_model_dir)
input_checkpoint = checkpoint.model_checkpoint_path

# Model Version
model_version_id = 1
# Output Directory
output_directory = '/home/stash/projects/aadhar_identification/models/research/exported_graphs_301020181727' + str(model_version_id)

# Export model for serving
object_detection.exporter.export_inference_graph(input_type='image_tensor',pipeline_config=pipeline_proto,trained_checkpoint_prefix=input_checkpoint,output_directory=output_directory)
Example No. 23
def eval_continuously(
        pipeline_config_path,
        config_override=None,
        train_steps=None,
        sample_1_of_n_eval_examples=1,
        sample_1_of_n_eval_on_train_examples=1,
        use_tpu=False,
        override_eval_num_epochs=True,
        postprocess_on_cpu=False,
        model_dir=None,
        checkpoint_dir=None,
        wait_interval=180,
        timeout=3600,
        eval_index=0,
        save_final_config=False,
        **kwargs):
    """Run continuous evaluation of a detection model eagerly.
    This method builds the model, and continuously restores it from the most
    recent training checkpoint in the checkpoint directory and evaluates it
    on the evaluation data.
    Args:
      pipeline_config_path: A path to a pipeline config file.
      config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
        override the config from `pipeline_config_path`.
      train_steps: Number of training steps. If None, the number of training steps
        is set from the `TrainConfig` proto.
      sample_1_of_n_eval_examples: Integer representing how often an eval example
        should be sampled. If 1, will sample all examples.
      sample_1_of_n_eval_on_train_examples: Similar to
        `sample_1_of_n_eval_examples`, except controls the sampling of training
        data for evaluation.
      use_tpu: Boolean, whether training and evaluation should run on TPU.
      override_eval_num_epochs: Whether to overwrite the number of epochs to 1 for
        eval_input.
      postprocess_on_cpu: When use_tpu and postprocess_on_cpu are true,
        postprocess is scheduled on the host cpu.
      model_dir: Directory to output resulting evaluation summaries to.
      checkpoint_dir: Directory that contains the training checkpoints.
      wait_interval: The minimum number of seconds to wait before checking for a
        new checkpoint.
      timeout: The maximum number of seconds to wait for a checkpoint. Execution
        will terminate if no new checkpoint is found within this many seconds.
      eval_index: int, If given, only evaluate the dataset at the given
        index. By default, evaluates dataset at 0'th index.
      save_final_config: Whether to save the pipeline config file to the model
        directory.
      **kwargs: Additional keyword arguments for configuration override.
    """
    config_override = None
    configs = config_util.get_configs_from_pipeline_file(
        pipeline_config_path, config_override=config_override)

    # get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
    #     'get_configs_from_pipeline_file']
    # create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
    #     'create_pipeline_proto_from_configs']
    # merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
    #     'merge_external_params_with_configs']

    # configs = get_configs_from_pipeline_file(
    #     pipeline_config_path, config_override=config_override)
    kwargs.update({
        'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples,
        'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
    })
    if train_steps is not None:
        kwargs['train_steps'] = train_steps
    if override_eval_num_epochs:
        kwargs.update({'eval_num_epochs': 1})
        # tf.logging.warning(
        #     'Forced number of epochs for all eval validations to be 1.')
    configs = config_util.merge_external_params_with_configs(
        configs, None, kwargs_dict=kwargs)
    if model_dir and save_final_config:
        # tf.logging.info('Saving pipeline config file to directory {}'.format(
        #     model_dir))
        pipeline_config_final = config_util.create_pipeline_proto_from_configs(
            configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    model_config = configs['model']
    train_input_config = configs['train_input_config']
    eval_config = configs['eval_config']
    eval_input_configs = configs['eval_input_configs']
    eval_on_train_input_config = copy.deepcopy(train_input_config)
    eval_on_train_input_config.sample_1_of_n_examples = (
        sample_1_of_n_eval_on_train_examples)
    if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
        # tf.logging.warning('Expected number of evaluation epochs is 1, but '
        #                    'instead encountered `eval_on_train_input_config'
        #                    '.num_epochs` = '
        #                    '{}. Overwriting `num_epochs` to 1.'.format(
        #                        eval_on_train_input_config.num_epochs))
        eval_on_train_input_config.num_epochs = 1

    if kwargs['use_bfloat16']:
        tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')

    eval_input_config = eval_input_configs[eval_index]
    strategy = tf.compat.v2.distribute.get_strategy()
    with strategy.scope():
        detection_model = model_builder.build(
            model_config=model_config, is_training=True)
        # detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base'](
        #     model_config=model_config, is_training=True)

    eval_input = strategy.experimental_distribute_dataset(
        inputs.eval_input(
            eval_config=eval_config,
            eval_input_config=eval_input_config,
            model_config=model_config,
            model=detection_model))

    global_step = tf.compat.v2.Variable(
        0, trainable=False, dtype=tf.compat.v2.dtypes.int64)

    optimizer, _ = optimizer_builder.build(
        configs['train_config'].optimizer, global_step=global_step)

    for latest_checkpoint in tf.train.checkpoints_iterator(
            checkpoint_dir, timeout=timeout, min_interval_secs=wait_interval):
        ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)

        # We run the detection_model on dummy inputs in order to ensure that the
        # model and all its variables have been properly constructed. Specifically,
        # this is currently necessary prior to (potentially) creating shadow copies
        # of the model variables for the EMA optimizer.
        if eval_config.use_moving_averages:
            unpad_groundtruth_tensors = (
                eval_config.batch_size == 1 and not use_tpu)
            _ensure_model_is_built(detection_model, eval_input,
                                   unpad_groundtruth_tensors)
            optimizer.shadow_copy(detection_model)

        ckpt.restore(latest_checkpoint).expect_partial()

        if eval_config.use_moving_averages:
            optimizer.swap_weights()

        summary_writer = tf.compat.v2.summary.create_file_writer(
            os.path.join(model_dir, 'eval', eval_input_config.name))
        with summary_writer.as_default():
            eager_eval_loop(
                detection_model,
                configs,
                eval_input,
                use_tpu=use_tpu,
                postprocess_on_cpu=postprocess_on_cpu,
                global_step=global_step,
            )
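A hedged usage sketch with placeholder paths; it watches `checkpoint_dir` for new checkpoints and writes evaluation summaries under `model_dir`:

eval_continuously(
    pipeline_config_path='pipeline.config',
    model_dir='training',
    checkpoint_dir='training',
    wait_interval=300,
    timeout=5000,
    save_final_config=False)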
Example No. 24
def train_loop(config_path: str,
               model_dir: str,
               config_override: Optional[
                   pipeline_pb2.TrainEvalPipelineConfig] = None,
               train_steps: Optional[int] = None,
               use_tpu: bool = False,
               save_final_config: bool = False,
               log_every_n: int = 100,
               ckpt_every_n: int = 1000,
               ckpt_max_to_keep: int = 7,
               record_summaries: bool = True,
               **kwargs) -> None:
    """Trains a model using eager + functions.
    
    This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside tf.functions.
    7. Checkpoints the model every `ckpt_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.
    
    Args:
        config_path: A path to a pipeline config file.
        model_dir: The directory to save checkpoints and summaries to.
        config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `config_path`.
        train_steps: Number of training steps. If None, training steps from `TrainConfig` proto will be adopted.
        use_tpu: Boolean, whether training and evaluation should run on TPU.
        save_final_config: Whether to save final config (obtained after applying overrides) to `model_dir`.
        log_every_n: Log total loss every n training steps.
        ckpt_every_n: Checkpoint every n training steps.
        ckpt_max_to_keep: int, the number of most recent checkpoints to keep in the model directory.
        record_summaries: Boolean, whether or not to record summaries.
        **kwargs: Additional keyword arguments for configuration override.
    """

    # parse config
    configs = config_util.get_configs_from_pipeline_file(
        config_path, config_override=config_override)
    kwargs.update({
        'train_steps':
        train_steps,
        'use_bfloat16':
        configs['train_config'].use_bfloat16 and use_tpu,
    })
    configs = config_util.merge_external_params_with_configs(
        configs, None, kwargs_dict=kwargs)

    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_gt_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss
    clip_gradient_norm = None

    if train_config.gradient_clipping_by_norm > 0:
        clip_gradient_norm = train_config.gradient_clipping_by_norm

    if kwargs['use_bfloat16']:
        tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

    if train_config.load_all_detection_checkpoint_vars:
        raise ValueError(
            'train_pb2.load_all_detection_checkpoint_vars unsupported in TF2')

    # base checkpoint to fine-tune from
    config_util.update_fine_tune_checkpoint_type(train_config)
    base_ckpt = train_config.fine_tune_checkpoint
    base_ckpt_type = train_config.fine_tune_checkpoint_type
    base_ckpt_ver = train_config.fine_tune_checkpoint_version

    # write the as-run pipeline config to disk
    if save_final_config:
        pipeline_config_final = config_util.create_pipeline_proto_from_configs(
            configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # build model, input, optimizer
    strategy = tf.distribute.get_strategy()
    with strategy.scope():
        # build model
        model = model_builder.build(model_config=model_config,
                                    is_training=True)

        # build input
        def train_dataset_fn(
                input_context: tf.distribute.InputContext) -> tf.data.Dataset:
            """Callable to create train input."""
            train_input = inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=model,
                input_context=input_context,
            )
            train_input = train_input.repeat()

            return train_input

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)

        # build optimizer
        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.int64,
            name='global_step',
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
        )
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    # prepare for training

    # get appropriate filepath (temporary or not) based on whether the worker is the chief
    summary_log_path = get_filepath(strategy, os.path.join(model_dir, 'train'))

    if record_summaries:
        summary_writer = tf.summary.create_file_writer(summary_log_path)
    else:
        summary_writer = tf.summary.create_noop_writer()

    if use_tpu:
        num_steps_per_iteration = 100
    else:
        num_steps_per_iteration = 1

    with summary_writer.as_default():
        with strategy.scope():
            with tf.summary.record_if(
                    lambda: global_step % num_steps_per_iteration == 0):
                # prepare checkpoint manager
                # (do not use manager.latest_checkpoint as manager_dir is not model_dir while running in worker)
                ckpt = tf.train.Checkpoint(model=model,
                                           step=global_step,
                                           optimizer=optimizer)
                ckpt_max_to_keep = ckpt_max_to_keep if strategy.extended.should_checkpoint else 1
                manager_dir = get_filepath(strategy, model_dir)
                manager = tf.train.CheckpointManager(
                    ckpt, manager_dir, max_to_keep=ckpt_max_to_keep)
                latest_ckpt = tf.train.latest_checkpoint(model_dir)

                if latest_ckpt:
                    # load latest checkpoint being trained
                    ckpt.restore(latest_ckpt).expect_partial()
                elif base_ckpt:
                    # load a pre-trained checkpoint
                    load_base_ckpt(model, base_ckpt, base_ckpt_type,
                                   base_ckpt_ver, train_input,
                                   unpad_gt_tensors)

                # get trainable variables
                train_vars = get_train_vars(model, train_config)

                # define training step
                def train_step_fn(features: Dict, labels: Dict):
                    """Single train step."""
                    loss = eager_train_step(
                        model,
                        train_vars,
                        features,
                        labels,
                        unpad_gt_tensors,
                        optimizer,
                        learning_rate=learning_rate_fn(),
                        add_regularization_loss=add_regularization_loss,
                        clip_gradient_norm=clip_gradient_norm,
                        global_step=global_step,
                        num_replicas=strategy.num_replicas_in_sync,
                    )
                    global_step.assign_add(1)

                    return loss

                def _sample_and_train(strategy, train_step_fn, data_iterator):
                    features, labels = data_iterator.next()
                    per_replica_losses = strategy.run(train_step_fn,
                                                      args=(features, labels))

                    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                           per_replica_losses,
                                           axis=None)

                @tf.function
                def _dist_train_step(data_iterator):
                    """A distributed train step."""
                    if num_steps_per_iteration > 1:
                        for _ in tf.range(num_steps_per_iteration - 1):
                            with tf.name_scope(''):
                                _sample_and_train(strategy, train_step_fn,
                                                  data_iterator)

                    return _sample_and_train(strategy, train_step_fn,
                                             data_iterator)

                train_input_iter = iter(train_input)

                # save initialized version of checkpoint
                if int(global_step.value()) == 0:
                    manager.save()

                ckpt_step = int(global_step.value())
                logged_step = global_step.value()

                # proceed with training
                last_step_time = time.time()
                for _ in range(global_step.value(), train_config.num_steps,
                               num_steps_per_iteration):
                    # execute a step (forward pass + backward pass)
                    loss = _dist_train_step(train_input_iter)

                    # log time
                    curr_step = global_step.value()
                    time_taken = time.time() - last_step_time
                    last_step_time = time.time()

                    tf.summary.scalar(
                        'steps_per_sec',
                        num_steps_per_iteration * 1.0 / time_taken,
                        step=global_step,
                    )

                    # log loss
                    if curr_step - logged_step >= log_every_n:
                        step_time = time_taken / num_steps_per_iteration
                        step_msg = 'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                            curr_step, step_time, loss)
                        v1.logging.info(step_msg)
                        logged_step = curr_step

                    # save checkpoint regularly
                    if (curr_step - ckpt_step) >= ckpt_every_n:
                        manager.save()
                        ckpt_step = curr_step

    # remove checkpoint directories of non-chief workers that MultiWorkerMirroredStrategy forces us to save during sync
    # distributed training.
    clean_temporary_directories(strategy, manager_dir)
    clean_temporary_directories(strategy, summary_log_path)
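
The loop above wires the distributed step into checkpointing, summaries, and periodic logging. Stripped of that machinery, the core pattern is a per-replica step executed with strategy.run and aggregated with strategy.reduce inside a tf.function. The sketch below is a minimal, self-contained illustration of that pattern only; the toy Keras model, random dataset, and batch size are assumptions made for demonstration and are not part of the example above.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

# Toy dataset; the global batch size of 8 is split across replicas.
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.random.uniform([64, 4]), tf.random.uniform([64, 1]))).batch(8)
dist_dataset = strategy.experimental_distribute_dataset(dataset)


def train_step(features, labels):
    """Per-replica step: forward pass, loss, backward pass."""
    with tf.GradientTape() as tape:
        predictions = model(features, training=True)
        # Scale by the global batch size so the summed gradients across
        # replicas match a single large-batch update.
        loss = tf.nn.compute_average_loss(
            tf.keras.losses.mse(labels, predictions), global_batch_size=8)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


@tf.function
def dist_train_step(iterator):
    """Runs one step on every replica and sums the per-replica losses."""
    features, labels = next(iterator)
    per_replica_losses = strategy.run(train_step, args=(features, labels))
    return strategy.reduce(
        tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)


iterator = iter(dist_dataset)
for _ in range(4):
    print('loss:', float(dist_train_step(iterator)))
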
Exemplo n.º 25
0
def populate_experiment(run_config,
                        hparams,
                        pipeline_config_path,
                        train_steps=None,
                        eval_steps=None,
                        model_fn_creator=create_model_fn,
                        **kwargs):

  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs,
      hparams,
      train_steps=train_steps,
      eval_steps=eval_steps,
      **kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_config = configs['eval_input_config']

  if train_steps is None:
    train_steps = train_config.num_steps if train_config.num_steps else None

  if eval_steps is None:
    eval_steps = eval_config.num_examples if eval_config.num_examples else None

  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)

  train_input_fn = inputs.create_train_input_fn(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config)
  eval_input_fn = inputs.create_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=eval_input_config,
      model_config=model_config)

  export_strategies = [
      tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
          serving_input_fn=inputs.create_predict_input_fn(
              model_config=model_config))
  ]

  estimator = tf.estimator.Estimator(
      model_fn=model_fn_creator(detection_model_fn, configs, hparams),
      config=run_config)

  if run_config.is_chief:
    pipeline_config_final = config_util.create_pipeline_proto_from_configs(
        configs)
    pipeline_config_final_path = os.path.join(estimator.model_dir,
                                              'pipeline.config')
    config_text = text_format.MessageToString(pipeline_config_final)
    with tf.gfile.Open(pipeline_config_final_path, 'wb') as f:
      tf.logging.info('Writing as-run pipeline config file to %s',
                      pipeline_config_final_path)
      f.write(config_text)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      train_steps=train_steps,
      eval_steps=eval_steps,
      export_strategies=export_strategies,
      eval_delay_secs=120,)
Exemplo n.º 26
0
def populate_experiment(run_config,
                        hparams,
                        pipeline_config_path,
                        train_steps=None,
                        eval_steps=None,
                        model_fn_creator=create_model_fn,
                        **kwargs):
  """Populates an `Experiment` object.

  Args:
    run_config: A `RunConfig`.
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    train_steps: Number of training steps. If None, the number of training steps
      is set from the `TrainConfig` proto.
    eval_steps: Number of evaluation steps per evaluation cycle. If None, the
      number of evaluation steps is set from the `EvalConfig` proto.
    model_fn_creator: A function that creates a `model_fn` for `Estimator`.
      Follows the signature:

      * Args:
        * `detection_model_fn`: Function that returns `DetectionModel` instance.
        * `configs`: Dictionary of pipeline config objects.
        * `hparams`: `HParams` object.
      * Returns:
        `model_fn` for `Estimator`.

    **kwargs: Additional keyword arguments for configuration override.

  Returns:
    An `Experiment` that defines all aspects of training, evaluation, and
    export.
  """
  configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
  configs = config_util.merge_external_params_with_configs(
      configs,
      hparams,
      train_steps=train_steps,
      eval_steps=eval_steps,
      **kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']
  eval_config = configs['eval_config']
  eval_input_config = configs['eval_input_config']

  if train_steps is None:
    train_steps = train_config.num_steps if train_config.num_steps else None

  if eval_steps is None:
    eval_steps = eval_config.num_examples if eval_config.num_examples else None

  detection_model_fn = functools.partial(
      model_builder.build, model_config=model_config)

  # Create the input functions for TRAIN/EVAL.
  train_input_fn = inputs.create_train_input_fn(
      train_config=train_config,
      train_input_config=train_input_config,
      model_config=model_config)
  eval_input_fn = inputs.create_eval_input_fn(
      eval_config=eval_config,
      eval_input_config=eval_input_config,
      model_config=model_config)

  export_strategies = [
      tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
          serving_input_fn=inputs.create_predict_input_fn(
              model_config=model_config))
  ]

  estimator = tf.estimator.Estimator(
      model_fn=model_fn_creator(detection_model_fn, configs, hparams),
      config=run_config)

  if run_config.is_chief:
    # Store the final pipeline config for traceability.
    pipeline_config_final = config_util.create_pipeline_proto_from_configs(
        configs)
    pipeline_config_final_path = os.path.join(estimator.model_dir,
                                              'pipeline.config')
    config_text = text_format.MessageToString(pipeline_config_final)
    with tf.gfile.Open(pipeline_config_final_path, 'wb') as f:
      tf.logging.info('Writing as-run pipeline config file to %s',
                      pipeline_config_final_path)
      f.write(config_text)

  return tf.contrib.learn.Experiment(
      estimator=estimator,
      train_input_fn=train_input_fn,
      eval_input_fn=eval_input_fn,
      train_steps=train_steps,
      eval_steps=eval_steps,
      export_strategies=export_strategies,
      eval_delay_secs=120,)
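
A hypothetical call site for this helper might look like the sketch below. It depends on TF 1.x APIs (tf.contrib.learn was removed in TF 2.x), and the model directory, pipeline path, step counts, and hparams are placeholders invented for illustration, not values from the original.

import tensorflow as tf  # TF 1.x

run_config = tf.estimator.RunConfig(model_dir='/tmp/faster_rcnn_model')
hparams = tf.contrib.training.HParams(load_pretrained=True)

experiment = populate_experiment(
    run_config=run_config,
    hparams=hparams,
    pipeline_config_path='path/to/pipeline.config',
    train_steps=10000,
    eval_steps=100)

# Run the interleaved train/eval schedule defined by the Experiment.
experiment.train_and_evaluate()
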
Exemplo n.º 27
0
def set_config(config_path: Union[str, Path],
               checkpoint_path: Union[str, Path],
               tf_records_train_path: Union[str, Path],
               label_map: Dict[str, int],
               label_map_filepath: Union[str, Path],
               batch_size: int,
               max_box_predictions: int,
               max_number_of_boxes: int,
               fine_tune_checkpoint_type: str = 'detection',
               augment_path: str = None,
               min_dimension: int = None,
               max_dimension: int = None,
               total_steps: int = None,
               warmup_steps: int = None,
               num_steps: int = None):
    logger.info(f"Set configs {config_path}...")

    configs = get_configs_from_pipeline_file(str(config_path))

    train_len = count_tfrecord_examples(str(tf_records_train_path))
    logger.info(f"Train has {train_len} tf_records.")
    num_classes = len(set(label_map.values()))
    _, config_model = configs['model'].ListFields()[0]
    config_model.num_classes = num_classes

    center_net_config = configs['model'].center_net
    center_net_config.object_center_params.max_box_predictions = max_box_predictions
    if min_dimension is not None:
        center_net_config.image_resizer.keep_aspect_ratio_resizer.min_dimension = (
            min_dimension)
    if max_dimension is not None:
        center_net_config.image_resizer.keep_aspect_ratio_resizer.max_dimension = (
            max_dimension)

    configs['train_config'].fine_tune_checkpoint_type = fine_tune_checkpoint_type
    configs['train_config'].fine_tune_checkpoint = str(checkpoint_path)
    configs['train_config'].batch_size = batch_size

    configs['train_config'].max_number_of_boxes = max_number_of_boxes
    cosine_lr_config = (configs['train_config'].optimizer.adam_optimizer
                        .learning_rate.cosine_decay_learning_rate)
    if total_steps is not None:
        cosine_lr_config.total_steps = total_steps
    if warmup_steps is not None:
        cosine_lr_config.warmup_steps = warmup_steps
    if num_steps is not None:
        configs['train_config'].num_steps = num_steps

    if augment_path is not None:
        augment_config = configs['train_config'].data_augmentation_options
        # Clear the existing augmentation options before copying in the new ones.
        del augment_config[:]
        # text_format.Merge expects the proto text itself, not a file path,
        # so read the augmentation config file first.
        with open(str(augment_path)) as f:
            augment = text_format.Merge(f.read(),
                                        pipeline_pb2.TrainEvalPipelineConfig())
        augment_config.extend(augment.train_config.data_augmentation_options)

    label_map_to_file(label_map=label_map, filepath=label_map_filepath)

    def clear_repeated_proto(proto):
        # Popping while iterating skips elements; delete the whole slice instead.
        del proto[:]

    configs['train_input_config'].label_map_path = str(label_map_filepath)
    clear_repeated_proto(
        configs['train_input_config'].tf_record_input_reader.input_path)
    configs['train_input_config'].tf_record_input_reader.input_path.append(
        str(tf_records_train_path))

    pipeline_proto = create_pipeline_proto_from_configs(configs)
    save_pipeline_config(pipeline_proto, str(Path(config_path).parent))
    logger.info(f"Config {config_path} changed")