def test_save_pipeline_config(self):
    """Tests that the pipeline config is properly saved to disk."""
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    pipeline_config.model.faster_rcnn.num_classes = 10
    pipeline_config.train_config.batch_size = 32
    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
    pipeline_config.eval_config.num_examples = 20
    pipeline_config.eval_input_reader.queue_capacity = 100
    config_util.save_pipeline_config(pipeline_config, self.get_temp_dir())
    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(self.get_temp_dir(), "pipeline.config"))
    pipeline_config_reconstructed = (
        config_util.create_pipeline_proto_from_configs(configs))
    self.assertEqual(pipeline_config, pipeline_config_reconstructed)
def test_create_pipeline_proto_from_configs(self):
    """Tests that proto can be reconstructed from configs dictionary."""
    pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    pipeline_config.model.faster_rcnn.num_classes = 10
    pipeline_config.train_config.batch_size = 32
    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
    pipeline_config.eval_config.num_examples = 20
    pipeline_config.eval_input_reader.queue_capacity = 100
    _write_config(pipeline_config, pipeline_config_path)
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    pipeline_config_reconstructed = (
        config_util.create_pipeline_proto_from_configs(configs))
    self.assertEqual(pipeline_config, pipeline_config_reconstructed)
def test_save_pipeline_config(self):
    """Tests that the pipeline config is properly saved to disk."""
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    pipeline_config.model.faster_rcnn.num_classes = 10
    pipeline_config.train_config.batch_size = 32
    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
    pipeline_config.eval_config.num_examples = 20
    # eval_input_reader is a repeated field in newer versions of
    # pipeline.proto, hence add() here rather than the direct assignment used
    # in the older variant of this test above.
    pipeline_config.eval_input_reader.add().queue_capacity = 100
    config_util.save_pipeline_config(pipeline_config, self.get_temp_dir())
    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(self.get_temp_dir(), "pipeline.config"))
    pipeline_config_reconstructed = (
        config_util.create_pipeline_proto_from_configs(configs))
    self.assertEqual(pipeline_config, pipeline_config_reconstructed)
def test_create_pipeline_proto_from_configs(self):
    """Tests that proto can be reconstructed from configs dictionary."""
    pipeline_config_path = os.path.join(self.get_temp_dir(), "pipeline.config")
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    pipeline_config.model.faster_rcnn.num_classes = 10
    pipeline_config.train_config.batch_size = 32
    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
    pipeline_config.eval_config.num_examples = 20
    pipeline_config.eval_input_reader.add().queue_capacity = 100
    _write_config(pipeline_config, pipeline_config_path)
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    pipeline_config_reconstructed = (
        config_util.create_pipeline_proto_from_configs(configs))
    self.assertEqual(pipeline_config, pipeline_config_reconstructed)
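# The test variants above call a module-level `_write_config` helper that is
# not shown in this section. A minimal sketch, assuming it simply serializes
# the proto as a text pbtxt file at the given path:
def _write_config(config, config_path):
    """Writes a pipeline config to disk as a text proto (assumed helper)."""
    config_text = text_format.MessageToString(config)
    with tf.gfile.Open(config_path, "wb") as f:
        f.write(config_text)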
def create_model(pipeline_config_path, output_directory, checkpoint_path):
    tf.keras.backend.clear_session()
    print('Building model and restoring weights for fine-tuning...', flush=True)
    num_classes = 1
    output_checkpoint_dir = os.path.join(output_directory, 'checkpoint')
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    model_config = configs['model']
    model_config.ssd.num_classes = num_classes
    model_config.ssd.freeze_batchnorm = True
    detection_model = model_builder.build(
        model_config=model_config, is_training=True)
    pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_proto, output_directory)
    latest_checkpoint_number = int(checkpoint_path.split('-')[-1])
    print(latest_checkpoint_number)
    if latest_checkpoint_number == 0:
        # Restore only the feature extractor and the box regression head; the
        # classification head is deliberately left out so it can be
        # re-initialized for the new number of classes.
        fake_box_predictor = tf.compat.v2.train.Checkpoint(
            _base_tower_layers_for_heads=detection_model._box_predictor
            ._base_tower_layers_for_heads,
            # _prediction_heads=detection_model._box_predictor._prediction_heads,
            # (i.e., the classification head that we *will not* restore)
            _box_prediction_head=detection_model._box_predictor
            ._box_prediction_head,
        )
        fake_model = tf.compat.v2.train.Checkpoint(
            _feature_extractor=detection_model._feature_extractor,
            _box_predictor=fake_box_predictor)
        ckpt = tf.compat.v2.train.Checkpoint(model=fake_model)
        ckpt.restore(checkpoint_path).expect_partial()
    exported_ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
    ckpt_manager = tf.train.CheckpointManager(
        exported_ckpt, output_checkpoint_dir, max_to_keep=1)
    if latest_checkpoint_number > 0:
        status = exported_ckpt.restore(ckpt_manager.latest_checkpoint)
    # Run the model on dummy inputs so that all variables are created before
    # or after the restore.
    image, shapes = detection_model.preprocess(tf.zeros([1, 320, 320, 3]))
    prediction_dict = detection_model.predict(image, shapes)
    _ = detection_model.postprocess(prediction_dict, shapes)
    print('Weights restored!')
    return detection_model, pipeline_proto, ckpt_manager
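# A hypothetical invocation of create_model above; the paths are placeholders,
# not from the original source:
detection_model, pipeline_proto, ckpt_manager = create_model(
    pipeline_config_path='models/ssd_mobilenet/pipeline.config',
    output_directory='output/fine_tuned',
    checkpoint_path='models/ssd_mobilenet/checkpoint/ckpt-0')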
def __enter__(self):
    try:
        self._checkpoint_model_folder = ModelZoo.download_model(self._model_name)
        self._checkpoint_model_pipeline_file = ModelZoo.download_pipeline(
            self._model_name)
        # load pipeline
        if self._pipeline_file.exists():
            # load config as a dict
            configs = get_configs_from_pipeline_file(str(self._pipeline_file))
        else:
            # load config as a dict
            configs = get_configs_from_pipeline_file(
                str(self._checkpoint_model_pipeline_file))
        # convert to a protobuffer
        self._pipeline = create_pipeline_proto_from_configs(configs)
        # load dataset
        self._load_dataset()
        self._set_config_paths()
        return self
    except Exception as ex:
        raise Exception("Error loading the model: {}".format(ex)) from ex
def create_pipeline_proto_from_configs(configs):
    """Creates a pipeline_pb2.TrainEvalPipelineConfig from configs dictionary.

    This function nearly performs the inverse operation of
    get_configs_from_pipeline_file(). Instead of returning a file path, it
    returns a `TrainEvalPipelineConfig` object.

    Args:
        configs: Dictionary of configs. See get_configs_from_pipeline_file().

    Returns:
        A fully populated pipeline_pb2.TrainEvalPipelineConfig.
    """
    pipeline_config = config_util.create_pipeline_proto_from_configs(configs)
    if "lstm_model" in configs:
        pipeline_config.Extensions[internal_pipeline_pb2.lstm_model].CopyFrom(
            configs["lstm_model"])
    return pipeline_config
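# A minimal round-trip sketch for the LSTM-aware wrapper above; the config
# path is a placeholder, and `get_configs_from_pipeline_file` is assumed to be
# the matching LSTM-aware reader from the same module:
configs = get_configs_from_pipeline_file('path/to/lstm_pipeline.config')
pipeline_proto = create_pipeline_proto_from_configs(configs)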
def get_configuration_content(self, network_info: NetworkInformation) -> str:
    try:
        network_path: str = os.path.join(self.path.weights_dir,
                                         network_info.network_architecture,
                                         "pipeline.config")
        config_file_content: Dict[str, str] = get_configs_from_pipeline_file(
            network_path)
        checkpoint_path = os.path.join(self.path.weights_dir,
                                       network_info.network_architecture,
                                       'checkpoint/ckpt-0')
        content: Dict[str, str] = self._adjust_configuration_content(
            config_file_content=config_file_content,
            network_path=checkpoint_path)
        # Returning the proto dict directly causes errors, so save the config
        # to a file and read it back with a plain Python reader.
        pipeline_config = create_pipeline_proto_from_configs(content)
        save_pipeline_config(pipeline_config, "/tmp/")
        with open("/tmp/pipeline.config", "r") as f:
            content_str: str = f.read()
        return content_str
    except Exception as e:
        raise ConfigurationPipelineNotFound(additional_message=str(e),
                                            pipeline_path=network_path)
def _set_config_paths(self):
    configs = create_configs_from_pipeline_proto(self.pipeline)
    update_input_reader_config(configs,
                               key_name="train_input_config",
                               input_name=None,
                               field_name="input_path",
                               value=str(self._train_record_file),
                               path_updater=_update_tf_record_input_path)
    update_input_reader_config(configs,
                               key_name="eval_input_configs",
                               input_name=None,
                               field_name="input_path",
                               value=str(self._val_record_file),
                               path_updater=_update_tf_record_input_path)
    update_dict = {
        "label_map_path": str(self._labels_map_file),
        "train_config.fine_tune_checkpoint":
            str(self._checkpoint_model_folder.joinpath("model.ckpt")),
    }
    configs = merge_external_params_with_configs(configs,
                                                 kwargs_dict=update_dict)
    self._pipeline = create_pipeline_proto_from_configs(configs)
def override_pipeline_configs(config_file, overrides, out_dir=""):
    configs = config_util.get_configs_from_pipeline_file(config_file)
    configs['train_config'].from_detection_checkpoint = True
    configs['eval_config'].num_examples = 25000
    for field, value in overrides.items():
        if field == "num_classes":
            set_number_of_classes(configs['model'], value)
        elif field == "width_height":
            set_resizer_width_height(configs['model'], value[0], value[1])
        elif not config_util._maybe_update_config_with_key_value(
                configs, field, value):
            try:
                config_util._update_generic(configs, field, value)
            except ValueError:
                if field == "train_config.fine_tune_checkpoint":
                    configs['train_config'].fine_tune_checkpoint = value
                else:
                    raise
    config_util.save_pipeline_config(
        config_util.create_pipeline_proto_from_configs(configs), out_dir)
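# An example override mapping for override_pipeline_configs above; the dotted
# field paths follow config_util's generic-update convention, and all values
# here are hypothetical:
override_pipeline_configs(
    'pipeline.config',
    overrides={
        'num_classes': 3,
        'width_height': (640, 480),
        'train_config.batch_size': 8,
    },
    out_dir='out')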
def build_config_str(self):
    if self.config.nclasses is None:
        raise RequiredConfigMissingError('nclasses must be configured')
    if self.config.record_train_path is None:
        raise RequiredConfigMissingError('record_train_path must be configured')
    if self.config.record_eval_path is None:
        raise RequiredConfigMissingError('record_eval_path must be configured')
    if self.config.labelmap_path is None:
        raise RequiredConfigMissingError('labelmap_path must be configured')
    if self.config.checkpoint_path is None:
        raise RequiredConfigMissingError('checkpoint_path must be configured')
    if self.config.use_checkpoint is None:
        raise RequiredConfigMissingError('use_checkpoint must be configured')
    # TODO: implement augmentation options
    proto = config_util.create_pipeline_proto_from_configs(self.pipeline_config)
    return text_format.MessageToString(proto)
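# A possible caller-side sketch for build_config_str; `builder` and the output
# path are hypothetical:
config_str = builder.build_config_str()
with open('out/pipeline.config', 'w') as f:
    f.write(config_str)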
def main(_):
    arch_details = arch_map[FLAGS.architecture]
    # Check graph type, download graph.
    graph_url = arch_details['url']
    graph_path = '/models/research/object_detection/data/'
    maybe_download_and_extract(graph_url, graph_path)
    # Open config file.
    config_path = os.path.join(
        '/models/research/object_detection/samples/configs',
        arch_details['config'])
    configs = config_util.get_configs_from_pipeline_file(config_path)
    # Update paths in config.
    hparams = tf.contrib.training.HParams(
        label_map_path=FLAGS.label_map_path,
        train_input_path=os.path.join(FLAGS.data_dir, 'train.record'),
        eval_input_path=os.path.join(FLAGS.data_dir, 'val.record'))
    if FLAGS.hparams:
        for key, val in json.loads(FLAGS.hparams).items():
            hparams.add_hparam(key, val)
    configs = config_util.merge_external_params_with_configs(configs, hparams)
    # Save config inside dataset.
    configs["train_config"].fine_tune_checkpoint = os.path.join(
        graph_path, arch_details['checkpoint'], 'model.ckpt')
    config_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_str = text_format.MessageToString(config_proto)
    experiment_path = os.path.join(FLAGS.data_dir, FLAGS.experiment_id)
    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    with open(os.path.join(experiment_path, 'pipeline.config'), 'w') as config_file:
        config_file.write(config_str)
def patch_pipeline_config(self, model_base_name):
    self.label_map_path = os.path.join(self.src_train_path, "label_map.pbtxt")
    model_base_dir_path = os.path.join(self.path_perm_storage,
                                       "model_base_checkpoints",
                                       model_base_name)
    config_path = os.path.join(model_base_dir_path, "pipeline.config")
    cf_dict = config_util.get_configs_from_pipeline_file(config_path)
    cf_dict["model"].ssd.num_classes = self.num_classes
    cf_dict["train_config"].fine_tune_checkpoint = os.path.join(
        model_base_dir_path, "ckpt-0")
    cf_dict["train_config"].batch_size = self.batch_size
    cf_dict["train_config"].use_bfloat16 = False
    cf_dict["train_input_config"].label_map_path = self.label_map_path
    cf_dict["train_input_config"].tf_record_input_reader.input_path[:] = (
        self.scan_dir_for_records(DatasetType.training.name))
    cf_dict["eval_input_config"].label_map_path = self.label_map_path
    cf_dict["eval_input_config"].tf_record_input_reader.input_path[:] = (
        self.scan_dir_for_records(DatasetType.evaluation.name))
    cf_obj = config_util.create_pipeline_proto_from_configs(cf_dict)
    tmp_config_path = os.path.join(self.path_perm_storage, "patched_config")
    config_util.save_pipeline_config(cf_obj, tmp_config_path)
    self.patched_config_path = os.path.join(tmp_config_path, "pipeline.config")
    print("Source configuration was patched: {0}".format(
        self.patched_config_path))
def edit_config(model_selected, config_output_dir, num_steps, label_map_path,
                record_dir, eval_number, annotation_type, batch_size=None,
                learning_rate=None, resizer_size=None):
    '''
    Wrapper to edit the essential values inside the base configuration
    protobuf file provided with an object-detection/segmentation checkpoint.
    This configuration file is what will entirely define your model,
    pre-processing, training, evaluation, etc. Together with the checkpoint
    file it is the most important file of a model and should never be deleted.
    This is why it is saved in almost every directory where you did something,
    both for redundancy and to be sure you have the right config file used at
    that moment. Advanced users who want to dig deeper into the configuration
    file should read the proto definitions inside the proto directory of the
    object-detection API.

    Args:
        Required:
            model_selected: The checkpoint you want to resume from.
            config_output_dir: The path where you want to save your edited
                protobuf configuration file.
            num_steps: The number of steps you want to train on.
            label_map_path: The path to your label_map.pbtxt file.
            record_dir: The path to the directory where your TFRecord files
                are saved.
            eval_number: The number of images you want to evaluate on.
            annotation_type: Should be either "rectangle" or "polygon",
                depending on how you annotated your images.
        Optional:
            batch_size: The batch size you want to use. If not provided it
                will use the previous one.
            learning_rate: The learning rate you want to use for the training.
                If not provided it will use the previous one. Please see
                config_utils.update_initial_learning_rate() inside the
                object_detection folder for in-depth details on what happens
                when updating it.
            resizer_size: The shape used to update your image resizer. Please
                see set_image_resizer() for more details on this. If not
                provided it will use the previous one.
    '''
    # Pick the newest numbered checkpoint in the model directory, falling back
    # to the bare "model.ckpt" when no numbered checkpoints exist.
    file_list = os.listdir(model_selected)
    ckpt_ids = []
    for p in file_list:
        if "index" in p and "-" in p:
            ckpt_ids.append(int(p.split('-')[1].split('.')[0]))
    if ckpt_ids:
        ckpt_path = os.path.join(model_selected,
                                 "model.ckpt-{}".format(max(ckpt_ids)))
    else:
        ckpt_path = os.path.join(model_selected, "model.ckpt")

    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(model_selected, 'pipeline.config'))
    label_map = label_map_util.load_labelmap(label_map_path)

    config_util._update_train_steps(configs, num_steps)
    update_different_paths(
        configs,
        ckpt_path=ckpt_path,
        label_map_path=label_map_path,
        train_record_path=os.path.join(record_dir, "train.record"),
        eval_record_path=os.path.join(record_dir, "eval.record"))

    if learning_rate is not None:
        config_util._update_initial_learning_rate(configs, learning_rate)
    if batch_size is not None:
        config_util._update_batch_size(configs, batch_size)
    if annotation_type == "polygon":
        edit_masks(configs, mask_type="PNG_MASKS")
    if resizer_size is not None:
        set_image_resizer(configs, resizer_size)

    edit_eval_config(configs, annotation_type, eval_number)
    update_num_classes(configs, label_map)
    config_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(config_proto, directory=config_output_dir)
def populate_experiment(run_config,
                        hparams,
                        pipeline_config_path,
                        train_steps=None,
                        eval_steps=None,
                        model_fn_creator=create_model_fn,
                        **kwargs):
    """Populates an `Experiment` object.

    Args:
        run_config: A `RunConfig`.
        hparams: A `HParams`.
        pipeline_config_path: A path to a pipeline config file.
        train_steps: Number of training steps. If None, the number of training
            steps is set from the `TrainConfig` proto.
        eval_steps: Number of evaluation steps per evaluation cycle. If None,
            the number of evaluation steps is set from the `EvalConfig` proto.
        model_fn_creator: A function that creates a `model_fn` for
            `Estimator`. Follows the signature:
            * Args:
                * `detection_model_fn`: Function that returns `DetectionModel`
                  instance.
                * `configs`: Dictionary of pipeline config objects.
                * `hparams`: `HParams` object.
            * Returns: `model_fn` for `Estimator`.
        **kwargs: Additional keyword arguments for configuration override.

    Returns:
        An `Experiment` that defines all aspects of training, evaluation, and
        export.
    """
    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    configs = config_util.merge_external_params_with_configs(
        configs,
        hparams,
        train_steps=train_steps,
        eval_steps=eval_steps,
        **kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']
    eval_config = configs['eval_config']
    eval_input_config = configs['eval_input_config']

    if train_steps is None:
        train_steps = train_config.num_steps if train_config.num_steps else None

    if eval_steps is None:
        eval_steps = eval_config.num_examples if eval_config.num_examples else None

    detection_model_fn = functools.partial(
        model_builder.build, model_config=model_config)

    # Create the input functions for TRAIN/EVAL.
    train_input_fn = inputs.create_train_input_fn(
        train_config=train_config,
        train_input_config=train_input_config,
        model_config=model_config)
    eval_input_fn = inputs.create_eval_input_fn(
        eval_config=eval_config,
        eval_input_config=eval_input_config,
        model_config=model_config)

    export_strategies = [
        tf.contrib.learn.utils.saved_model_export_utils.make_export_strategy(
            serving_input_fn=inputs.create_predict_input_fn(
                model_config=model_config))
    ]

    estimator = tf.estimator.Estimator(
        model_fn=model_fn_creator(detection_model_fn, configs, hparams),
        config=run_config)

    if run_config.is_chief:
        # Store the final pipeline config for traceability.
        pipeline_config_final = config_util.create_pipeline_proto_from_configs(
            configs)
        pipeline_config_final_path = os.path.join(estimator.model_dir,
                                                  'pipeline.config')
        config_text = text_format.MessageToString(pipeline_config_final)
        with tf.gfile.Open(pipeline_config_final_path, 'wb') as f:
            tf.logging.info('Writing as-run pipeline config file to %s',
                            pipeline_config_final_path)
            f.write(config_text)

    return tf.contrib.learn.Experiment(
        estimator=estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps,
        export_strategies=export_strategies,
        eval_delay_secs=120,
    )
import tensorflow as tf

# Assuming object detection API is available for use
from object_detection.utils.config_util import create_pipeline_proto_from_configs
from object_detection.utils.config_util import get_configs_from_pipeline_file
import export

# Configuration for model to be exported
config_pathname = 'training/faster_rcnn_inception_v2_pets.config'

# Input checkpoint for the model to be exported
# Path to the directory which consists of the saved model on disk (see above)
trained_model_dir = '/home/rice/PycharmProjects/mytensorflow_sample/MODEL/'

# Create proto from model configuration
configs = get_configs_from_pipeline_file(config_pathname)
pipeline_proto = create_pipeline_proto_from_configs(configs=configs)

# Read .ckpt and .meta files from model directory
checkpoint = tf.train.get_checkpoint_state(trained_model_dir)
input_checkpoint = checkpoint.model_checkpoint_path

# Model version
model_version_id = '1'

# Output directory
output_directory = ('/home/rice/tensorflow1/models/research/object_detection/'
                    + str(model_version_id))

# Export model for serving
export.export_inference_graph(
    input_type='image_tensor',
    pipeline_config=pipeline_proto,
    trained_checkpoint_prefix=input_checkpoint,
    output_directory=output_directory)
def start_training(self):
    """Start training for the model."""
    worker_replicas = 1
    ps_tasks = 0
    clone_on_cpu = False
    num_clones = 1
    ensure_path(config.BASE_MODELS_PATH)
    train_dir = self.train_dir
    model_json_path = os.path.join(train_dir, 'job.json')
    job = self.job
    num_steps = int(job['steps'])
    try:
        if config.DEBUG:
            num_steps = 50
    except AttributeError:
        pass
    except Exception as e:
        _LOGGER.error(e)
    job = api.update_job_state(job, 'training',
                               'Start training for {} steps'.format(num_steps))
    model = self.model
    ensure_path(config.EXPORTED_MODELS)
    model_graph = os.path.join(config.EXPORTED_MODELS,
                               '{}.pb'.format(model['file_name']))
    if not os.path.exists(os.path.join(train_dir, 'checkpoint')):
        # New training started.
        _LOGGER.debug("Checkpoints don't exist")
        base_checkpoints_path = os.path.join(config.BASE_MODELS_PATH,
                                             model['architecture'])
        _tmf = os.path.join(config.TRAINED_MODELS_DATA, model['file_name'])
        if os.path.isdir(_tmf):
            _LOGGER.debug("Model already exists as %s" % model_graph)
            base_checkpoints_path = _tmf
        elif model['type'] == 'new':
            _LOGGER.debug("model type new")
        else:
            _LOGGER.debug("New model from parent model")
            parent_model = api.get_model(model['parent'])
            if not parent_model:
                raise Exception('Parent model not found on server')
            parent_tmf = os.path.join(config.TRAINED_MODELS_DATA,
                                      parent_model['file_name'])
            if os.path.isdir(parent_tmf):
                base_checkpoints_path = parent_tmf
            else:
                _LOGGER.error("Parent model not found. Please train it first")
                return False
        if not os.path.exists(os.path.join(base_checkpoints_path,
                                           'model.ckpt.meta')):
            _LOGGER.debug("Base model not found for %s, downloading now."
                          % model['architecture'])
            _f = api.download_model_files(model['architecture'])
            tmp_model_data = os.path.join(config.DATA_DIR, 'tmp_model_data')
            if tarfile.is_tarfile(_f):
                if os.path.exists(tmp_model_data):
                    shutil.rmtree(tmp_model_data)
                ensure_path(tmp_model_data)
                print("Tar file found")
                shutil.unpack_archive(_f, tmp_model_data)
                for root, dirs, files in os.walk(tmp_model_data):
                    for file in files:
                        if 'model.ckpt' in file:
                            path = os.path.join(root, file)
                            ensure_path(base_checkpoints_path)
                            shutil.copy(path,
                                        os.path.join(base_checkpoints_path,
                                                     file))
            else:
                _LOGGER.error("Invalid file")
                return False
        if os.path.exists(train_dir):
            shutil.rmtree(train_dir)
        shutil.copytree(base_checkpoints_path, train_dir)
        if os.path.exists(os.path.join(train_dir, 'checkpoint')):
            os.remove(os.path.join(train_dir, 'checkpoint'))
    if os.path.exists(os.path.join(train_dir, 'data')):
        shutil.rmtree(os.path.join(train_dir, 'data'))
    shutil.copytree(self.data_dir, os.path.join(train_dir, 'data'))
    counts = {'train': 0, 'test': 1000, 'classes': 1}
    stats_file = os.path.join(train_dir, "data", "stats.json")
    try:
        with open(stats_file) as _f:
            counts = json.load(_f)
    except Exception:
        pass
    pipeline_config_path = os.path.join(train_dir, 'pipeline.config')
    if not os.path.exists(pipeline_config_path):
        pipeline_config_path = os.path.join(
            self.configs_dir, "{}.config".format(model['architecture']))
    task = '0'
    if task == '0':
        tf.gfile.MakeDirs(train_dir)
    if pipeline_config_path:
        _LOGGER.info("Pipeline config file: {}".format(pipeline_config_path))
        configs = config_util.get_configs_from_pipeline_file(
            pipeline_config_path)
        if task == '0':
            tf.gfile.Copy(pipeline_config_path,
                          os.path.join(train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        _LOGGER.error("No config found")
        return False
    pipeline_config_path = os.path.join(train_dir, 'pipeline.config')
    # with open(model_json_path, 'w') as mf:
    #     json.dump(job, mf)
    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']
    if model_config.HasField('faster_rcnn'):
        model_config.faster_rcnn.num_classes = counts['classes']
    if model_config.HasField('ssd'):
        model_config.ssd.num_classes = counts['classes']
    # Set num_steps.
    train_config.num_steps = num_steps
    train_config.fine_tune_checkpoint = os.path.join(train_dir, 'model.ckpt')
    # Update input config to use the updated list of inputs.
    input_config.tf_record_input_reader.ClearField('input_path')
    input_config.tf_record_input_reader.input_path.append(
        os.path.join(train_dir, 'data', "train_baheads.tfrecord-??????"))
    input_config.label_map_path = os.path.join(train_dir, 'data',
                                               "labels.pbtxt")
    eval_config = configs['eval_config']
    eval_input_config = configs['eval_input_config']
    eval_config.num_examples = counts['test']
    eval_config.max_evals = 1
    # Update input config to use the updated list of inputs.
    eval_input_config.tf_record_input_reader.ClearField('input_path')
    eval_input_config.tf_record_input_reader.input_path.append(
        os.path.join(train_dir, 'data', "test_baheads.tfrecord-??????"))
    eval_input_config.label_map_path = os.path.join(train_dir, 'data',
                                                    "labels.pbtxt")
    # Save the updated config to the pipeline file.
    config_util.save_pipeline_config(
        config_util.create_pipeline_proto_from_configs({
            'model': model_config,
            'train_config': train_config,
            'train_input_config': input_config,
            'eval_config': eval_config,
            'eval_input_config': eval_input_config
        }), train_dir)
    model_fn = functools.partial(
        model_builder.build, model_config=model_config, is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)
    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''
    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas includes "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])
    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed '
                         'training.')
    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return
        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target
    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)
    if not os.path.exists(os.path.join(
            train_dir, 'model.ckpt-{}.meta'.format(num_steps))):
        status_timer = StatusThread(tfh, num_steps, job)
        status_timer.start()
        try:
            trainer.train(create_input_dict_fn, model_fn, train_config, master,
                          task, num_clones, worker_replicas, clone_on_cpu,
                          ps_tasks, worker_job_name, is_chief, train_dir,
                          graph_hook_fn=graph_rewriter_fn)
        except KeyboardInterrupt:
            raise
        finally:
            status_timer.stop()
            if status_timer.is_alive():
                _LOGGER.info("Waiting for status thread to close")
                status_timer.join()
    if os.path.exists(os.path.join(
            train_dir, 'model.ckpt-{}.meta'.format(num_steps))):
        # Training complete. Export the model.
        _LOGGER.debug("Training complete for %d steps" % num_steps)
        job = api.update_job_state(job, 'training', 'Training complete')
        export_path = os.path.join(config.TRAINED_MODELS_DATA,
                                   model['file_name'])
        if os.path.exists(export_path):
            shutil.rmtree(export_path)
        ckpt_path = os.path.join(train_dir, 'model.ckpt-{}'.format(num_steps))
        exporter.export(pipeline_config_path, export_path, ckpt_path)
        frozen_graph = os.path.join(export_path, 'frozen_inference_graph.pb')
        if os.path.exists(frozen_graph):
            # Successfully exported.
            shutil.copy(frozen_graph, model_graph)
            shutil.copy(
                os.path.join(train_dir, 'data', "labels.pbtxt"),
                os.path.join(config.EXPORTED_MODELS,
                             '{}.pbtxt'.format(model['file_name'])))
            # TODO: Eval the trained graph, push the result to the server.
            eval_dir = 'eval_dir'
            tf.reset_default_graph()
            eval_result = run_eval(train_dir, eval_dir, pipeline_config_path,
                                   counts['test'])
            if 'PascalBoxes_Precision/mAP@0.5IOU' in eval_result:
                acc = eval_result['PascalBoxes_Precision/mAP@0.5IOU'] * 100
                _LOGGER.info("PascalBoxes_Precision/mAP@0.5IOU: %d %%" % acc)
                job = api.update_job_state(
                    job, 'complete', 'PascalBoxes_Precision %d %%' % acc)
            _LOGGER.info(eval_result)
            if os.path.exists(train_dir):
                shutil.rmtree(train_dir)
            return True
    return False
def save_pipeline(pipeline_dict, out_folder):
    pipeline_proto = create_pipeline_proto_from_configs(pipeline_dict)
    save_pipeline_config(pipeline_proto, out_folder)
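# A minimal usage sketch for save_pipeline above; the config path and output
# directory are placeholders:
configs = get_configs_from_pipeline_file('path/to/pipeline.config')
save_pipeline(configs, 'out_dir')  # writes out_dir/pipeline.config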
# (snippet begins mid-dictionary; the key paired with num_classes is truncated)
        num_classes,
    'train_config.fine_tune_checkpoint':
        os.path.join(os.getcwd(), 'pretrained_models', model_name,
                     'model.ckpt'),
    'label_map_path': label_pbtxt,
    'train_input_path': train_record,
    'eval_input_path': eval_record,
}

# Update the config file.
tf.logging.info("Update config file {}".format(pipeline_config))
config = config_util.merge_external_params_with_configs(
    config, kwargs_dict=config_updata)
config = config_util.create_pipeline_proto_from_configs(config)
with tf.gfile.Open(pipeline_config, "wb") as f:
    f.write(text_format.MessageToString(config))

# Location where training results are saved.
save_path = os.path.join(os.getcwd(), 'weights', model_name)
if not os.path.exists(save_path):
    os.mkdir(save_path)


def main(unused_argv):
    train_and_eval_dict = model_lib.create_estimator_and_inputs(
        run_config=tf.estimator.RunConfig(model_dir=save_path),
        hparams=model_hparams.create_hparams(None),
        pipeline_config_path=pipeline_config,
        train_steps=None,
def update_pipeline_config(params, eval_type):
    cfg = config_util.get_configs_from_pipeline_file(
        os.path.join(params.config_mnt, params.config_dir))

    # Update num_classes from the label map.
    model_name = os.path.basename(
        os.path.normpath(os.path.join(params.config_mnt,
                                      params.config_dir))).lower()
    print("model name: ", model_name)
    if model_name.startswith("ssd"):
        model_cfg = cfg['model'].ssd
    elif model_name.startswith("faster_rcnn"):
        model_cfg = cfg['model'].faster_rcnn
    else:
        raise ValueError(
            'unknown base model {}, we can only handle ssd or faster_rcnn'
            .format(model_name))
    label_map = os.path.join(params.config_mnt, params.label_dir)
    label_map_dict = label_map_util.get_label_map_dict(label_map)
    num_classes = len(label_map_dict)
    model_cfg.num_classes = num_classes

    # Update the base model directory.
    train_cfg = cfg['train_config']
    train_cfg.fine_tune_checkpoint = os.path.join(
        params.config_mnt, params.transfer_learning_dir, 'model.ckpt')

    eval_cfg = cfg['eval_config']
    eval_cfg.max_evals = 1
    eval_cfg.num_examples = int(params.eval_num_examples)

    # Update num_train_steps, label_map_path, train/eval tfrecords, batch size.
    print(os.path.join(os.path.sep, params.base_mnt, params.source_data_name,
                       'tf_records', 'train.record'))
    hparams = tf.contrib.training.HParams(
        batch_size=int(params.batch_size),
        train_steps=int(params.num_steps),
        label_map_path=label_map,
        train_input_path=os.path.join(os.path.sep, params.base_mnt,
                                      params.source_data_name, 'tf_records',
                                      'train.record'),
        eval_input_path=os.path.join(os.path.sep, params.base_mnt,
                                     params.source_data_name, 'tf_records',
                                     eval_type + '.record'),
    )
    cfg = config_util.merge_external_params_with_configs(cfg, hparams)

    # Log metrics.
    run_context = Run.get_context()
    run_context.log("Batch Size", int(params.batch_size))
    run_context.log("Training Steps", int(params.num_steps))
    # run.log("Maximum Evaluations", max_evals)

    updated_pipeline_config = config_util.create_pipeline_proto_from_configs(
        cfg)
    print("updated_pipeline_config: ", updated_pipeline_config)
    updated_pipeline_config_file = os.path.join(params.config_mnt,
                                                params.config_dir)
    print("updated_pipeline_config_file: ", updated_pipeline_config_file)
    print("dir name: ",
          os.path.dirname(os.path.join(params.config_mnt, params.config_dir)))
    config_util.save_pipeline_config(
        updated_pipeline_config,
        os.path.join(params.base_mnt, params.source_data_name, 'model_config'))
    return updated_pipeline_config, updated_pipeline_config_file
import tensorflow as tf

# Assuming object detection API is available for use
from object_detection.utils.config_util import create_pipeline_proto_from_configs
from object_detection.utils.config_util import get_configs_from_pipeline_file
import object_detection.exporter

# Configuration for model to be exported
config_pathname = '/home/stash/projects/aadhar_identification/models/research/object_detection/samples/configs/faster_rcnn_resnet101_pets.config'

# Input checkpoint for the model to be exported
# Path to the directory which consists of the saved model on disk (see above)
trained_model_dir = '/home/stash/projects/aadhar_identification/models/research/exported_graphs_171020181048'

# Create proto from model configuration
configs = get_configs_from_pipeline_file(config_pathname)
pipeline_proto = create_pipeline_proto_from_configs(configs=configs)

# Read .ckpt and .meta files from model directory
checkpoint = tf.train.get_checkpoint_state(trained_model_dir)
input_checkpoint = checkpoint.model_checkpoint_path

# Model version
model_version_id = 1

# Output directory
output_directory = ('/home/stash/projects/aadhar_identification/models/research/exported_graphs_301020181727'
                    + str(model_version_id))

# Export model for serving
object_detection.exporter.export_inference_graph(
    input_type='image_tensor',
    pipeline_config=pipeline_proto,
    trained_checkpoint_prefix=input_checkpoint,
    output_directory=output_directory)
def eval_continuously(
        pipeline_config_path,
        config_override=None,
        train_steps=None,
        sample_1_of_n_eval_examples=1,
        sample_1_of_n_eval_on_train_examples=1,
        use_tpu=False,
        override_eval_num_epochs=True,
        postprocess_on_cpu=False,
        model_dir=None,
        checkpoint_dir=None,
        wait_interval=180,
        timeout=3600,
        eval_index=0,
        save_final_config=False,
        **kwargs):
    """Run continuous evaluation of a detection model eagerly.

    This method builds the model, and continuously restores it from the most
    recent training checkpoint in the checkpoint directory & evaluates it on
    the evaluation data.

    Args:
        pipeline_config_path: A path to a pipeline config file.
        config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
            override the config from `pipeline_config_path`.
        train_steps: Number of training steps. If None, the number of training
            steps is set from the `TrainConfig` proto.
        sample_1_of_n_eval_examples: Integer representing how often an eval
            example should be sampled. If 1, will sample all examples.
        sample_1_of_n_eval_on_train_examples: Similar to
            `sample_1_of_n_eval_examples`, except controls the sampling of
            training data for evaluation.
        use_tpu: Boolean, whether training and evaluation should run on TPU.
        override_eval_num_epochs: Whether to overwrite the number of epochs to
            1 for eval_input.
        postprocess_on_cpu: When use_tpu and postprocess_on_cpu are true,
            postprocess is scheduled on the host cpu.
        model_dir: Directory to output resulting evaluation summaries to.
        checkpoint_dir: Directory that contains the training checkpoints.
        wait_interval: The minimum number of seconds to wait before checking
            for a new checkpoint.
        timeout: The maximum number of seconds to wait for a checkpoint.
            Execution will terminate if no new checkpoints are found after
            these many seconds.
        eval_index: int, If given, only evaluate the dataset at the given
            index. By default, evaluates dataset at 0'th index.
        save_final_config: Whether to save the pipeline config file to the
            model directory.
        **kwargs: Additional keyword arguments for configuration override.
    """
    configs = config_util.get_configs_from_pipeline_file(
        pipeline_config_path, config_override=config_override)
    # get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
    #     'get_configs_from_pipeline_file']
    # create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
    #     'create_pipeline_proto_from_configs']
    # merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
    #     'merge_external_params_with_configs']
    # configs = get_configs_from_pipeline_file(
    #     pipeline_config_path, config_override=config_override)
    kwargs.update({
        'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples,
        'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
    })
    if train_steps is not None:
        kwargs['train_steps'] = train_steps
    if override_eval_num_epochs:
        kwargs.update({'eval_num_epochs': 1})
        # tf.logging.warning(
        #     'Forced number of epochs for all eval validations to be 1.')
    configs = config_util.merge_external_params_with_configs(
        configs, None, kwargs_dict=kwargs)
    if model_dir and save_final_config:
        # tf.logging.info('Saving pipeline config file to directory {}'.format(
        #     model_dir))
        pipeline_config_final = config_util.create_pipeline_proto_from_configs(
            configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    model_config = configs['model']
    train_input_config = configs['train_input_config']
    eval_config = configs['eval_config']
    eval_input_configs = configs['eval_input_configs']
    eval_on_train_input_config = copy.deepcopy(train_input_config)
    eval_on_train_input_config.sample_1_of_n_examples = (
        sample_1_of_n_eval_on_train_examples)
    if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1:
        # tf.logging.warning('Expected number of evaluation epochs is 1, but '
        #                    'instead encountered `eval_on_train_input_config'
        #                    '.num_epochs` = '
        #                    '{}. Overwriting `num_epochs` to 1.'.format(
        #                        eval_on_train_input_config.num_epochs))
        eval_on_train_input_config.num_epochs = 1

    if kwargs['use_bfloat16']:
        tf.compat.v2.keras.mixed_precision.set_global_policy('mixed_bfloat16')

    eval_input_config = eval_input_configs[eval_index]
    strategy = tf.compat.v2.distribute.get_strategy()
    with strategy.scope():
        detection_model = model_builder.build(
            model_config=model_config, is_training=True)
        # detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base'](
        #     model_config=model_config, is_training=True)

    eval_input = strategy.experimental_distribute_dataset(
        inputs.eval_input(
            eval_config=eval_config,
            eval_input_config=eval_input_config,
            model_config=model_config,
            model=detection_model))

    global_step = tf.compat.v2.Variable(
        0, trainable=False, dtype=tf.compat.v2.dtypes.int64)

    optimizer, _ = optimizer_builder.build(
        configs['train_config'].optimizer, global_step=global_step)

    for latest_checkpoint in tf.train.checkpoints_iterator(
            checkpoint_dir, timeout=timeout, min_interval_secs=wait_interval):
        ckpt = tf.compat.v2.train.Checkpoint(
            step=global_step, model=detection_model, optimizer=optimizer)

        # We run the detection_model on dummy inputs in order to ensure that
        # the model and all its variables have been properly constructed.
        # Specifically, this is currently necessary prior to (potentially)
        # creating shadow copies of the model variables for the EMA optimizer.
        if eval_config.use_moving_averages:
            unpad_groundtruth_tensors = (eval_config.batch_size == 1
                                         and not use_tpu)
            _ensure_model_is_built(detection_model, eval_input,
                                   unpad_groundtruth_tensors)
            optimizer.shadow_copy(detection_model)

        ckpt.restore(latest_checkpoint).expect_partial()

        if eval_config.use_moving_averages:
            optimizer.swap_weights()

        summary_writer = tf.compat.v2.summary.create_file_writer(
            os.path.join(model_dir, 'eval', eval_input_config.name))
        with summary_writer.as_default():
            eager_eval_loop(
                detection_model,
                configs,
                eval_input,
                use_tpu=use_tpu,
                postprocess_on_cpu=postprocess_on_cpu,
                global_step=global_step,
            )
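# A hypothetical invocation of eval_continuously above; the paths are
# placeholders, and model_dir/checkpoint_dir may point at the same training
# directory:
eval_continuously(
    pipeline_config_path='models/my_model/pipeline.config',
    model_dir='models/my_model',
    checkpoint_dir='models/my_model',
    wait_interval=180,
    timeout=3600)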
def train_loop(config_path: str,
               model_dir: str,
               config_override: Optional[
                   pipeline_pb2.TrainEvalPipelineConfig] = None,
               train_steps: Optional[int] = None,
               use_tpu: bool = False,
               save_final_config: bool = False,
               log_every_n: int = 100,
               ckpt_every_n: int = 1000,
               ckpt_max_to_keep: int = 7,
               record_summaries: bool = True,
               **kwargs) -> None:
    """Trains a model using eager + functions.

    This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Checkpoints the model every `ckpt_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

    Args:
        config_path: A path to a pipeline config file.
        model_dir: The directory to save checkpoints and summaries to.
        config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
            override the config from `config_path`.
        train_steps: Number of training steps. If None, training steps from
            `TrainConfig` proto will be adopted.
        use_tpu: Boolean, whether training and evaluation should run on TPU.
        save_final_config: Whether to save final config (obtained after
            applying overrides) to `model_dir`.
        log_every_n: Log total loss every n training steps.
        ckpt_every_n: Checkpoint every n training steps.
        ckpt_max_to_keep: int, the number of most recent checkpoints to keep
            in the model directory.
        record_summaries: Boolean, whether or not to record summaries.
        **kwargs: Additional keyword arguments for configuration override.
    """
    # Parse the pipeline config.
    configs = config_util.get_configs_from_pipeline_file(
        config_path, config_override=config_override)
    kwargs.update({
        'train_steps': train_steps,
        'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu,
    })
    configs = config_util.merge_external_params_with_configs(
        configs, None, kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_gt_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss

    clip_gradient_norm = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradient_norm = train_config.gradient_clipping_by_norm

    if kwargs['use_bfloat16']:
        tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

    if train_config.load_all_detection_checkpoint_vars:
        raise ValueError(
            'train_pb2.load_all_detection_checkpoint_vars unsupported in TF2')

    # Base checkpoint to fine-tune from.
    config_util.update_fine_tune_checkpoint_type(train_config)
    base_ckpt = train_config.fine_tune_checkpoint
    base_ckpt_type = train_config.fine_tune_checkpoint_type
    base_ckpt_ver = train_config.fine_tune_checkpoint_version

    # Write the as-run pipeline config to disk.
    if save_final_config:
        pipeline_config_final = config_util.create_pipeline_proto_from_configs(
            configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # Build the model, input, and optimizer.
    strategy = tf.distribute.get_strategy()
    with strategy.scope():
        # Build the model.
        model = model_builder.build(model_config=model_config,
                                    is_training=True)

        # Build the input.
        def train_dataset_fn(
                input_context: tf.distribute.InputContext) -> tf.data.Dataset:
            """Callable to create train input."""
            train_input = inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=model,
                input_context=input_context,
            )
            train_input = train_input.repeat()
            return train_input

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)

        # Build the optimizer.
        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.int64,
            name='global_step',
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
        )
        optimizer, (learning_rate,) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    # Prepare for training: get the appropriate filepath (temporary or not)
    # based on whether the worker is the chief.
    summary_log_path = get_filepath(strategy, os.path.join(model_dir, 'train'))

    if record_summaries:
        summary_writer = tf.summary.create_file_writer(summary_log_path)
    else:
        summary_writer = tf.summary.create_noop_writer()

    if use_tpu:
        num_steps_per_iteration = 100
    else:
        num_steps_per_iteration = 1

    with summary_writer.as_default():
        with strategy.scope():
            with tf.summary.record_if(
                    lambda: global_step % num_steps_per_iteration == 0):
                # Prepare the checkpoint manager. (Do not use
                # manager.latest_checkpoint, as manager_dir is not model_dir
                # while running on a worker.)
                ckpt = tf.train.Checkpoint(model=model,
                                           step=global_step,
                                           optimizer=optimizer)
                ckpt_max_to_keep = (
                    ckpt_max_to_keep
                    if strategy.extended.should_checkpoint else 1)
                manager_dir = get_filepath(strategy, model_dir)
                manager = tf.train.CheckpointManager(
                    ckpt, manager_dir, max_to_keep=ckpt_max_to_keep)

                latest_ckpt = tf.train.latest_checkpoint(model_dir)
                if latest_ckpt:
                    # Load the latest checkpoint being trained.
                    ckpt.restore(latest_ckpt).expect_partial()
                elif base_ckpt:
                    # Load a pre-trained checkpoint.
                    load_base_ckpt(model, base_ckpt, base_ckpt_type,
                                   base_ckpt_ver, train_input,
                                   unpad_gt_tensors)

                # Get the trainable variables.
                train_vars = get_train_vars(model, train_config)

                # Define a single training step.
                def train_step_fn(features: Dict, labels: Dict):
                    """Single train step."""
                    loss = eager_train_step(
                        model,
                        train_vars,
                        features,
                        labels,
                        unpad_gt_tensors,
                        optimizer,
                        learning_rate=learning_rate_fn(),
                        add_regularization_loss=add_regularization_loss,
                        clip_gradient_norm=clip_gradient_norm,
                        global_step=global_step,
                        num_replicas=strategy.num_replicas_in_sync,
                    )
                    global_step.assign_add(1)
                    return loss

                def _sample_and_train(strategy, train_step_fn, data_iterator):
                    features, labels = data_iterator.next()
                    per_replica_losses = strategy.run(
                        train_step_fn, args=(features, labels))
                    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                           per_replica_losses, axis=None)

                @tf.function
                def _dist_train_step(data_iterator):
                    """A distributed train step."""
                    if num_steps_per_iteration > 1:
                        for _ in tf.range(num_steps_per_iteration - 1):
                            with tf.name_scope(''):
                                _sample_and_train(strategy, train_step_fn,
                                                  data_iterator)
                    return _sample_and_train(strategy, train_step_fn,
                                             data_iterator)

                train_input_iter = iter(train_input)

                # Save an initialized version of the checkpoint.
                if int(global_step.value()) == 0:
                    manager.save()

                ckpt_step = int(global_step.value())
                logged_step = global_step.value()

                # Proceed with training.
                last_step_time = time.time()
                for _ in range(global_step.value(), train_config.num_steps,
                               num_steps_per_iteration):
                    # Execute a step (forward pass + backward pass).
                    loss = _dist_train_step(train_input_iter)

                    # Log the step time.
                    curr_step = global_step.value()
                    time_taken = time.time() - last_step_time
                    last_step_time = time.time()
                    tf.summary.scalar(
                        'steps_per_sec',
                        num_steps_per_iteration * 1.0 / time_taken,
                        step=global_step,
                    )

                    # Log the loss.
                    if curr_step - logged_step >= log_every_n:
                        step_time = time_taken / num_steps_per_iteration
                        step_msg = ('Step {} per-step time {:.3f}s '
                                    'loss={:.3f}'.format(
                                        curr_step, step_time, loss))
                        v1.logging.info(step_msg)
                        logged_step = curr_step

                    # Save a checkpoint regularly.
                    if (curr_step - ckpt_step) >= ckpt_every_n:
                        manager.save()
                        ckpt_step = curr_step

    # Remove the checkpoint directories of non-chief workers that
    # MultiWorkerMirroredStrategy forces us to save during sync distributed
    # training.
    clean_temporary_directories(strategy, manager_dir)
    clean_temporary_directories(strategy, summary_log_path)
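# A hypothetical invocation of train_loop above; the paths are placeholders:
train_loop(
    config_path='models/my_model/pipeline.config',
    model_dir='models/my_model',
    save_final_config=True)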
def set_config(config_path: Union[str, Path],
               checkpoint_path: Union[str, Path],
               tf_records_train_path: Union[str, Path],
               label_map: Dict[str, int],
               label_map_filepath: Union[str, Path],
               batch_size: int,
               max_box_predictions: int,
               max_number_of_boxes: int,
               fine_tune_checkpoint_type: str = 'detection',
               augment_path: str = None,
               min_dimension: int = None,
               max_dimension: int = None,
               total_steps: int = None,
               warmup_steps: int = None,
               num_steps: int = None):
    logger.info(f"Set configs {config_path}...")
    configs = get_configs_from_pipeline_file(str(config_path))
    train_len = count_tfrecord_examples(str(tf_records_train_path))
    logger.info(f"Train has {train_len} tf_records.")

    def clear_repeated_proto(proto):
        """Removes every entry from a repeated proto field."""
        while proto:
            proto.pop()

    num_classes = len(set(label_map.values()))
    _, config_model = configs['model'].ListFields()[0]
    config_model.num_classes = num_classes
    configs['model'].center_net.object_center_params.max_box_predictions = (
        max_box_predictions)
    if min_dimension is not None:
        configs['model'].center_net.image_resizer.keep_aspect_ratio_resizer \
            .min_dimension = min_dimension
    if max_dimension is not None:
        configs['model'].center_net.image_resizer.keep_aspect_ratio_resizer \
            .max_dimension = max_dimension

    configs['train_config'].fine_tune_checkpoint_type = (
        fine_tune_checkpoint_type)
    configs['train_config'].fine_tune_checkpoint = str(checkpoint_path)
    configs['train_config'].batch_size = batch_size
    configs['train_config'].max_number_of_boxes = max_number_of_boxes
    if total_steps is not None:
        configs['train_config'].optimizer.adam_optimizer.learning_rate \
            .cosine_decay_learning_rate.total_steps = total_steps
    if warmup_steps is not None:
        configs['train_config'].optimizer.adam_optimizer.learning_rate \
            .cosine_decay_learning_rate.warmup_steps = warmup_steps
    if num_steps is not None:
        configs['train_config'].num_steps = num_steps
    if augment_path is not None:
        augment_config = configs['train_config'].data_augmentation_options
        clear_repeated_proto(augment_config)
        augment = text_format.Merge(str(augment_path),
                                    pipeline_pb2.TrainEvalPipelineConfig())
        augment_config.extend(augment.train_config.data_augmentation_options)

    label_map_to_file(label_map=label_map, filepath=label_map_filepath)
    configs['train_input_config'].label_map_path = str(label_map_filepath)
    clear_repeated_proto(
        configs['train_input_config'].tf_record_input_reader.input_path)
    configs['train_input_config'].tf_record_input_reader.input_path.append(
        str(tf_records_train_path))

    pipeline_proto = create_pipeline_proto_from_configs(configs)
    save_pipeline_config(pipeline_proto, str(config_path.parent))
    logger.info(f"Config {config_path} changed")
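# A hypothetical call to set_config above for a CenterNet pipeline; every path
# and value below is a placeholder, not from the original source:
set_config(
    config_path=Path('models/centernet/pipeline.config'),
    checkpoint_path='models/centernet/checkpoint/ckpt-0',
    tf_records_train_path='data/train.record',
    label_map={'cat': 1, 'dog': 2},
    label_map_filepath='data/label_map.pbtxt',
    batch_size=8,
    max_box_predictions=100,
    max_number_of_boxes=100)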