def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           write_inference_graph=False):
    detection_model = model_builder.build(pipeline_config.model,
                                          is_training=False)
    graph_rewriter_fn = None
    if pipeline_config.HasField('graph_rewriter'):
        graph_rewriter_config = pipeline_config.graph_rewriter
        graph_rewriter_fn = graph_rewriter_builder.build(graph_rewriter_config,
                                                         is_training=False)
    _export_inference_graph(input_type,
                            detection_model,
                            pipeline_config.eval_config.use_moving_averages,
                            trained_checkpoint_prefix,
                            output_directory,
                            additional_output_tensor_names,
                            input_shape,
                            output_collection_name,
                            graph_hook_fn=graph_rewriter_fn,
                            write_inference_graph=write_inference_graph)
    pipeline_config.eval_config.use_moving_averages = False
    config_util.save_pipeline_config(pipeline_config, output_directory)
def populate_config(settings):
    """Fill the base config file with settings and save the new version."""
    print('...Reading base config file')
    configs = config_util.get_configs_from_pipeline_file(
        settings['paths']['base_config'])

    print('...Updating config settings')
    hparams = tf.contrib.training.HParams(**{
        "model.ssd.num_classes": 1,
        "train_config.fine_tune_checkpoint":
            settings['config']['train_config']['fine_tune_checkpoint'],
        "train_config.num_steps":
            settings['config']['train_config']['num_steps'],
        "eval_config.num_examples":
            settings['config']['eval_config']['num_examples'],
        "label_map_path": settings['config']['label_map_path']
    })
    configs = config_util.merge_external_params_with_configs(configs, hparams)
    configs['train_input_config'].tf_record_input_reader.input_path[0] = (
        settings['config']['train_input_reader']['tf_record_input_reader']
        ['input_path'])
    configs['eval_input_config'].tf_record_input_reader.input_path[0] = (
        settings['config']['eval_input_reader']['tf_record_input_reader']
        ['input_path'])

    print('...Writing new config file')
    pipeline_config = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config,
                                     settings['dirs']['pipeline'])
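A minimal sketch of the nested `settings` dictionary this helper appears to expect; every key it reads above is mirrored here, but all paths and values are hypothetical placeholders rather than anything from the original code.

# Hypothetical example input for populate_config; placeholder values only.
settings = {
    'paths': {'base_config': 'configs/ssd_mobilenet_v2.config'},
    'dirs': {'pipeline': 'training/pipeline'},
    'config': {
        'label_map_path': 'data/label_map.pbtxt',
        'train_config': {
            'fine_tune_checkpoint': 'checkpoints/model.ckpt',
            'num_steps': 20000,
        },
        'eval_config': {'num_examples': 500},
        'train_input_reader': {
            'tf_record_input_reader': {'input_path': 'data/train.record'}
        },
        'eval_input_reader': {
            'tf_record_input_reader': {'input_path': 'data/eval.record'}
        },
    },
}
populate_config(settings)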
def create_config_file(input_path, config_params, network_type):
    configs = config_util.get_configs_from_pipeline_file(input_path)
    if config_params['checkpoint_path'] is not None:
        prefix = ""
        for ckpt_file in os.listdir(
                os.path.join('/checkpoints/' + network_type,
                             config_params['checkpoint_path'])):
            if ckpt_file.endswith(".index"):
                prefix = ckpt_file.split(".index")[0]
        config_params['checkpoint_path'] = (
            '/checkpoints/' + network_type + '/' +
            config_params['checkpoint_path'] + '/' + prefix)
    else:
        config_params['checkpoint_path'] = (
            '/weights/' + network_type + '/model.ckpt')

    new_configs = None
    if network_type == "ssd_mobilenet" or network_type == "ssd_inception":
        new_configs = config_ssd_mobilenet_inception(configs, config_params)
    elif network_type == "ssd_resnet_50" or network_type == "ssd_fpn":
        new_configs = config_ssd_mobilenet_inception(configs, config_params)
    elif network_type == "frcnn_resnet_50" or network_type == "frcnn_resnet_101":
        new_configs = config_frcnn_resnet_50_101(configs, config_params)

    pipeline_config = config_util.create_pipeline_proto_from_configs(new_configs)
    config_util.save_pipeline_config(pipeline_config, '/training_dir/model')
def train(self,
          epochs=100,
          val_split=0.3,
          clear_folder=False,
          override_pipeline=False,
          eval=False):
    try:
        if clear_folder:
            FileUtil.clear_folder(self._out_folder)
        self.num_steps = epochs
        self._mk_labels_map()
        self._mk_records(val_split)
        # update pipeline
        self._out_folder.joinpath(os.path.sep.join(
            ["export", "Servo"])).mkdir(exist_ok=True, parents=True)
        # merge pipelines
        save_pipeline_config(self.pipeline, str(self._out_folder))
        # start training
        tf.logging.set_verbosity(tf.logging.INFO)
        if eval:
            self._train_and_eval()
        else:
            self._train()
    except Exception as ex:
        raise Exception("Error training the model : {}".format(ex)) from ex
    return super(TfTrainableModel, self).train()
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           write_inference_graph=False,
                           use_side_inputs=False,
                           side_input_shapes=None,
                           side_input_names=None,
                           side_input_types=None):
    """Exports inference graph for the model specified in the pipeline config.

    Args:
      input_type: Type of input for the graph. Can be one of ['image_tensor',
        'encoded_image_string_tensor', 'tf_example'].
      pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
      trained_checkpoint_prefix: Path to the trained checkpoint file.
      output_directory: Path to write outputs.
      input_shape: Sets a fixed shape for an `image_tensor` input. If not
        specified, will default to [None, None, None, 3].
      output_collection_name: Name of collection to add output tensors to. If
        None, does not add output tensors to a collection.
      additional_output_tensor_names: list of additional output tensors to
        include in the frozen graph.
      write_inference_graph: If true, writes inference graph to disk.
      use_side_inputs: If True, the model requires side_inputs.
      side_input_shapes: List of shapes of the side input tensors, required if
        use_side_inputs is True.
      side_input_names: List of names of the side input tensors, required if
        use_side_inputs is True.
      side_input_types: List of types of the side input tensors, required if
        use_side_inputs is True.
    """
    detection_model = model_builder.build(pipeline_config.model,
                                          is_training=False)
    graph_rewriter_fn = None
    if pipeline_config.HasField('graph_rewriter'):
        graph_rewriter_config = pipeline_config.graph_rewriter
        graph_rewriter_fn = graph_rewriter_builder.build(graph_rewriter_config,
                                                         is_training=False)
    _export_inference_graph(
        input_type,
        detection_model,
        pipeline_config.eval_config.use_moving_averages,
        trained_checkpoint_prefix,
        output_directory,
        additional_output_tensor_names,
        input_shape,
        output_collection_name,
        graph_hook_fn=graph_rewriter_fn,
        write_inference_graph=write_inference_graph,
        use_side_inputs=use_side_inputs,
        side_input_shapes=side_input_shapes,
        side_input_names=side_input_names,
        side_input_types=side_input_types)
    pipeline_config.eval_config.use_moving_averages = False
    config_util.save_pipeline_config(pipeline_config, output_directory)
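For context, a hedged sketch of how a caller typically drives this TF1-style exporter: parse the text `pipeline.config` into a `TrainEvalPipelineConfig` proto and point the function at a training checkpoint prefix. The file paths below are placeholders, not values from the original snippet.

# Hedged usage sketch; paths are hypothetical.
import tensorflow.compat.v1 as tf
from google.protobuf import text_format
from object_detection.protos import pipeline_pb2

pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
with tf.gfile.GFile('training/pipeline.config', 'r') as f:
    text_format.Merge(f.read(), pipeline_config)

export_inference_graph(
    input_type='image_tensor',
    pipeline_config=pipeline_config,
    trained_checkpoint_prefix='training/model.ckpt-20000',
    output_directory='exported_model')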
def generate_pipeline_config(self):
    configs = config_util.get_configs_from_pipeline_file(
        self.base_pipeline_config)
    tf_hparams = tf.contrib.training.HParams(**self.hparams)
    config_util.merge_external_params_with_configs(configs, tf_hparams)
    pipeline_config = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config, self.prefix)
    return os.path.join(self.prefix, 'pipeline.config')
def export_inference_graph(input_type, pipeline_config, trained_checkpoint_dir,
                           output_directory):
    """Exports inference graph for the model specified in the pipeline config.

    This function creates `output_directory` if it does not already exist,
    which will hold a copy of the pipeline config with filename
    `pipeline.config`, and two subdirectories named `checkpoint` and
    `saved_model` (containing the exported checkpoint and SavedModel
    respectively).

    Args:
      input_type: Type of input for the graph. Can be one of ['image_tensor',
        'encoded_image_string_tensor', 'tf_example'].
      pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
      trained_checkpoint_dir: Path to the trained checkpoint file.
      output_directory: Path to write outputs.

    Raises:
      ValueError: if input_type is invalid.
    """
    output_checkpoint_directory = os.path.join(output_directory, 'checkpoint')
    output_saved_model_directory = os.path.join(output_directory, 'saved_model')

    detection_model = model_builder.build(pipeline_config.model,
                                          is_training=False)

    ckpt = tf.train.Checkpoint(model=detection_model)
    manager = tf.train.CheckpointManager(
        ckpt, trained_checkpoint_dir, max_to_keep=1)
    status = ckpt.restore(manager.latest_checkpoint).expect_partial()

    module_dict = {
        'image_tensor': DetectionFromImageModule,
        'encoded_image_string_tensor': DetectionFromEncodedImageModule,
        'tf_example': DetectionFromTFExampleModule
    }
    if input_type not in module_dict:
        raise ValueError('Unrecognized `input_type`')
    detection_module = module_dict[input_type](detection_model)
    # Getting the concrete function traces the graph and forces variables to
    # be constructed --- only after this can we save the checkpoint and
    # saved model.
    concrete_function = detection_module.__call__.get_concrete_function()
    status.assert_existing_objects_matched()

    exported_checkpoint_manager = tf.train.CheckpointManager(
        ckpt, output_checkpoint_directory, max_to_keep=1)
    exported_checkpoint_manager.save(checkpoint_number=0)

    tf.saved_model.save(detection_module,
                        output_saved_model_directory,
                        signatures=concrete_function)
    config_util.save_pipeline_config(pipeline_config, output_directory)
def override_pipeline(pipeline, override_dict, num_classes=0):
    configs = config_util.get_configs_from_pipeline_file(pipeline)
    meta_arch = configs["model"].WhichOneof("model")
    override_dict['model.{}.num_classes'.format(meta_arch)] = num_classes
    configs = config_util.merge_external_params_with_configs(
        configs, kwargs_dict=override_dict)
    pipeline_config = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config, os.environ['RESULT_DIR'])
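A hedged sketch of calling this helper with a hypothetical override dict; the keys follow `config_util`'s documented override names, while the paths and values are invented purely for illustration.

# Hedged usage sketch; keys are config_util override names, values are placeholders.
overrides = {
    'train_steps': 20000,
    'batch_size': 8,
    'label_map_path': 'data/label_map.pbtxt',
    'train_input_path': 'data/train.record',
    'eval_input_path': 'data/eval.record',
}
override_pipeline('pretrained/pipeline.config', overrides, num_classes=3)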
def write_configuration(self, configuration_pipeline: Dict[str, str]) -> None:
    try:
        pipeline_config = create_pipeline_proto_from_configs(
            configuration_pipeline)
    except Exception as e:
        raise ConfigurationBodyCorrupter(additional_message=e.__str__())
    try:
        save_pipeline_config(pipeline_config, self.path.model_dir)
    except Exception:
        raise ModelTrainingPathNotFound(
            training_model_path=self.path.model_dir)
def update_pipeline_config(tfrecords_dir):
    org_pipeline_config_file = os.path.join(FLAGS.base_model_dir,
                                            'pipeline.config')
    logger.info('original pipeline.config {}'.format(org_pipeline_config_file))
    cfg = config_util.get_configs_from_pipeline_file(org_pipeline_config_file)

    # update num_of_classes
    model_name = os.path.basename(
        os.path.normpath(FLAGS.base_model_dir)).lower()
    if model_name.startswith("ssd"):
        model_cfg = cfg['model'].ssd
        logger.info('found a ssd base model')
    elif model_name.startswith("faster_rcnn"):
        model_cfg = cfg['model'].faster_rcnn
        logger.info('found a faster_rcnn base model')
    else:
        raise ValueError(
            'unknown base model {}, we can only handle ssd or faster_rcnn'.format(
                model_name))

    pascal_label_map_file = os.path.join(tfrecords_dir,
                                         'pascal_label_map.pbtxt')
    label_map_dict = label_map_util.get_label_map_dict(pascal_label_map_file)
    num_classes = len(label_map_dict)
    logger.info('num_of_classes from {} to {}'.format(model_cfg.num_classes,
                                                      num_classes))
    model_cfg.num_classes = num_classes

    # update base_model_dir
    train_cfg = cfg['train_config']
    train_cfg.fine_tune_checkpoint = os.path.join(FLAGS.base_model_dir,
                                                  'model.ckpt')
    logger.info('fine_tune_checkpoint: {}'.format(
        train_cfg.fine_tune_checkpoint))

    # update num_train_steps, label_map_path, train_tfrecords, val_tfrecords
    hparams = tf.contrib.training.HParams(
        train_steps=FLAGS.num_steps,
        label_map_path=pascal_label_map_file,
        train_input_path=os.path.join(tfrecords_dir, 'train.record'),
        eval_input_path=os.path.join(tfrecords_dir, 'val.record'))
    cfg = config_util.merge_external_params_with_configs(cfg, hparams)

    updated_pipeline_config = config_util.create_pipeline_proto_from_configs(cfg)
    updated_pipeline_config_file = os.path.join(tfrecords_dir,
                                                'pipeline.config')
    config_util.save_pipeline_config(updated_pipeline_config, tfrecords_dir)
    logger.info('updated pipeline.config {}'.format(tfrecords_dir))
    return updated_pipeline_config, updated_pipeline_config_file
def test_save_pipeline_config(self):
    """Tests that the pipeline config is properly saved to disk."""
    pipeline_config = pipeline_pb2.TrainEvalPipelineConfig()
    pipeline_config.model.faster_rcnn.num_classes = 10
    pipeline_config.train_config.batch_size = 32
    pipeline_config.train_input_reader.label_map_path = "path/to/label_map"
    pipeline_config.eval_config.num_examples = 20
    pipeline_config.eval_input_reader.queue_capacity = 100

    config_util.save_pipeline_config(pipeline_config, self.get_temp_dir())
    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(self.get_temp_dir(), "pipeline.config"))
    pipeline_config_reconstructed = (
        config_util.create_pipeline_proto_from_configs(configs))

    self.assertEqual(pipeline_config, pipeline_config_reconstructed)
def create_model(pipeline_config_path, output_directory, checkpoint_path):
    tf.keras.backend.clear_session()
    print('Building model and restoring weights for fine-tuning...', flush=True)
    num_classes = 1
    output_checkpoint_dir = os.path.join(output_directory, 'checkpoint')

    configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
    model_config = configs['model']
    model_config.ssd.num_classes = num_classes
    model_config.ssd.freeze_batchnorm = True
    detection_model = model_builder.build(model_config=model_config,
                                          is_training=True)

    pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_proto, output_directory)

    latest_checkpoint_number = int(checkpoint_path.split('-')[-1])
    print(latest_checkpoint_number)
    if latest_checkpoint_number == 0:
        fake_box_predictor = tf.compat.v2.train.Checkpoint(
            _base_tower_layers_for_heads=detection_model._box_predictor
            ._base_tower_layers_for_heads,
            # _prediction_heads=detection_model._box_predictor._prediction_heads,
            # (i.e., the classification head that we *will not* restore)
            _box_prediction_head=detection_model._box_predictor
            ._box_prediction_head,
        )
        fake_model = tf.compat.v2.train.Checkpoint(
            _feature_extractor=detection_model._feature_extractor,
            _box_predictor=fake_box_predictor)
        ckpt = tf.compat.v2.train.Checkpoint(model=fake_model)
        ckpt.restore(checkpoint_path).expect_partial()

    exported_ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
    ckpt_manager = tf.train.CheckpointManager(exported_ckpt,
                                              output_checkpoint_dir,
                                              max_to_keep=1)
    if latest_checkpoint_number > 0:
        status = exported_ckpt.restore(ckpt_manager.latest_checkpoint)

    image, shapes = detection_model.preprocess(tf.zeros([1, 320, 320, 3]))
    prediction_dict = detection_model.predict(image, shapes)
    _ = detection_model.postprocess(prediction_dict, shapes)
    print('Weights restored!')
    return detection_model, pipeline_proto, ckpt_manager
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_prefix,
                           output_directory,
                           input_shape=None,
                           output_collection_name='inference_op',
                           additional_output_tensor_names=None,
                           write_inference_graph=False):
    """Exports inference graph for the model specified in the pipeline config.

    Args:
      input_type: Type of input for the graph. Can be one of ['image_tensor',
        'encoded_image_string_tensor', 'tf_example'].
      pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
      trained_checkpoint_prefix: Path to the trained checkpoint file.
      output_directory: Path to write outputs.
      input_shape: Sets a fixed shape for an `image_tensor` input. If not
        specified, will default to [None, None, None, 3].
      output_collection_name: Name of collection to add output tensors to. If
        None, does not add output tensors to a collection.
      additional_output_tensor_names: list of additional output tensors to
        include in the frozen graph.
      write_inference_graph: If true, writes inference graph to disk.
    """
    detection_model = model_builder.build(pipeline_config.model,
                                          is_training=False)
    graph_rewriter_fn = None
    if pipeline_config.HasField('graph_rewriter'):
        graph_rewriter_config = pipeline_config.graph_rewriter
        graph_rewriter_fn = graph_rewriter_builder.build(graph_rewriter_config,
                                                         is_training=False)
    _export_inference_graph(
        input_type,
        detection_model,
        pipeline_config.eval_config.use_moving_averages,
        trained_checkpoint_prefix,
        output_directory,
        additional_output_tensor_names,
        input_shape,
        output_collection_name,
        graph_hook_fn=graph_rewriter_fn,
        write_inference_graph=write_inference_graph)
    pipeline_config.eval_config.use_moving_averages = False
    config_util.save_pipeline_config(pipeline_config, output_directory)
def get_configuration_content(self, network_info: NetworkInformation) -> str:
    try:
        network_path: str = os.path.join(self.path.weights_dir,
                                         network_info.network_architecture,
                                         "pipeline.config")
        config_file_content: Dict[str, str] = get_configs_from_pipeline_file(
            network_path)
        checkpoint_path = os.path.join(self.path.weights_dir,
                                       network_info.network_architecture,
                                       'checkpoint/ckpt-0')
        content: Dict[str, str] = self._adjust_configuration_content(
            config_file_content=config_file_content,
            network_path=checkpoint_path)
        # Returning the proto dict directly causes an error, so we save the
        # config to a file and read it back with a plain Python reader.
        pipeline_config = create_pipeline_proto_from_configs(content)
        save_pipeline_config(pipeline_config, "/tmp/")
        content_str: str = open("/tmp/pipeline.config", "r").read()
        return content_str
    except Exception as e:
        raise ConfigurationPipelineNotFound(additional_message=e.__str__(),
                                            pipeline_path=network_path)
def override_pipeline_configs(config_file, overrides, out_dir=""):
    configs = config_util.get_configs_from_pipeline_file(config_file)
    configs['train_config'].from_detection_checkpoint = True
    configs['eval_config'].num_examples = 25000

    for field, value in overrides.items():
        if field == "num_classes":
            set_number_of_classes(configs['model'], value)
        elif field == "width_height":
            set_resizer_width_height(configs['model'], value[0], value[1])
        elif not config_util._maybe_update_config_with_key_value(
                configs, field, value):
            try:
                config_util._update_generic(configs, field, value)
            except ValueError as ex:
                if field == "train_config.fine_tune_checkpoint":
                    configs['train_config'].fine_tune_checkpoint = value
                else:
                    raise

    config_util.save_pipeline_config(
        config_util.create_pipeline_proto_from_configs(configs), out_dir)
def patch_pipeline_config(self, model_base_name):
    self.label_map_path = os.path.join(self.src_train_path, "label_map.pbtxt")
    model_base_dir_path = os.path.join(self.path_perm_storage,
                                       "model_base_checkpoints",
                                       model_base_name)
    config_path = os.path.join(model_base_dir_path, "pipeline.config")
    cf_dict = config_util.get_configs_from_pipeline_file(config_path)

    cf_dict["model"].ssd.num_classes = self.num_classes
    cf_dict["train_config"].fine_tune_checkpoint = os.path.join(
        model_base_dir_path, "ckpt-0")
    cf_dict["train_config"].batch_size = self.batch_size
    cf_dict["train_config"].use_bfloat16 = False
    cf_dict["train_input_config"].label_map_path = self.label_map_path
    cf_dict["train_input_config"].tf_record_input_reader.input_path[:] = (
        self.scan_dir_for_records(DatasetType.training.name))
    cf_dict["eval_input_config"].label_map_path = self.label_map_path
    cf_dict["eval_input_config"].tf_record_input_reader.input_path[:] = (
        self.scan_dir_for_records(DatasetType.evaluation.name))

    cf_obj = config_util.create_pipeline_proto_from_configs(cf_dict)
    tmp_config_path = os.path.join(self.path_perm_storage, "patched_config")
    config_util.save_pipeline_config(cf_obj, tmp_config_path)
    self.patched_config_path = os.path.join(tmp_config_path, "pipeline.config")
    print("Source configuration was patched: {0}".format(
        self.patched_config_path))
def train_loop( pipeline_config_path, model_dir, val_checkpoint_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, checkpoint_every_n=1000, checkpoint_max_to_keep=7, record_summaries=True, performance_summary_exporter=None, **kwargs): """Trains a model using eager + functions. This method: 1. Processes the pipeline configs 2. (Optionally) saves the as-run config 3. Builds the model & optimizer 4. Gets the training input data 5. Loads a fine-tuning detection or classification checkpoint if requested 6. Loops over the train data, executing distributed training steps inside tf.functions. 7. Checkpoints the model every `checkpoint_every_n` training steps. 8. Logs the training metrics as TensorBoard summaries. Args: pipeline_config_path: A path to a pipeline config file. model_dir: The directory to save checkpoints and summaries to. val_checkpoint_dir: The directory to save validation checkpoint. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `pipeline_config_path`. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. use_tpu: Boolean, whether training and evaluation should run on TPU. save_final_config: Whether to save final config (obtained after applying overrides) to `model_dir`. checkpoint_every_n: Checkpoint every n training steps. checkpoint_max_to_keep: int, the number of most recent checkpoints to keep in the model directory. record_summaries: Boolean, whether or not to record summaries. performance_summary_exporter: function for exporting performance metrics. **kwargs: Additional keyword arguments for configuration override. """ print('START train looop function ========================') ## Parse the configs get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] steps_per_sec_list = [] configs = get_configs_from_pipeline_file( pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) configs = merge_external_params_with_configs( configs, None, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors add_regularization_loss = train_config.add_regularization_loss clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps if kwargs['use_bfloat16']: tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16') if train_config.load_all_detection_checkpoint_vars: raise ValueError('train_pb2.load_all_detection_checkpoint_vars ' 'unsupported in TF2') config_util.update_fine_tune_checkpoint_type(train_config) fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version # Write the as-run pipeline config to disk. 
if save_final_config: tf.logging.info('Saving pipeline config file to directory {}'.format( model_dir)) pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) # Build the model, optimizer, and training input strategy = tf.compat.v2.distribute.get_strategy() with strategy.scope(): detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base']( model_config=model_config, is_training=True) def train_dataset_fn(input_context): """Callable to create train input.""" # Create the inputs. train_input = inputs.train_input( train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model, input_context=input_context) train_input = train_input.repeat() return train_input train_input = strategy.experimental_distribute_datasets_from_function( train_dataset_fn) global_step = tf.Variable( 0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step', aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA) optimizer, (learning_rate,) = optimizer_builder.build( train_config.optimizer, global_step=global_step) # We run the detection_model on dummy inputs in order to ensure that the # model and all its variables have been properly constructed. Specifically, # this is currently necessary prior to (potentially) creating shadow copies # of the model variables for the EMA optimizer. if train_config.optimizer.use_moving_average: _ensure_model_is_built(detection_model, train_input, unpad_groundtruth_tensors) optimizer.shadow_copy(detection_model) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate ## Train the model # Get the appropriate filepath (temporary or not) based on whether the worker # is the chief. summary_writer_filepath = get_filepath(strategy, os.path.join(model_dir, 'train')) if record_summaries: summary_writer = tf.compat.v2.summary.create_file_writer( summary_writer_filepath) else: summary_writer = tf2.summary.create_noop_writer() if use_tpu: num_steps_per_iteration = 100 else: # TODO(b/135933080) Explore setting to 100 when GPU performance issues # are fixed. num_steps_per_iteration = 1 with summary_writer.as_default(): with strategy.scope(): with tf.compat.v2.summary.record_if( lambda: global_step % num_steps_per_iteration == 0): # Load a fine-tuning checkpoint. 
if train_config.fine_tune_checkpoint: load_fine_tune_checkpoint( detection_model, train_config.fine_tune_checkpoint, fine_tune_checkpoint_type, fine_tune_checkpoint_version, train_config.run_fine_tune_checkpoint_dummy_computation, train_input, unpad_groundtruth_tensors) ckpt = tf.compat.v2.train.Checkpoint( step=global_step, model=detection_model, optimizer=optimizer) val_ckpt = tf.compat.v2.train.Checkpoint( step=global_step, model=detection_model, optimizer=optimizer) manager_dir = get_filepath(strategy, model_dir) val_manager_dir = get_filepath(strategy, val_checkpoint_dir) # if not strategy.extended.should_checkpoint: # checkpoint_max_to_keep = 1 checkpoint_max_to_keep = 1 manager = tf.compat.v2.train.CheckpointManager( ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep) val_manager = tf.compat.v2.train.CheckpointManager( val_ckpt, val_manager_dir, max_to_keep=checkpoint_max_to_keep) model_checkpoint_callback = tfc.ModelCheckpoint(val_manager) early_stopping_callback = tfc.EarlyStopping(min_delta=0.0001, patience=5, mode='min') train_logger_callback = tfc.TrainLogger(model_dir, 'logs.txt') cancellation_point = tfc.CancellationPoint() # We use the following instead of manager.latest_checkpoint because # manager_dir does not point to the model directory when we are running # in a worker. latest_checkpoint = tf.train.latest_checkpoint(model_dir) ckpt.restore(latest_checkpoint) val_ckpt.restore(latest_checkpoint) def train_step_fn(features, labels): """Single train step.""" loss = eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, global_step=global_step, num_replicas=strategy.num_replicas_in_sync) global_step.assign_add(1) return loss def _sample_and_train(strategy, train_step_fn, data_iterator): features, labels = data_iterator.next() if hasattr(tf.distribute.Strategy, 'run'): per_replica_losses = strategy.run( train_step_fn, args=(features, labels)) else: per_replica_losses = strategy.experimental_run_v2( train_step_fn, args=(features, labels)) # TODO(anjalisridhar): explore if it is safe to remove the ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" if num_steps_per_iteration > 1: for _ in tf.range(num_steps_per_iteration - 1): # Following suggestion on yaqs/5402607292645376 with tf.name_scope(''): _sample_and_train(strategy, train_step_fn, data_iterator) return _sample_and_train(strategy, train_step_fn, data_iterator) train_input_iter = iter(train_input) if int(global_step.value()) == 0: manager.save() checkpointed_step = int(global_step.value()) logged_step = global_step.value() # num_epochs = (train_steps - global_step.value()) // num_steps_per_iteration last_step_time = time.time() for epoch, _ in enumerate(range(global_step.value(), train_steps, num_steps_per_iteration)): loss = _dist_train_step(train_input_iter) time_taken = time.time() - last_step_time last_step_time = time.time() steps_per_sec = num_steps_per_iteration * 1.0 / time_taken tf.compat.v2.summary.scalar( 'steps_per_sec', steps_per_sec, step=global_step) steps_per_sec_list.append(steps_per_sec) if global_step.value() - logged_step >= 100: tf.logging.info( 'Step {} per-step time {:.3f}s loss={:.3f}'.format( global_step.value(), time_taken / num_steps_per_iteration, 
loss)) manager.save() checkpointed_step = int(global_step.value()) log_metrics = eval_continuously(pipeline_config_path, model_dir=model_dir, checkpoint_dir=model_dir, timeout=20) log_metrics['train_total_loss'] = loss model_checkpoint_callback.step(epoch, log_metrics['Loss/total_loss']) stop_training = early_stopping_callback.step(epoch, log_metrics['Loss/total_loss']) train_logger_callback.log(log_metrics) if stop_training or cancellation_point.check(): break print(log_metrics) logged_step = global_step.value() # Remove the checkpoint directories of the non-chief workers that # MultiWorkerMirroredStrategy forces us to save during sync distributed # training. clean_temporary_directories(strategy, manager_dir) clean_temporary_directories(strategy, summary_writer_filepath) # TODO(pkanwar): add accuracy metrics. if performance_summary_exporter is not None: metrics = { 'steps_per_sec': np.mean(steps_per_sec_list), 'steps_per_sec_p50': np.median(steps_per_sec_list), 'steps_per_sec_max': max(steps_per_sec_list), 'last_batch_loss': float(loss) } mixed_precision = 'bf16' if kwargs['use_bfloat16'] else 'fp32' performance_summary_exporter(metrics, mixed_precision)
def export_inference_graph(input_type,
                           pipeline_config,
                           trained_checkpoint_dir,
                           output_directory,
                           use_side_inputs=False,
                           side_input_shapes='',
                           side_input_types='',
                           side_input_names=''):
    """Exports inference graph for the model specified in the pipeline config.

    This function creates `output_directory` if it does not already exist,
    which will hold a copy of the pipeline config with filename
    `pipeline.config`, and two subdirectories named `checkpoint` and
    `saved_model` (containing the exported checkpoint and SavedModel
    respectively).

    Args:
      input_type: Type of input for the graph. Can be one of ['image_tensor',
        'encoded_image_string_tensor', 'tf_example'].
      pipeline_config: pipeline_pb2.TrainAndEvalPipelineConfig proto.
      trained_checkpoint_dir: Path to the trained checkpoint file.
      output_directory: Path to write outputs.
      use_side_inputs: boolean that determines whether side inputs should be
        included in the input signature.
      side_input_shapes: forward-slash-separated list of comma-separated lists
        describing input shapes.
      side_input_types: comma-separated list of the types of the inputs.
      side_input_names: comma-separated list of the names of the inputs.

    Raises:
      ValueError: if input_type is invalid.
    """
    output_checkpoint_directory = os.path.join(output_directory, 'checkpoint')
    output_saved_model_directory = os.path.join(output_directory, 'saved_model')

    detection_model = INPUT_BUILDER_UTIL_MAP['model_build'](
        pipeline_config.model, is_training=False)

    ckpt = tf.train.Checkpoint(model=detection_model)
    manager = tf.train.CheckpointManager(
        ckpt, trained_checkpoint_dir, max_to_keep=1)
    status = ckpt.restore(manager.latest_checkpoint).expect_partial()

    if input_type not in DETECTION_MODULE_MAP:
        raise ValueError('Unrecognized `input_type`')
    if use_side_inputs and input_type != 'image_tensor':
        raise ValueError('Side inputs supported for image_tensor input type only.')

    zipped_side_inputs = []
    if use_side_inputs:
        zipped_side_inputs = _combine_side_inputs(side_input_shapes,
                                                  side_input_types,
                                                  side_input_names)

    detection_module = DETECTION_MODULE_MAP[input_type](detection_model,
                                                        use_side_inputs,
                                                        list(zipped_side_inputs))
    # Getting the concrete function traces the graph and forces variables to
    # be constructed --- only after this can we save the checkpoint and
    # saved model.
    concrete_function = detection_module.__call__.get_concrete_function()
    status.assert_existing_objects_matched()

    exported_checkpoint_manager = tf.train.CheckpointManager(
        ckpt, output_checkpoint_directory, max_to_keep=1)
    exported_checkpoint_manager.save(checkpoint_number=0)

    tf.saved_model.save(detection_module,
                        output_saved_model_directory,
                        signatures=concrete_function)
    config_util.save_pipeline_config(pipeline_config, output_directory)
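A hedged sketch of consuming the SavedModel written by this TF2 exporter; the export path is a placeholder and the random image simply stands in for real input data.

# Hedged usage sketch; 'exported_model/saved_model' is a hypothetical path.
import numpy as np
import tensorflow as tf

detect_fn = tf.saved_model.load('exported_model/saved_model')
image = np.random.randint(0, 255, size=(1, 320, 320, 3), dtype=np.uint8)
detections = detect_fn(tf.constant(image))
print(detections['detection_boxes'].shape,
      detections['detection_scores'].shape)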
def create_estimator_and_inputs(run_config, hparams, pipeline_config_path, train_steps=None, sample_1_of_n_eval_examples=1, sample_1_of_n_eval_on_train_examples=1, model_fn_creator=create_model_fn, use_tpu_estimator=False, use_tpu=False, num_shards=1, params=None, override_eval_num_epochs=True, **kwargs): """Creates `Estimator`, input functions, and steps. Args: run_config: A `RunConfig`. hparams: A `HParams`. pipeline_config_path: A path to a pipeline config file. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. sample_1_of_n_eval_examples: Integer representing how often an eval example should be sampled. If 1, will sample all examples. sample_1_of_n_eval_on_train_examples: Similar to `sample_1_of_n_eval_examples`, except controls the sampling of training data for evaluation. model_fn_creator: A function that creates a `model_fn` for `Estimator`. Follows the signature: * Args: * `detection_model_fn`: Function that returns `DetectionModel` instance. * `configs`: Dictionary of pipeline config objects. * `hparams`: `HParams` object. * Returns: `model_fn` for `Estimator`. use_tpu_estimator: Whether a `TPUEstimator` should be returned. If False, an `Estimator` will be returned. use_tpu: Boolean, whether training and evaluation should run on TPU. Only used if `use_tpu_estimator` is True. num_shards: Number of shards (TPU cores). Only used if `use_tpu_estimator` is True. params: Parameter dictionary passed from the estimator. Only used if `use_tpu_estimator` is True. override_eval_num_epochs: Whether to overwrite the number of epochs to 1 for eval_input. **kwargs: Additional keyword arguments for configuration override. Returns: A dictionary with the following fields: 'estimator': An `Estimator` or `TPUEstimator`. 'train_input_fn': A training input function. 'eval_input_fns': A list of all evaluation input functions. 'eval_input_names': A list of names for each evaluation input. 'eval_on_train_input_fn': An evaluation-on-train input function. 'predict_input_fn': A prediction input function. 'train_steps': Number of training steps. Either directly from input or from configuration. 
""" get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] create_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn'] create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn'] create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn'] configs = get_configs_from_pipeline_file(pipeline_config_path) kwargs.update({ 'train_steps': train_steps, 'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples }) if override_eval_num_epochs: kwargs.update({'eval_num_epochs': 1}) tf.logging.warning( 'Forced number of epochs for all eval validations to be 1.') configs = merge_external_params_with_configs( configs, hparams, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] eval_config = configs['eval_config'] eval_input_configs = configs['eval_input_configs'] eval_on_train_input_config = copy.deepcopy(train_input_config) eval_on_train_input_config.sample_1_of_n_examples = ( sample_1_of_n_eval_on_train_examples) if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1: tf.logging.warning('Expected number of evaluation epochs is 1, but ' 'instead encountered `eval_on_train_input_config' '.num_epochs` = ' '{}. Overwriting `num_epochs` to 1.'.format( eval_on_train_input_config.num_epochs)) eval_on_train_input_config.num_epochs = 1 # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps detection_model_fn = functools.partial( model_builder.build, model_config=model_config) # Create the input functions for TRAIN/EVAL/PREDICT. train_input_fn = create_train_input_fn( train_config=train_config, train_input_config=train_input_config, model_config=model_config) eval_input_fns = [ create_eval_input_fn( eval_config=eval_config, eval_input_config=eval_input_config, model_config=model_config) for eval_input_config in eval_input_configs ] eval_input_names = [ eval_input_config.name for eval_input_config in eval_input_configs ] eval_on_train_input_fn = create_eval_input_fn( eval_config=eval_config, eval_input_config=eval_on_train_input_config, model_config=model_config) predict_input_fn = create_predict_input_fn( model_config=model_config, predict_input_config=eval_input_configs[0]) export_to_tpu = hparams.get('export_to_tpu', False) tf.logging.info('create_estimator_and_inputs: use_tpu %s, export_to_tpu %s', use_tpu, export_to_tpu) model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu) if use_tpu_estimator: estimator = tf.contrib.tpu.TPUEstimator( model_fn=model_fn, train_batch_size=train_config.batch_size, # For each core, only batch size 1 is supported for eval. eval_batch_size=num_shards * 1 if use_tpu else 1, use_tpu=use_tpu, config=run_config, # TODO(lzc): Remove conditional after CMLE moves to TF 1.9 params=params if params else {}) else: estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) # Write the as-run pipeline config to disk. 
if run_config.is_chief:
    pipeline_config_final = create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config_final,
                                     estimator.model_dir)

return dict(
    estimator=estimator,
    train_input_fn=train_input_fn,
    eval_input_fns=eval_input_fns,
    eval_input_names=eval_input_names,
    eval_on_train_input_fn=eval_on_train_input_fn,
    predict_input_fn=predict_input_fn,
    train_steps=train_steps)
def eval_continuously( pipeline_config_path, config_override=None, train_steps=None, sample_1_of_n_eval_examples=1, sample_1_of_n_eval_on_train_examples=1, use_tpu=False, override_eval_num_epochs=True, postprocess_on_cpu=False, model_dir=None, checkpoint_dir=None, wait_interval=180, timeout=3600, eval_index=0, save_final_config=False, **kwargs): """Run continuous evaluation of a detection model eagerly. This method builds the model, and continously restores it from the most recent training checkpoint in the checkpoint directory & evaluates it on the evaluation data. Args: pipeline_config_path: A path to a pipeline config file. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `pipeline_config_path`. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. sample_1_of_n_eval_examples: Integer representing how often an eval example should be sampled. If 1, will sample all examples. sample_1_of_n_eval_on_train_examples: Similar to `sample_1_of_n_eval_examples`, except controls the sampling of training data for evaluation. use_tpu: Boolean, whether training and evaluation should run on TPU. override_eval_num_epochs: Whether to overwrite the number of epochs to 1 for eval_input. postprocess_on_cpu: When use_tpu and postprocess_on_cpu are true, postprocess is scheduled on the host cpu. model_dir: Directory to output resulting evaluation summaries to. checkpoint_dir: Directory that contains the training checkpoints. wait_interval: The mimmum number of seconds to wait before checking for a new checkpoint. timeout: The maximum number of seconds to wait for a checkpoint. Execution will terminate if no new checkpoints are found after these many seconds. eval_index: int, If given, only evaluate the dataset at the given index. By default, evaluates dataset at 0'th index. save_final_config: Whether to save the pipeline config file to the model directory. **kwargs: Additional keyword arguments for configuration override. 
""" get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] configs = get_configs_from_pipeline_file( pipeline_config_path, config_override=config_override) kwargs.update({ 'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) if train_steps is not None: kwargs['train_steps'] = train_steps if override_eval_num_epochs: kwargs.update({'eval_num_epochs': 1}) tf.logging.warning( 'Forced number of epochs for all eval validations to be 1.') configs = merge_external_params_with_configs( configs, None, kwargs_dict=kwargs) if model_dir and save_final_config: tf.logging.info('Saving pipeline config file to directory {}'.format( model_dir)) pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) model_config = configs['model'] train_input_config = configs['train_input_config'] eval_config = configs['eval_config'] eval_input_configs = configs['eval_input_configs'] eval_on_train_input_config = copy.deepcopy(train_input_config) eval_on_train_input_config.sample_1_of_n_examples = ( sample_1_of_n_eval_on_train_examples) if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1: tf.logging.warning('Expected number of evaluation epochs is 1, but ' 'instead encountered `eval_on_train_input_config' '.num_epochs` = ' '{}. Overwriting `num_epochs` to 1.'.format( eval_on_train_input_config.num_epochs)) eval_on_train_input_config.num_epochs = 1 if kwargs['use_bfloat16']: tf.compat.v2.keras.mixed_precision.experimental.set_policy('mixed_bfloat16') eval_input_config = eval_input_configs[eval_index] strategy = tf.compat.v2.distribute.get_strategy() with strategy.scope(): detection_model = MODEL_BUILD_UTIL_MAP['detection_model_fn_base']( model_config=model_config, is_training=True) eval_input = strategy.experimental_distribute_dataset( inputs.eval_input( eval_config=eval_config, eval_input_config=eval_input_config, model_config=model_config, model=detection_model)) global_step = tf.compat.v2.Variable( 0, trainable=False, dtype=tf.compat.v2.dtypes.int64) optimizer, _ = optimizer_builder.build( configs['train_config'].optimizer, global_step=global_step) for latest_checkpoint in tf.train.checkpoints_iterator( checkpoint_dir, timeout=timeout, min_interval_secs=wait_interval): ckpt = tf.compat.v2.train.Checkpoint( step=global_step, model=detection_model, optimizer=optimizer) # We run the detection_model on dummy inputs in order to ensure that the # model and all its variables have been properly constructed. Specifically, # this is currently necessary prior to (potentially) creating shadow copies # of the model variables for the EMA optimizer. 
if eval_config.use_moving_averages: unpad_groundtruth_tensors = (eval_config.batch_size == 1 and not use_tpu) _ensure_model_is_built(detection_model, eval_input, unpad_groundtruth_tensors) optimizer.shadow_copy(detection_model) ckpt.restore(latest_checkpoint).expect_partial() if eval_config.use_moving_averages: optimizer.swap_weights() summary_writer = tf.compat.v2.summary.create_file_writer( os.path.join(model_dir, 'eval', eval_input_config.name)) with summary_writer.as_default(): eval_metrics = eager_eval_loop( detection_model, configs, eval_input, use_tpu=use_tpu, postprocess_on_cpu=postprocess_on_cpu, global_step=global_step, ) return eval_metrics
def train_loop(hparams, pipeline_config_path, model_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, export_to_tpu=None, checkpoint_every_n=1000, checkpoint_max_to_keep=7, **kwargs): """Trains a model using eager + functions. This method: 1. Processes the pipeline configs 2. (Optionally) saves the as-run config 3. Builds the model & optimizer 4. Gets the training input data 5. Loads a fine-tuning detection or classification checkpoint if requested 6. Loops over the train data, executing distributed training steps inside tf.functions. 7. Checkpoints the model every `checkpoint_every_n` training steps. 8. Logs the training metrics as TensorBoard summaries. Args: hparams: A `HParams`. pipeline_config_path: A path to a pipeline config file. model_dir: The directory to save checkpoints and summaries to. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `pipeline_config_path`. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. use_tpu: Boolean, whether training and evaluation should run on TPU. save_final_config: Whether to save final config (obtained after applying overrides) to `model_dir`. export_to_tpu: When use_tpu and export_to_tpu are true, `export_savedmodel()` exports a metagraph for serving on TPU besides the one on CPU. If export_to_tpu is not provided, we will look for it in hparams too. checkpoint_every_n: Checkpoint every n training steps. checkpoint_max_to_keep: int, the number of most recent checkpoints to keep in the model directory. **kwargs: Additional keyword arguments for configuration override. """ ## Parse the configs get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] configs = get_configs_from_pipeline_file(pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) configs = merge_external_params_with_configs(configs, hparams, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors add_regularization_loss = train_config.add_regularization_loss clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps # Read export_to_tpu from hparams if not passed. 
if export_to_tpu is None: export_to_tpu = hparams.get('export_to_tpu', False) tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu, export_to_tpu) if kwargs['use_bfloat16']: tf.compat.v2.keras.mixed_precision.experimental.set_policy( 'mixed_bfloat16') # Parse the checkpoint fine tuning configs if hparams.load_pretrained: fine_tune_checkpoint_path = train_config.fine_tune_checkpoint else: fine_tune_checkpoint_path = None load_all_detection_checkpoint_vars = ( train_config.load_all_detection_checkpoint_vars) # TODO(kaftan) (or anyone else): move this piece of config munging to ## utils/config_util.py if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type fine_tune_checkpoint_version = train_config.fine_tune_checkpoint_version # Write the as-run pipeline config to disk. if save_final_config: pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) # Build the model, optimizer, and training input strategy = tf.compat.v2.distribute.get_strategy() with strategy.scope(): detection_model = model_builder.build(model_config=model_config, is_training=True) def train_dataset_fn(input_context): """Callable to create train input.""" # Create the inputs. train_input = inputs.train_input( train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model, input_context=input_context) train_input = train_input.repeat() return train_input train_input = strategy.experimental_distribute_datasets_from_function( train_dataset_fn) global_step = tf.Variable( 0, trainable=False, dtype=tf.compat.v2.dtypes.int64, name='global_step', aggregation=tf.compat.v2.VariableAggregation.ONLY_FIRST_REPLICA) optimizer, (learning_rate, ) = optimizer_builder.build( train_config.optimizer, global_step=global_step) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate ## Train the model # Get the appropriate filepath (temporary or not) based on whether the worker # is the chief. summary_writer_filepath = _get_filepath(strategy, os.path.join(model_dir, 'train')) summary_writer = tf.compat.v2.summary.create_file_writer( summary_writer_filepath) if use_tpu: num_steps_per_iteration = 100 else: # TODO(b/135933080) Explore setting to 100 when GPU performance issues # are fixed. num_steps_per_iteration = 1 with summary_writer.as_default(): with strategy.scope(): with tf.compat.v2.summary.record_if( lambda: global_step % num_steps_per_iteration == 0): # Load a fine-tuning checkpoint. 
if fine_tune_checkpoint_path: load_fine_tune_checkpoint( detection_model, fine_tune_checkpoint_path, fine_tune_checkpoint_type, fine_tune_checkpoint_version, load_all_detection_checkpoint_vars, train_input, unpad_groundtruth_tensors) ckpt = tf.compat.v2.train.Checkpoint(step=global_step, model=detection_model, optimizer=optimizer) manager_dir = _get_filepath(strategy, model_dir) if not strategy.extended.should_checkpoint: checkpoint_max_to_keep = 1 manager = tf.compat.v2.train.CheckpointManager( ckpt, manager_dir, max_to_keep=checkpoint_max_to_keep) # We use the following instead of manager.latest_checkpoint because # manager_dir does not point to the model directory when we are running # in a worker. latest_checkpoint = tf.train.latest_checkpoint(model_dir) ckpt.restore(latest_checkpoint) def train_step_fn(features, labels): """Single train step.""" loss = eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, global_step=global_step, num_replicas=strategy.num_replicas_in_sync) global_step.assign_add(1) return loss def _sample_and_train(strategy, train_step_fn, data_iterator): features, labels = data_iterator.next() per_replica_losses = strategy.run(train_step_fn, args=(features, labels)) # TODO(anjalisridhar): explore if it is safe to remove the ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" if num_steps_per_iteration > 1: for _ in tf.range(num_steps_per_iteration - 1): _sample_and_train(strategy, train_step_fn, data_iterator) return _sample_and_train(strategy, train_step_fn, data_iterator) train_input_iter = iter(train_input) checkpointed_step = int(global_step.value()) logged_step = global_step.value() last_step_time = time.time() for _ in range(global_step.value(), train_steps, num_steps_per_iteration): loss = _dist_train_step(train_input_iter) time_taken = time.time() - last_step_time last_step_time = time.time() tf.compat.v2.summary.scalar('steps_per_sec', num_steps_per_iteration * 1.0 / time_taken, step=global_step) if global_step.value() - logged_step >= 100: tf.logging.info( 'Step {} per-step time {:.3f}s loss={:.3f}'.format( global_step.value(), time_taken / num_steps_per_iteration, loss)) logged_step = global_step.value() if ((int(global_step.value()) - checkpointed_step) >= checkpoint_every_n): manager.save() checkpointed_step = int(global_step.value()) # Remove the checkpoint directories of the non-chief workers that # MultiWorkerMirroredStrategy forces us to save during sync distributed # training. _clean_temporary_directories(strategy, manager_dir) _clean_temporary_directories(strategy, summary_writer_filepath)
def edit_config(model_selected, config_output_dir, num_steps, label_map_path,
                record_dir, eval_number, annotation_type, batch_size=None,
                learning_rate=None, resizer_size=None):
    '''
    Wrapper to edit the essential values inside the base configuration
    protobuf file provided with an object-detection/segmentation checkpoint.
    This configuration file entirely defines your model, pre-processing,
    training, evaluation, etc. Together with the checkpoint file it is the
    most important file of a model and should never be deleted. It is
    therefore saved in almost every directory where a step was performed,
    both for redundancy and to be sure the right config file is used at that
    point. Advanced users who want to dig deeper into the configuration file
    should read the proto definitions inside the proto directory of the
    object-detection API.

    Args:
        Required:
            model_selected: The checkpoint you want to resume from.
            config_output_dir: The path where you want to save your edited
                protobuf configuration file.
            num_steps: The number of steps you want to train on.
            label_map_path: The path to your label_map.pbtxt file.
            record_dir: The path to the directory where your TFRecord files
                are saved.
            eval_number: The number of images you want to evaluate on.
            annotation_type: Should be either "rectangle" or "polygon",
                depending on how you annotated your images.

        Optional:
            batch_size: The batch size you want to use. If not provided it
                will use the previous one.
            learning_rate: The learning rate you want to use for the training.
                If not provided it will use the previous one. Please see
                config_utils.update_initial_learning_rate() inside the
                object_detection folder for in-depth details on what happens
                when updating it.
            resizer_size: The shape used to update your image resizer. Please
                see set_image_resizer() for more details on this. If not
                provided it will use the previous one.
    '''
    file_list = os.listdir(model_selected)
    ckpt_ids = []
    for p in file_list:
        if "index" in p:
            if "-" in p:
                ckpt_ids.append(int(p.split('-')[1].split('.')[0]))
    if len(ckpt_ids) > 0:
        ckpt_path = os.path.join(model_selected,
                                 "model.ckpt-{}".format(str(max(ckpt_ids))))
    else:
        ckpt_path = os.path.join(model_selected, "model.ckpt")

    configs = config_util.get_configs_from_pipeline_file(
        os.path.join(model_selected, 'pipeline.config'))
    label_map = label_map_util.load_labelmap(label_map_path)

    config_util._update_train_steps(configs, num_steps)
    update_different_paths(
        configs,
        ckpt_path=ckpt_path,
        label_map_path=label_map_path,
        train_record_path=os.path.join(record_dir, "train.record"),
        eval_record_path=os.path.join(record_dir, "eval.record"))

    if learning_rate is not None:
        config_util._update_initial_learning_rate(configs, learning_rate)
    if batch_size is not None:
        config_util._update_batch_size(configs, batch_size)
    if annotation_type == "polygon":
        edit_masks(configs, mask_type="PNG_MASKS")
    if resizer_size is not None:
        set_image_resizer(configs, resizer_size)

    edit_eval_config(configs, annotation_type, eval_number)
    update_num_classes(configs, label_map)
    config_proto = config_util.create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(config_proto, directory=config_output_dir)
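A hedged usage sketch for edit_config; every path and value below is a hypothetical placeholder chosen only to illustrate the expected argument types.

# Hedged usage sketch; all arguments are placeholders.
edit_config(
    model_selected='models/ssd_mobilenet_v2_checkpoint',
    config_output_dir='training/config',
    num_steps=50000,
    label_map_path='data/label_map.pbtxt',
    record_dir='data/records',
    eval_number=200,
    annotation_type='rectangle',
    batch_size=16,
    learning_rate=0.004)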
    args = parser.parse_args()
    ds_info = load_data_set_path_dict()[args.dataset]
    if ds_info['data_set_type'] != 'object_detection':
        raise AssertionError(
            'Dataset TypeError: Select a dataset for object detection')
    return args


if __name__ == '__main__':
    args = parse_args()
    # Defining the output_path that the pipeline config will be written to
    pipeline_out_path = os.path.join(args.exp_dir, 'pipeline.config')
    dataset_dir = os.path.join(os.environ['DCNN_DATASETS_PATH'], args.dataset)
    params_proto = config_odm_run(
        pipline_config_path=args.pipeline_config_path,
        dataset_path=dataset_dir,
        fine_tune_dir=args.fine_tune_dir)
    save_pipeline_config(params_proto, args.exp_dir)
    print('-' * 50)
    print('Beginning Training, logging to {}'.format(args.exp_dir))
    train_odm(model_dir=args.exp_dir,
              pipeline_config_path=pipeline_out_path,
              num_train_steps=args.num_train_steps,
              num_eval_steps=args.num_eval_steps,
              hparams=None)
    print('-' * 50)
def train_loop(hparams,
               pipeline_config_path,
               model_dir,
               config_override=None,
               train_steps=None,
               use_tpu=False,
               save_final_config=False,
               export_to_tpu=None,
               checkpoint_every_n=1000,
               **kwargs):
  """Trains a model using eager + functions.

  This method:
    1. Processes the pipeline configs
    2. (Optionally) saves the as-run config
    3. Builds the model & optimizer
    4. Gets the training input data
    5. Loads a fine-tuning detection or classification checkpoint if requested
    6. Loops over the train data, executing distributed training steps inside
       tf.functions.
    7. Checkpoints the model every `checkpoint_every_n` training steps.
    8. Logs the training metrics as TensorBoard summaries.

  Args:
    hparams: A `HParams`.
    pipeline_config_path: A path to a pipeline config file.
    model_dir: The directory to save checkpoints and summaries to.
    config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
      override the config from `pipeline_config_path`.
    train_steps: Number of training steps. If None, the number of training
      steps is set from the `TrainConfig` proto.
    use_tpu: Boolean, whether training and evaluation should run on TPU.
    save_final_config: Whether to save final config (obtained after applying
      overrides) to `model_dir`.
    export_to_tpu: When use_tpu and export_to_tpu are true,
      `export_savedmodel()` exports a metagraph for serving on TPU besides the
      one on CPU. If export_to_tpu is not provided, we will look for it in
      hparams too.
    checkpoint_every_n: Checkpoint every n training steps.
    **kwargs: Additional keyword arguments for configuration override.
  """
  ## Parse the configs
  get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[
      'get_configs_from_pipeline_file']
  merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[
      'merge_external_params_with_configs']
  create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[
      'create_pipeline_proto_from_configs']

  configs = get_configs_from_pipeline_file(pipeline_config_path,
                                           config_override=config_override)
  kwargs.update({
      'train_steps': train_steps,
      'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu
  })
  configs = merge_external_params_with_configs(configs,
                                               hparams,
                                               kwargs_dict=kwargs)
  model_config = configs['model']
  train_config = configs['train_config']
  train_input_config = configs['train_input_config']

  unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors
  add_regularization_loss = train_config.add_regularization_loss
  clip_gradients_value = None
  if train_config.gradient_clipping_by_norm > 0:
    clip_gradients_value = train_config.gradient_clipping_by_norm

  # update train_steps from config but only when non-zero value is provided
  if train_steps is None and train_config.num_steps != 0:
    train_steps = train_config.num_steps

  # Read export_to_tpu from hparams if not passed.
  if export_to_tpu is None:
    export_to_tpu = hparams.get('export_to_tpu', False)
  tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu,
                  export_to_tpu)

  if kwargs['use_bfloat16']:
    tf.compat.v2.keras.mixed_precision.experimental.set_policy(
        'mixed_bfloat16')

  # Parse the checkpoint fine tuning configs
  if hparams.load_pretrained:
    fine_tune_checkpoint_path = train_config.fine_tune_checkpoint
  else:
    fine_tune_checkpoint_path = None
  load_all_detection_checkpoint_vars = (
      train_config.load_all_detection_checkpoint_vars)

  # TODO(kaftan) (or anyone else): move this piece of config munging to
  ## utils/config_util.py
  if not train_config.fine_tune_checkpoint_type:
    # train_config.from_detection_checkpoint field is deprecated. For
    # backward compatibility, set train_config.fine_tune_checkpoint_type
    # based on train_config.from_detection_checkpoint.
    if train_config.from_detection_checkpoint:
      train_config.fine_tune_checkpoint_type = 'detection'
    else:
      train_config.fine_tune_checkpoint_type = 'classification'
  fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type

  # Write the as-run pipeline config to disk.
  if save_final_config:
    pipeline_config_final = create_pipeline_proto_from_configs(configs)
    config_util.save_pipeline_config(pipeline_config_final, model_dir)

  # Build the model, optimizer, and training input
  strategy = tf.compat.v2.distribute.get_strategy()
  with strategy.scope():
    detection_model = model_builder.build(model_config=model_config,
                                          is_training=True)

    # Create the inputs.
    train_input = inputs.train_input(train_config=train_config,
                                     train_input_config=train_input_config,
                                     model_config=model_config,
                                     model=detection_model)
    train_input = strategy.experimental_distribute_dataset(
        train_input.repeat())

    global_step = tf.compat.v2.Variable(0,
                                        trainable=False,
                                        dtype=tf.compat.v2.dtypes.int64,
                                        name='global_step')
    optimizer, (learning_rate, ) = optimizer_builder.build(
        train_config.optimizer, global_step=global_step)

    if callable(learning_rate):
      learning_rate_fn = learning_rate
    else:
      learning_rate_fn = lambda: learning_rate

  ## Train the model
  summary_writer = tf.compat.v2.summary.create_file_writer(model_dir +
                                                           '/train')
  with summary_writer.as_default():
    with strategy.scope():
      # Load a fine-tuning checkpoint.
      if fine_tune_checkpoint_path:
        load_fine_tune_checkpoint(detection_model, fine_tune_checkpoint_path,
                                  fine_tune_checkpoint_type,
                                  load_all_detection_checkpoint_vars,
                                  train_input, unpad_groundtruth_tensors)

      ckpt = tf.compat.v2.train.Checkpoint(step=global_step,
                                           model=detection_model)
      manager = tf.compat.v2.train.CheckpointManager(ckpt,
                                                     model_dir,
                                                     max_to_keep=7)
      ## Maybe re-enable checkpoint restoration depending on how it works:
      # ckpt.restore(manager.latest_checkpoint)

      def train_step_fn(features, labels):
        return eager_train_step(
            detection_model,
            features,
            labels,
            unpad_groundtruth_tensors,
            optimizer,
            learning_rate=learning_rate_fn(),
            add_regularization_loss=add_regularization_loss,
            clip_gradients_value=clip_gradients_value,
            use_tpu=use_tpu,
            global_step=global_step,
            num_replicas=strategy.num_replicas_in_sync)

      @tf.function
      def _dist_train_step(data_iterator):
        """A distributed train step."""
        features, labels = data_iterator.next()
        per_replica_losses = strategy.experimental_run_v2(train_step_fn,
                                                          args=(
                                                              features,
                                                              labels,
                                                          ))
        # TODO(anjalisridhar): explore if it is safe to remove the
        ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                    per_replica_losses,
                                    axis=None)
        return mean_loss

      train_input_iter = iter(train_input)
      for _ in range(train_steps):
        start_time = time.time()

        loss = _dist_train_step(train_input_iter)
        global_step.assign_add(1)
        end_time = time.time()
        if not use_tpu:
          tf.compat.v2.summary.scalar('steps_per_sec',
                                      1.0 / (end_time - start_time),
                                      step=global_step)
        # TODO(kaftan): Remove this print after it is no longer helpful for
        ## debugging.
        print('Finished step', global_step, end_time, loss)
        if int(global_step.value().numpy()) % checkpoint_every_n == 0:
          manager.save()
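# A minimal usage sketch for the train_loop() above (not from the original
# source). It assumes the object_detection.model_hparams helper is available
# to build the `hparams` object the loop expects; paths are placeholders.
from object_detection import model_hparams

hparams = model_hparams.create_hparams(None)  # defaults include load_pretrained=True
train_loop(hparams=hparams,
           pipeline_config_path='configs/my_pipeline.config',
           model_dir='training/run0',
           train_steps=None,        # fall back to TrainConfig.num_steps
           save_final_config=True,  # write the as-run pipeline.config to model_dir
           checkpoint_every_n=1000)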
def train_loop(config_path: str,
               model_dir: str,
               config_override: Optional[
                   pipeline_pb2.TrainEvalPipelineConfig] = None,
               train_steps: Optional[int] = None,
               use_tpu: bool = False,
               save_final_config: bool = False,
               log_every_n: int = 100,
               ckpt_every_n: int = 1000,
               ckpt_max_to_keep: int = 7,
               record_summaries: bool = True,
               **kwargs) -> None:
    """Trains a model using eager + functions.

    This method:
      1. Processes the pipeline configs
      2. (Optionally) saves the as-run config
      3. Builds the model & optimizer
      4. Gets the training input data
      5. Loads a fine-tuning detection or classification checkpoint if
         requested
      6. Loops over the train data, executing distributed training steps
         inside tf.functions.
      7. Checkpoints the model every `ckpt_every_n` training steps.
      8. Logs the training metrics as TensorBoard summaries.

    Args:
        config_path: A path to a pipeline config file.
        model_dir: The directory to save checkpoints and summaries to.
        config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to
            override the config from `config_path`.
        train_steps: Number of training steps. If None, training steps from
            `TrainConfig` proto will be adopted.
        use_tpu: Boolean, whether training and evaluation should run on TPU.
        save_final_config: Whether to save final config (obtained after
            applying overrides) to `model_dir`.
        log_every_n: Log total loss every n training steps.
        ckpt_every_n: Checkpoint every n training steps.
        ckpt_max_to_keep: int, the number of most recent checkpoints to keep
            in the model directory.
        record_summaries: Boolean, whether or not to record summaries.
        **kwargs: Additional keyword arguments for configuration override.
    """
    # parse config
    configs = config_util.get_configs_from_pipeline_file(
        config_path, config_override=config_override)
    kwargs.update({
        'train_steps': train_steps,
        'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu,
    })
    configs = config_util.merge_external_params_with_configs(
        configs, None, kwargs_dict=kwargs)
    model_config = configs['model']
    train_config = configs['train_config']
    train_input_config = configs['train_input_config']

    unpad_gt_tensors = train_config.unpad_groundtruth_tensors
    add_regularization_loss = train_config.add_regularization_loss

    clip_gradient_norm = None
    if train_config.gradient_clipping_by_norm > 0:
        clip_gradient_norm = train_config.gradient_clipping_by_norm

    if kwargs['use_bfloat16']:
        tf.keras.mixed_precision.experimental.set_policy('mixed_bfloat16')

    if train_config.load_all_detection_checkpoint_vars:
        raise ValueError(
            'train_pb2.load_all_detection_checkpoint_vars unsupported in TF2')

    # base checkpoint to fine-tune from
    config_util.update_fine_tune_checkpoint_type(train_config)
    base_ckpt = train_config.fine_tune_checkpoint
    base_ckpt_type = train_config.fine_tune_checkpoint_type
    base_ckpt_ver = train_config.fine_tune_checkpoint_version

    # write the as-run pipeline config to disk
    if save_final_config:
        pipeline_config_final = config_util.create_pipeline_proto_from_configs(
            configs)
        config_util.save_pipeline_config(pipeline_config_final, model_dir)

    # build model, input, optimizer
    strategy = tf.distribute.get_strategy()
    with strategy.scope():
        # build model
        model = model_builder.build(model_config=model_config,
                                    is_training=True)

        # build input
        def train_dataset_fn(
                input_context: tf.distribute.InputContext) -> tf.data.Dataset:
            """Callable to create train input."""
            train_input = inputs.train_input(
                train_config=train_config,
                train_input_config=train_input_config,
                model_config=model_config,
                model=model,
                input_context=input_context,
            )
            train_input = train_input.repeat()
            return train_input

        train_input = strategy.experimental_distribute_datasets_from_function(
            train_dataset_fn)

        # build optimizer
        global_step = tf.Variable(
            0,
            trainable=False,
            dtype=tf.int64,
            name='global_step',
            aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
        )
        optimizer, (learning_rate, ) = optimizer_builder.build(
            train_config.optimizer, global_step=global_step)

        if callable(learning_rate):
            learning_rate_fn = learning_rate
        else:
            learning_rate_fn = lambda: learning_rate

    # prepare for training

    # get appropriate filepath (temporary or not) based on whether the worker
    # is the chief
    summary_log_path = get_filepath(strategy, os.path.join(model_dir, 'train'))

    if record_summaries:
        summary_writer = tf.summary.create_file_writer(summary_log_path)
    else:
        summary_writer = tf.summary.create_noop_writer()

    if use_tpu:
        num_steps_per_iteration = 100
    else:
        num_steps_per_iteration = 1

    with summary_writer.as_default():
        with strategy.scope():
            with tf.summary.record_if(
                    lambda: global_step % num_steps_per_iteration == 0):
                # prepare checkpoint manager
                # (do not use manager.latest_checkpoint as manager_dir is not
                # model_dir while running in worker)
                ckpt = tf.train.Checkpoint(model=model,
                                           step=global_step,
                                           optimizer=optimizer)
                ckpt_max_to_keep = (ckpt_max_to_keep
                                    if strategy.extended.should_checkpoint
                                    else 1)
                manager_dir = get_filepath(strategy, model_dir)
                manager = tf.train.CheckpointManager(
                    ckpt, manager_dir, max_to_keep=ckpt_max_to_keep)
                latest_ckpt = tf.train.latest_checkpoint(model_dir)

                if latest_ckpt:
                    # load latest checkpoint being trained
                    ckpt.restore(latest_ckpt).expect_partial()
                elif base_ckpt:
                    # load a pre-trained checkpoint
                    load_base_ckpt(model, base_ckpt, base_ckpt_type,
                                   base_ckpt_ver, train_input,
                                   unpad_gt_tensors)

                # get trainable variables
                train_vars = get_train_vars(model, train_config)

                # define training step
                def train_step_fn(features: Dict, labels: Dict):
                    """Single train step."""
                    loss = eager_train_step(
                        model,
                        train_vars,
                        features,
                        labels,
                        unpad_gt_tensors,
                        optimizer,
                        learning_rate=learning_rate_fn(),
                        add_regularization_loss=add_regularization_loss,
                        clip_gradient_norm=clip_gradient_norm,
                        global_step=global_step,
                        num_replicas=strategy.num_replicas_in_sync,
                    )
                    global_step.assign_add(1)
                    return loss

                def _sample_and_train(strategy, train_step_fn, data_iterator):
                    features, labels = data_iterator.next()
                    per_replica_losses = strategy.run(train_step_fn,
                                                      args=(features, labels))
                    return strategy.reduce(tf.distribute.ReduceOp.SUM,
                                           per_replica_losses,
                                           axis=None)

                @tf.function
                def _dist_train_step(data_iterator):
                    """A distributed train step."""
                    if num_steps_per_iteration > 1:
                        for _ in tf.range(num_steps_per_iteration - 1):
                            with tf.name_scope(''):
                                _sample_and_train(strategy, train_step_fn,
                                                  data_iterator)
                    return _sample_and_train(strategy, train_step_fn,
                                             data_iterator)

                train_input_iter = iter(train_input)

                # save initialized version of checkpoint
                if int(global_step.value()) == 0:
                    manager.save()

                ckpt_step = int(global_step.value())
                logged_step = global_step.value()

                # proceed with training
                last_step_time = time.time()
                for _ in range(global_step.value(), train_config.num_steps,
                               num_steps_per_iteration):
                    # execute a step (forward pass + backward pass)
                    loss = _dist_train_step(train_input_iter)

                    # log time
                    curr_step = global_step.value()
                    time_taken = time.time() - last_step_time
                    last_step_time = time.time()
                    tf.summary.scalar(
                        'steps_per_sec',
                        num_steps_per_iteration * 1.0 / time_taken,
                        step=global_step,
                    )

                    # log loss
                    if curr_step - logged_step >= log_every_n:
                        step_time = time_taken / num_steps_per_iteration
                        step_msg = 'Step {} per-step time {:.3f}s loss={:.3f}'.format(
                            curr_step, step_time, loss)
                        v1.logging.info(step_msg)
                        logged_step = curr_step

                    # save checkpoint regularly
                    if (curr_step - ckpt_step) >= ckpt_every_n:
                        manager.save()
                        ckpt_step = curr_step

    # remove checkpoint directories of non-chief workers that
    # MultiWorkerMirroredStrategy forces us to save during sync distributed
    # training.
    clean_temporary_directories(strategy, manager_dir)
    clean_temporary_directories(strategy, summary_log_path)
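# Hedged usage sketch for the TF2-style train_loop() above (not from the
# original source): it picks up the ambient distribution strategy via
# tf.distribute.get_strategy(), so the call is usually wrapped in a strategy
# scope. The paths and step counts are placeholders.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    train_loop(config_path='configs/centernet_pipeline.config',
               model_dir='training/run0',
               train_steps=25000,
               save_final_config=True,
               log_every_n=100,
               ckpt_every_n=1000)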
def set_config(config_path: Union[str, Path],
               checkpoint_path: Union[str, Path],
               tf_records_train_path: Union[str, Path],
               label_map: Dict[str, int],
               label_map_filepath: Union[str, Path],
               batch_size: int,
               max_box_predictions: int,
               max_number_of_boxes: int,
               fine_tune_checkpoint_type: str = 'detection',
               augment_path: str = None,
               min_dimension: int = None,
               max_dimension: int = None,
               total_steps: int = None,
               warmup_steps: int = None,
               num_steps: int = None):
    logger.info(f"Set configs {config_path}...")
    configs = get_configs_from_pipeline_file(str(config_path))

    train_len = count_tfrecord_examples(str(tf_records_train_path))
    logger.info(f"Train has {train_len} tf_records.")

    num_classes = len(set(label_map.values()))
    _, config_model = configs['model'].ListFields()[0]
    config_model.num_classes = num_classes
    configs['model'].center_net.object_center_params.max_box_predictions = (
        max_box_predictions)
    if min_dimension is not None:
        configs['model'].center_net.image_resizer.keep_aspect_ratio_resizer.min_dimension = min_dimension
    if max_dimension is not None:
        configs['model'].center_net.image_resizer.keep_aspect_ratio_resizer.max_dimension = max_dimension

    configs['train_config'].fine_tune_checkpoint_type = fine_tune_checkpoint_type
    configs['train_config'].fine_tune_checkpoint = str(checkpoint_path)
    configs['train_config'].batch_size = batch_size
    configs['train_config'].max_number_of_boxes = max_number_of_boxes
    if total_steps is not None:
        configs['train_config'].optimizer.adam_optimizer.learning_rate.cosine_decay_learning_rate.total_steps = total_steps
    if warmup_steps is not None:
        configs['train_config'].optimizer.adam_optimizer.learning_rate.cosine_decay_learning_rate.warmup_steps = warmup_steps
    if num_steps is not None:
        configs['train_config'].num_steps = num_steps

    def clear_repeated_proto(proto):
        # Popping while iterating skips elements; drain the repeated field
        # until it is empty instead.
        while len(proto):
            proto.pop()

    if augment_path is not None:
        augment_config = configs['train_config'].data_augmentation_options
        clear_repeated_proto(augment_config)
        # text_format.Merge expects the proto text itself, so read the file at
        # augment_path before parsing it.
        augment = text_format.Merge(Path(augment_path).read_text(),
                                    pipeline_pb2.TrainEvalPipelineConfig())
        augment_config.extend(augment.train_config.data_augmentation_options)

    label_map_to_file(label_map=label_map, filepath=label_map_filepath)
    configs['train_input_config'].label_map_path = str(label_map_filepath)
    clear_repeated_proto(
        configs['train_input_config'].tf_record_input_reader.input_path)
    configs['train_input_config'].tf_record_input_reader.input_path.append(
        str(tf_records_train_path))

    pipeline_proto = create_pipeline_proto_from_configs(configs)
    save_pipeline_config(pipeline_proto, str(Path(config_path).parent))
    logger.info(f"Config {config_path} changed")
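# Hedged example of calling set_config() above (not from the original source);
# the paths and the label map are made up for illustration, and config_path
# must point at a CenterNet pipeline since the function edits center_net
# fields directly.
from pathlib import Path

set_config(config_path=Path('configs/centernet_hg104_512x512.config'),
           checkpoint_path=Path('pretrained/centernet_hg104/checkpoint/ckpt-0'),
           tf_records_train_path=Path('data/train.record'),
           label_map={'person': 1, 'car': 2},
           label_map_filepath=Path('data/label_map.pbtxt'),
           batch_size=8,
           max_box_predictions=100,
           max_number_of_boxes=100,
           num_steps=20000)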
def train_loop(hparams, pipeline_config_path, model_dir, config_override=None, train_steps=None, use_tpu=False, save_final_config=False, export_to_tpu=None, checkpoint_every_n=1000, **kwargs): ## Parse the configs get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] configs = get_configs_from_pipeline_file(pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) configs = merge_external_params_with_configs(configs, hparams, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] unpad_groundtruth_tensors = train_config.unpad_groundtruth_tensors use_bfloat16 = train_config.use_bfloat16 add_regularization_loss = train_config.add_regularization_loss clip_gradients_value = None if train_config.gradient_clipping_by_norm > 0: clip_gradients_value = train_config.gradient_clipping_by_norm # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps # Read export_to_tpu from hparams if not passed. if export_to_tpu is None: export_to_tpu = hparams.get('export_to_tpu', False) tf.logging.info('train_loop: use_tpu %s, export_to_tpu %s', use_tpu, export_to_tpu) # Parse the checkpoint fine tuning configs if hparams.load_pretrained: fine_tune_checkpoint_path = train_config.fine_tune_checkpoint else: fine_tune_checkpoint_path = None load_all_detection_checkpoint_vars = ( train_config.load_all_detection_checkpoint_vars) # TODO(kaftan) (or anyone else): move this piece of config munging to ## utils/config_util.py if not train_config.fine_tune_checkpoint_type: # train_config.from_detection_checkpoint field is deprecated. For # backward compatibility, set train_config.fine_tune_checkpoint_type # based on train_config.from_detection_checkpoint. if train_config.from_detection_checkpoint: train_config.fine_tune_checkpoint_type = 'detection' else: train_config.fine_tune_checkpoint_type = 'classification' fine_tune_checkpoint_type = train_config.fine_tune_checkpoint_type # Write the as-run pipeline config to disk. if save_final_config: pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, model_dir) # TODO(kaftan): Either make strategy a parameter of this method, or ## grab it w/ Distribution strategy's get_scope # Build the model, optimizer, and training input strategy = tf.compat.v2.distribute.MirroredStrategy() with strategy.scope(): detection_model = model_builder.build(model_config=model_config, is_training=True) # Create the inputs. 
train_input = inputs.train_input(train_config=train_config, train_input_config=train_input_config, model_config=model_config, model=detection_model) train_input = strategy.experimental_distribute_dataset( train_input.repeat()) global_step = tf.compat.v2.Variable(0, trainable=False, dtype=tf.compat.v2.dtypes.int64) optimizer, (learning_rate, ) = optimizer_builder.build( train_config.optimizer, global_step=global_step) if callable(learning_rate): learning_rate_fn = learning_rate else: learning_rate_fn = lambda: learning_rate ## Train the model summary_writer = tf.compat.v2.summary.create_file_writer(model_dir + '/train') with summary_writer.as_default(): with strategy.scope(): # Load a fine-tuning checkpoint. if fine_tune_checkpoint_path: load_fine_tune_checkpoint( detection_model, fine_tune_checkpoint_path, fine_tune_checkpoint_type, load_all_detection_checkpoint_vars, train_input, unpad_groundtruth_tensors, use_tpu, use_bfloat16) ckpt = tf.compat.v2.train.Checkpoint(step=global_step, model=detection_model) manager = tf.compat.v2.train.CheckpointManager(ckpt, model_dir, max_to_keep=7) ## Maybe re-enable checkpoint restoration depending on how it works: # ckpt.restore(manager.latest_checkpoint) def train_step_fn(features, labels): return eager_train_step( detection_model, features, labels, unpad_groundtruth_tensors, optimizer, learning_rate=learning_rate_fn(), use_bfloat16=use_bfloat16, add_regularization_loss=add_regularization_loss, clip_gradients_value=clip_gradients_value, use_tpu=use_tpu, global_step=global_step, num_replicas=strategy.num_replicas_in_sync) @tf.function def _dist_train_step(data_iterator): """A distributed train step.""" features, labels = data_iterator.next() per_replica_losses = strategy.experimental_run_v2( train_step_fn, args=( features, labels, )) # TODO(anjalisridhar): explore if it is safe to remove the ## num_replicas scaling of the loss and switch this to a ReduceOp.Mean mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None) return mean_loss train_input_iter = iter(train_input) for _ in range(train_steps): start_time = time.time() loss = _dist_train_step(train_input_iter) global_step.assign_add(1) end_time = time.time() tf.compat.v2.summary.scalar('steps_per_sec', 1.0 / (end_time - start_time), step=global_step) # TODO(kaftan): Remove this print after it is no longer helpful for ## debugging. tf.print('Finished step', global_step, end_time, loss) if int(global_step.value().numpy()) % checkpoint_every_n == 0: manager.save()
def save_pipeline(pipeline_dict, out_folder):
    pipeline_proto = create_pipeline_proto_from_configs(pipeline_dict)
    save_pipeline_config(pipeline_proto, out_folder)
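# Hedged sketch of how save_pipeline() above is typically fed (not from the
# original source): the configs dictionary comes from
# get_configs_from_pipeline_file(), is edited in place, and is then
# re-serialized into <out_folder>/pipeline.config.
configs = get_configs_from_pipeline_file('configs/base_pipeline.config')
configs['train_config'].batch_size = 16
save_pipeline(configs, 'training/run0')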
def create_estimator_and_inputs(run_config, hparams, pipeline_config_path, train_steps=None, eval_steps=None, model_fn_creator=create_model_fn, use_tpu_estimator=False, use_tpu=False, num_shards=1, params=None, **kwargs): """Creates `Estimator`, input functions, and steps. Args: run_config: A `RunConfig`. hparams: A `HParams`. pipeline_config_path: A path to a pipeline config file. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. eval_steps: Number of evaluation steps per evaluation cycle. If None, the number of evaluation steps is set from the `EvalConfig` proto. model_fn_creator: A function that creates a `model_fn` for `Estimator`. Follows the signature: * Args: * `detection_model_fn`: Function that returns `DetectionModel` instance. * `configs`: Dictionary of pipeline config objects. * `hparams`: `HParams` object. * Returns: `model_fn` for `Estimator`. use_tpu_estimator: Whether a `TPUEstimator` should be returned. If False, an `Estimator` will be returned. use_tpu: Boolean, whether training and evaluation should run on TPU. Only used if `use_tpu_estimator` is True. num_shards: Number of shards (TPU cores). Only used if `use_tpu_estimator` is True. params: Parameter dictionary passed from the estimator. Only used if `use_tpu_estimator` is True. **kwargs: Additional keyword arguments for configuration override. Returns: A dictionary with the following fields: 'estimator': An `Estimator` or `TPUEstimator`. 'train_input_fn': A training input function. 'eval_input_fn': An evaluation input function. 'eval_on_train_input_fn': An evaluation-on-train input function. 'predict_input_fn': A prediction input function. 'train_steps': Number of training steps. Either directly from input or from configuration. 'eval_steps': Number of evaluation steps. Either directly from input or from configuration. """ get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] create_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn'] create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn'] create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn'] configs = get_configs_from_pipeline_file(pipeline_config_path) configs = merge_external_params_with_configs( configs, hparams, train_steps=train_steps, eval_steps=eval_steps, **kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] eval_config = configs['eval_config'] eval_input_config = configs['eval_input_config'] if train_steps is None: train_steps = configs['train_config'].num_steps if eval_steps is None: eval_steps = configs['eval_config'].num_examples detection_model_fn = functools.partial( model_builder.build, model_config=model_config) # Create the input functions for TRAIN/EVAL/PREDICT. 
train_input_fn = create_train_input_fn( train_config=train_config, train_input_config=train_input_config, model_config=model_config) eval_input_fn = create_eval_input_fn( eval_config=eval_config, eval_input_config=eval_input_config, model_config=model_config) eval_on_train_input_fn = create_eval_input_fn( eval_config=eval_config, eval_input_config=train_input_config, model_config=model_config) predict_input_fn = create_predict_input_fn(model_config=model_config) model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu) if use_tpu_estimator: estimator = tf.contrib.tpu.TPUEstimator( model_fn=model_fn, train_batch_size=train_config.batch_size, # For each core, only batch size 1 is supported for eval. eval_batch_size=num_shards * 1 if use_tpu else 1, use_tpu=use_tpu, config=run_config, params=params if params else {}) else: estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) # Write the as-run pipeline config to disk. if run_config.is_chief: pipeline_config_final = create_pipeline_proto_from_configs( configs) config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir) return dict( estimator=estimator, train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, eval_on_train_input_fn=eval_on_train_input_fn, predict_input_fn=predict_input_fn, train_steps=train_steps, eval_steps=eval_steps)
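# Hedged sketch (not from the original source) of consuming the dictionary
# returned by create_estimator_and_inputs() above with the plain Estimator
# API; the RunConfig and HParams construction here is illustrative only.
run_config = tf.estimator.RunConfig(model_dir='training/run0')
train_and_eval_dict = create_estimator_and_inputs(
    run_config=run_config,
    hparams=model_hparams.create_hparams(None),
    pipeline_config_path='configs/my_pipeline.config')
estimator = train_and_eval_dict['estimator']
estimator.train(input_fn=train_and_eval_dict['train_input_fn'],
                max_steps=train_and_eval_dict['train_steps'])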
print('Building model and restoring weights for fine-tuning...', flush=True)
pipeline_config = 'models/research/object_detection/configs/tf2/ssd_mobilenet_v2_fpnlite_320x320_coco17_tpu-8.config'
checkpoint_path = 'models/research/object_detection/test_data/checkpoint/ckpt-0'
output_directory = 'output/'
output_checkpoint_dir = os.path.join(output_directory, 'checkpoint')

configs = config_util.get_configs_from_pipeline_file(pipeline_config)
model_config = configs['model']
model_config.ssd.num_classes = num_classes
model_config.ssd.freeze_batchnorm = True
detection_model = model_builder.build(
    model_config=model_config, is_training=True)

# Save new pipeline config
pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
config_util.save_pipeline_config(pipeline_proto, output_directory)

fake_box_predictor = tf.compat.v2.train.Checkpoint(
    _base_tower_layers_for_heads=detection_model._box_predictor._base_tower_layers_for_heads,
    # _prediction_heads=detection_model._box_predictor._prediction_heads,
    # (i.e., the classification head that we *will not* restore)
    _box_prediction_head=detection_model._box_predictor._box_prediction_head,
)
fake_model = tf.compat.v2.train.Checkpoint(
    _feature_extractor=detection_model._feature_extractor,
    _box_predictor=fake_box_predictor)
ckpt = tf.compat.v2.train.Checkpoint(model=fake_model)
ckpt.restore(checkpoint_path).expect_partial()

# To save checkpoint for TFLite conversion.
exported_ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt_manager = tf.train.CheckpointManager(
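# The snippet above breaks off mid-call. A plausible completion, using only
# names already defined there (exported_ckpt, output_checkpoint_dir), would
# finish the CheckpointManager and save an initial checkpoint for the later
# TFLite export; max_to_keep=1 is an assumption, not the original value.
ckpt_manager = tf.train.CheckpointManager(
    exported_ckpt, directory=output_checkpoint_dir, max_to_keep=1)
ckpt_manager.save()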
def create_estimator_and_inputs(run_config, hparams, pipeline_config_path, train_steps=None, eval_steps=None, model_fn_creator=create_model_fn, use_tpu_estimator=False, use_tpu=False, num_shards=1, params=None, **kwargs): """Creates `Estimator`, input functions, and steps. Args: run_config: A `RunConfig`. hparams: A `HParams`. pipeline_config_path: A path to a pipeline config file. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. eval_steps: Number of evaluation steps per evaluation cycle. If None, the number of evaluation steps is set from the `EvalConfig` proto. model_fn_creator: A function that creates a `model_fn` for `Estimator`. Follows the signature: * Args: * `detection_model_fn`: Function that returns `DetectionModel` instance. * `configs`: Dictionary of pipeline config objects. * `hparams`: `HParams` object. * Returns: `model_fn` for `Estimator`. use_tpu_estimator: Whether a `TPUEstimator` should be returned. If False, an `Estimator` will be returned. use_tpu: Boolean, whether training and evaluation should run on TPU. Only used if `use_tpu_estimator` is True. num_shards: Number of shards (TPU cores). Only used if `use_tpu_estimator` is True. params: Parameter dictionary passed from the estimator. Only used if `use_tpu_estimator` is True. **kwargs: Additional keyword arguments for configuration override. Returns: A dictionary with the following fields: 'estimator': An `Estimator` or `TPUEstimator`. 'train_input_fn': A training input function. 'eval_input_fn': An evaluation input function. 'eval_on_train_input_fn': An evaluation-on-train input function. 'predict_input_fn': A prediction input function. 'train_steps': Number of training steps. Either directly from input or from configuration. 'eval_steps': Number of evaluation steps. Either directly from input or from configuration. """ get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] create_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn'] create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn'] create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn'] configs = get_configs_from_pipeline_file(pipeline_config_path) configs = merge_external_params_with_configs( configs, hparams, train_steps=train_steps, eval_steps=eval_steps, retain_original_images_in_eval=False if use_tpu else True, **kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] eval_config = configs['eval_config'] eval_input_config = configs['eval_input_config'] # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps # update eval_steps from config but only when non-zero value is provided if eval_steps is None and eval_config.num_examples != 0: eval_steps = eval_config.num_examples detection_model_fn = functools.partial(model_builder.build, model_config=model_config) # Create the input functions for TRAIN/EVAL/PREDICT. 
train_input_fn = create_train_input_fn( train_config=train_config, train_input_config=train_input_config, model_config=model_config) eval_input_fn = create_eval_input_fn(eval_config=eval_config, eval_input_config=eval_input_config, model_config=model_config) eval_on_train_input_fn = create_eval_input_fn( eval_config=eval_config, eval_input_config=train_input_config, model_config=model_config) predict_input_fn = create_predict_input_fn( model_config=model_config, predict_input_config=eval_input_config) tf.logging.info('create_estimator_and_inputs: use_tpu %s', use_tpu) model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu) if use_tpu_estimator: estimator = tf.contrib.tpu.TPUEstimator( model_fn=model_fn, train_batch_size=train_config.batch_size, # For each core, only batch size 1 is supported for eval. eval_batch_size=num_shards * 1 if use_tpu else 1, use_tpu=use_tpu, config=run_config, # TODO(lzc): Remove conditional after CMLE moves to TF 1.9 params=params if params else {}) else: estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) # Write the as-run pipeline config to disk. if run_config.is_chief: pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir) return dict(estimator=estimator, train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, eval_on_train_input_fn=eval_on_train_input_fn, predict_input_fn=predict_input_fn, train_steps=train_steps, eval_steps=eval_steps)
def create_estimator_and_inputs(run_config, hparams=None, pipeline_config_path=None, config_override=None, train_steps=None, sample_1_of_n_eval_examples=1, sample_1_of_n_eval_on_train_examples=1, model_fn_creator=create_model_fn, use_tpu_estimator=False, use_tpu=False, num_shards=1, params=None, override_eval_num_epochs=True, save_final_config=False, postprocess_on_cpu=False, export_to_tpu=None, **kwargs): """Creates `Estimator`, input functions, and steps. Args: run_config: A `RunConfig`. hparams: (optional) A `HParams`. pipeline_config_path: A path to a pipeline config file. config_override: A pipeline_pb2.TrainEvalPipelineConfig text proto to override the config from `pipeline_config_path`. train_steps: Number of training steps. If None, the number of training steps is set from the `TrainConfig` proto. sample_1_of_n_eval_examples: Integer representing how often an eval example should be sampled. If 1, will sample all examples. sample_1_of_n_eval_on_train_examples: Similar to `sample_1_of_n_eval_examples`, except controls the sampling of training data for evaluation. model_fn_creator: A function that creates a `model_fn` for `Estimator`. Follows the signature: * Args: * `detection_model_fn`: Function that returns `DetectionModel` instance. * `configs`: Dictionary of pipeline config objects. * `hparams`: `HParams` object. * Returns: `model_fn` for `Estimator`. use_tpu_estimator: Whether a `TPUEstimator` should be returned. If False, an `Estimator` will be returned. use_tpu: Boolean, whether training and evaluation should run on TPU. Only used if `use_tpu_estimator` is True. num_shards: Number of shards (TPU cores). Only used if `use_tpu_estimator` is True. params: Parameter dictionary passed from the estimator. Only used if `use_tpu_estimator` is True. override_eval_num_epochs: Whether to overwrite the number of epochs to 1 for eval_input. save_final_config: Whether to save final config (obtained after applying overrides) to `estimator.model_dir`. postprocess_on_cpu: When use_tpu and postprocess_on_cpu are true, postprocess is scheduled on the host cpu. export_to_tpu: When use_tpu and export_to_tpu are true, `export_savedmodel()` exports a metagraph for serving on TPU besides the one on CPU. **kwargs: Additional keyword arguments for configuration override. Returns: A dictionary with the following fields: 'estimator': An `Estimator` or `TPUEstimator`. 'train_input_fn': A training input function. 'eval_input_fns': A list of all evaluation input functions. 'eval_input_names': A list of names for each evaluation input. 'eval_on_train_input_fn': An evaluation-on-train input function. 'predict_input_fn': A prediction input function. 'train_steps': Number of training steps. Either directly from input or from configuration. 
""" get_configs_from_pipeline_file = MODEL_BUILD_UTIL_MAP[ 'get_configs_from_pipeline_file'] merge_external_params_with_configs = MODEL_BUILD_UTIL_MAP[ 'merge_external_params_with_configs'] create_pipeline_proto_from_configs = MODEL_BUILD_UTIL_MAP[ 'create_pipeline_proto_from_configs'] create_train_input_fn = MODEL_BUILD_UTIL_MAP['create_train_input_fn'] create_eval_input_fn = MODEL_BUILD_UTIL_MAP['create_eval_input_fn'] create_predict_input_fn = MODEL_BUILD_UTIL_MAP['create_predict_input_fn'] detection_model_fn_base = MODEL_BUILD_UTIL_MAP['detection_model_fn_base'] configs = get_configs_from_pipeline_file(pipeline_config_path, config_override=config_override) kwargs.update({ 'train_steps': train_steps, 'use_bfloat16': configs['train_config'].use_bfloat16 and use_tpu }) if sample_1_of_n_eval_examples >= 1: kwargs.update( {'sample_1_of_n_eval_examples': sample_1_of_n_eval_examples}) if override_eval_num_epochs: kwargs.update({'eval_num_epochs': 1}) tf.logging.warning( 'Forced number of epochs for all eval validations to be 1.') configs = merge_external_params_with_configs(configs, hparams, kwargs_dict=kwargs) model_config = configs['model'] train_config = configs['train_config'] train_input_config = configs['train_input_config'] eval_config = configs['eval_config'] eval_input_configs = configs['eval_input_configs'] eval_on_train_input_config = copy.deepcopy(train_input_config) eval_on_train_input_config.sample_1_of_n_examples = ( sample_1_of_n_eval_on_train_examples) if override_eval_num_epochs and eval_on_train_input_config.num_epochs != 1: tf.logging.warning('Expected number of evaluation epochs is 1, but ' 'instead encountered `eval_on_train_input_config' '.num_epochs` = ' '{}. Overwriting `num_epochs` to 1.'.format( eval_on_train_input_config.num_epochs)) eval_on_train_input_config.num_epochs = 1 # update train_steps from config but only when non-zero value is provided if train_steps is None and train_config.num_steps != 0: train_steps = train_config.num_steps detection_model_fn = functools.partial(detection_model_fn_base, model_config=model_config) # Create the input functions for TRAIN/EVAL/PREDICT. train_input_fn = create_train_input_fn( train_config=train_config, train_input_config=train_input_config, model_config=model_config) eval_input_fns = [ create_eval_input_fn(eval_config=eval_config, eval_input_config=eval_input_config, model_config=model_config) for eval_input_config in eval_input_configs ] eval_input_names = [ eval_input_config.name for eval_input_config in eval_input_configs ] eval_on_train_input_fn = create_eval_input_fn( eval_config=eval_config, eval_input_config=eval_on_train_input_config, model_config=model_config) predict_input_fn = create_predict_input_fn( model_config=model_config, predict_input_config=eval_input_configs[0]) # Read export_to_tpu from hparams if not passed. if export_to_tpu is None and hparams is not None: export_to_tpu = hparams.get('export_to_tpu', False) tf.logging.info( 'create_estimator_and_inputs: use_tpu %s, export_to_tpu %s', use_tpu, export_to_tpu) model_fn = model_fn_creator(detection_model_fn, configs, hparams, use_tpu, postprocess_on_cpu) if use_tpu_estimator: estimator = tf.estimator.tpu.TPUEstimator( model_fn=model_fn, train_batch_size=train_config.batch_size, # For each core, only batch size 1 is supported for eval. 
eval_batch_size=num_shards * 1 if use_tpu else 1, use_tpu=use_tpu, config=run_config, export_to_tpu=export_to_tpu, eval_on_tpu=False, # Eval runs on CPU, so disable eval on TPU params=params if params else {}) else: estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config) # Write the as-run pipeline config to disk. if run_config.is_chief and save_final_config: pipeline_config_final = create_pipeline_proto_from_configs(configs) config_util.save_pipeline_config(pipeline_config_final, estimator.model_dir) return dict(estimator=estimator, train_input_fn=train_input_fn, eval_input_fns=eval_input_fns, eval_input_names=eval_input_names, eval_on_train_input_fn=eval_on_train_input_fn, predict_input_fn=predict_input_fn, train_steps=train_steps)
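# Hedged sketch (not from the original source): in the TF1 binaries this
# dictionary is usually unpacked into tf.estimator train/eval specs via
# model_lib's create_train_and_eval_specs() helper; the flag values below are
# placeholders.
train_and_eval_dict = create_estimator_and_inputs(
    run_config=tf.estimator.RunConfig(model_dir='training/run0'),
    pipeline_config_path='configs/my_pipeline.config',
    save_final_config=True,
    sample_1_of_n_eval_examples=1)
train_spec, eval_specs = create_train_and_eval_specs(
    train_and_eval_dict['train_input_fn'],
    train_and_eval_dict['eval_input_fns'],
    train_and_eval_dict['eval_on_train_input_fn'],
    train_and_eval_dict['predict_input_fn'],
    train_and_eval_dict['train_steps'],
    eval_on_train_data=False)
tf.estimator.train_and_evaluate(train_and_eval_dict['estimator'], train_spec,
                                eval_specs[0])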
def update_pipeline_config(params, eval_type):
    cfg = config_util.get_configs_from_pipeline_file(
        os.path.join(params.config_mnt, params.config_dir))

    # update num_of_classes
    model_name = os.path.basename(
        os.path.normpath(os.path.join(params.config_mnt,
                                      params.config_dir))).lower()
    print("model name: ", model_name)
    if model_name.startswith("ssd"):
        model_cfg = cfg['model'].ssd
    elif model_name.startswith("faster_rcnn"):
        model_cfg = cfg['model'].faster_rcnn
    else:
        raise ValueError(
            'unknown base model {}, we can only handle ssd or faster_rcnn'.
            format(model_name))

    label_map = os.path.join(params.config_mnt, params.label_dir)
    label_map_dict = label_map_util.get_label_map_dict(label_map)
    num_classes = len(label_map_dict)
    model_cfg.num_classes = num_classes

    # update base_model_dir
    train_cfg = cfg['train_config']
    train_cfg.fine_tune_checkpoint = os.path.join(params.config_mnt,
                                                  params.transfer_learning_dir,
                                                  'model.ckpt')
    eval_cfg = cfg['eval_config']
    eval_cfg.max_evals = 1
    eval_cfg.num_examples = int(params.eval_num_examples)

    # update num_train_steps, label_map_path, train_tfrecords, val_tfrecords,
    # batch size
    print(
        os.path.join(os.path.sep, params.base_mnt, params.source_data_name,
                     'tf_records', 'train.record'))
    hparams = tf.contrib.training.HParams(
        batch_size=int(params.batch_size),
        train_steps=int(params.num_steps),
        label_map_path=label_map,
        train_input_path=os.path.join(os.path.sep, params.base_mnt,
                                      params.source_data_name, 'tf_records',
                                      'train.record'),
        eval_input_path=os.path.join(os.path.sep, params.base_mnt,
                                     params.source_data_name, 'tf_records',
                                     eval_type + '.record'),
    )
    cfg = config_util.merge_external_params_with_configs(cfg, hparams)

    # log metrics
    run_context = Run.get_context()
    run_context.log("Batch Size", int(params.batch_size))
    run_context.log("Training Steps", int(params.num_steps))
    # run.log("Maximum Evaluations", max_evals)

    updated_pipeline_config = config_util.create_pipeline_proto_from_configs(
        cfg)
    print("updated_pipeline_config: ", updated_pipeline_config)
    updated_pipeline_config_file = os.path.join(params.config_mnt,
                                                params.config_dir)
    print("updated_pipeline_config_file: ", updated_pipeline_config_file)
    print("dir name: ",
          os.path.dirname(os.path.join(params.config_mnt, params.config_dir)))
    config_util.save_pipeline_config(
        updated_pipeline_config,
        os.path.join(params.base_mnt, params.source_data_name, 'model_config'))
    return updated_pipeline_config, updated_pipeline_config_file
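# Hedged example of invoking update_pipeline_config() above (not from the
# original source). `params` only needs the attributes the function reads, so
# a SimpleNamespace stands in for whatever argument object the surrounding
# script actually builds; every path below is a placeholder.
from types import SimpleNamespace

params = SimpleNamespace(
    config_mnt='/mnt/config',
    config_dir='ssd_mobilenet_v2_coco.config',
    label_dir='label_map.pbtxt',
    transfer_learning_dir='pretrained/ssd_mobilenet_v2',
    eval_num_examples='500',
    batch_size='24',
    num_steps='20000',
    base_mnt='/mnt/data',
    source_data_name='my_dataset')
pipeline_proto, pipeline_file = update_pipeline_config(params, eval_type='val')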