def main(_):
  """Runs the experiment defined by FLAGS and gin bindings.

  Parses gin files/params and the experiment configuration, optionally
  serializes the config, sets the mixed-precision policy, builds the
  distribution strategy and task, then delegates to
  `train_lib.run_experiment`.
  """
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16.
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  # Task creation happens under the strategy scope so that any variables it
  # builds are placed correctly.
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
def main(_):
  """Parses the experiment configuration and launches continuous fine-tuning."""
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  experiment_params = train_utils.parse_configuration(FLAGS)
  output_dir = FLAGS.model_dir
  # Persist the resolved configuration so the run is reproducible.
  train_utils.serialize_config(experiment_params, output_dir)
  run_continuous_finetune(FLAGS.mode, experiment_params, output_dir,
                          FLAGS.pretrain_steps)
def test_recovery_nan_error(self, distribution_strategy, flag_mode):
  """Verifies that a NaN training loss surfaces as a RuntimeError."""
  model_dir = self.get_temp_dir()
  flags_dict = dict(
      experiment='mock',
      mode=flag_mode,
      model_dir=model_dir,
      params_override=json.dumps(self._test_config))
  with flagsaver.flagsaver(**flags_dict):
    params = train_utils.parse_configuration(flags.FLAGS)
    train_utils.serialize_config(params, model_dir)
    with distribution_strategy.scope():
      # task = task_factory.get_task(params.task, logging_dir=model_dir)
      task = mock_task.MockTask(params.task, logging_dir=model_dir)

      # Set the loss to NaN to trigger RunTimeError.
      def build_losses(labels, model_outputs, aux_losses=None):
        del labels, model_outputs
        return tf.constant([np.nan], tf.float32) + aux_losses

      task.build_losses = build_losses

    with self.assertRaises(RuntimeError):
      train_lib.run_experiment(
          distribution_strategy=distribution_strategy,
          task=task,
          mode=flag_mode,
          params=params,
          model_dir=model_dir)
def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval):
  """Runs the full experiment pipeline and checks its observable outputs.

  Asserts that post-eval logs are (non-)empty according to `run_post_eval`,
  that params.yaml is always written, and that a checkpoint exists for any
  mode that trains.
  """
  model_dir = self.get_temp_dir()
  flags_dict = dict(
      experiment='mock',
      mode=flag_mode,
      model_dir=model_dir,
      params_override=json.dumps(self._test_config))
  with flagsaver.flagsaver(**flags_dict):
    params = train_utils.parse_configuration(flags.FLAGS)
    train_utils.serialize_config(params, model_dir)
    with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=model_dir)

    _, logs = train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode=flag_mode,
        params=params,
        model_dir=model_dir,
        run_post_eval=run_post_eval)

  if run_post_eval:
    self.assertNotEmpty(logs)
  else:
    self.assertEmpty(logs)
  self.assertNotEmpty(
      tf.io.gfile.glob(os.path.join(model_dir, 'params.yaml')))
  # Pure eval runs do not write checkpoints.
  if flag_mode != 'eval':
    self.assertNotEmpty(
        tf.io.gfile.glob(os.path.join(model_dir, 'checkpoint')))
def main(_):
  """Trains/evaluates the AP parsing task and exports SavedModels.

  After a training run, exports the latest model to
  `saved_models/latest`, and — when a best-checkpoint exporter is
  configured — restores the best checkpoint and exports it to
  `saved_models/best`.
  """
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if "train" in FLAGS.mode:
    train_utils.serialize_config(params, model_dir)
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu,
      **params.runtime.model_parallelism())
  with distribution_strategy.scope():
    # CRF vs. base task is selected by config.
    if params.task.use_crf:
      task = ap_parsing_task.APParsingTaskCRF(params.task)
    else:
      task = ap_parsing_task.APParsingTaskBase(params.task)

    ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter(
        params, model_dir)
    trainer = train_utils.create_trainer(
        params,
        task,
        train="train" in FLAGS.mode,
        evaluate=("eval" in FLAGS.mode),
        checkpoint_exporter=ckpt_exporter)

  model, _ = train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      trainer=trainer,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)

  # Export saved model.
  if "train" in FLAGS.mode:
    saved_model_path = os.path.join(model_dir, "saved_models/latest")
    logging.info("Exporting SavedModel to %s", saved_model_path)
    tf.saved_model.save(model, saved_model_path)

    if ckpt_exporter:
      logging.info("Loading best checkpoint for export")
      trainer.checkpoint.restore(ckpt_exporter.best_ckpt_path)
      saved_model_path = os.path.join(model_dir, "saved_models/best")

      # Make sure restored and not re-initialized.
      if trainer.global_step > 0:
        logging.info(
            "Exporting best saved model by %s (from global step: %d) to %s",
            params.trainer.best_checkpoint_eval_metric,
            trainer.global_step.numpy(), saved_model_path)
        tf.saved_model.save(trainer.model, saved_model_path)
def main(_):
  """Entry point: configures and runs a continuous fine-tuning experiment."""
  # TODO(b/177863554): consolidate to nlp/train.py
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  experiment_params = train_utils.parse_configuration(FLAGS)
  output_dir = FLAGS.model_dir
  # Write the resolved config next to the model artifacts.
  train_utils.serialize_config(experiment_params, output_dir)
  continuous_finetune_lib.run_continuous_finetune(
      FLAGS.mode,
      experiment_params,
      output_dir,
      pretrain_steps=FLAGS.pretrain_steps)
  train_utils.save_gin_config(FLAGS.mode, output_dir)
def main(_):
  """Runs the video-classification experiment (AssembleNet-aware driver).

  Besides the standard setup, validates that train/validation feature
  shapes agree for train_and_eval, and for AssembleNet experiments
  propagates the per-mode feature shape into the backbone config.
  """
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  if 'train_and_eval' in FLAGS.mode:
    assert (
        params.task.train_data.feature_shape ==
        params.task.validation_data.feature_shape), (
            f'train {params.task.train_data.feature_shape} != validate '
            f'{params.task.validation_data.feature_shape}')

  if 'assemblenet' in FLAGS.experiment:
    if 'eval' in FLAGS.mode:
      # Use the feature shape in validation_data for all jobs. The number of
      # frames in train_data will be used to construct the Assemblenet model.
      params.task.model.backbone.assemblenet.num_frames = params.task.validation_data.feature_shape[
          0]
      shape = params.task.validation_data.feature_shape
    else:
      params.task.model.backbone.assemblenet.num_frames = params.task.train_data.feature_shape[
          0]
      shape = params.task.train_data.feature_shape
    logging.info('mode %r num_frames %r feature shape %r', FLAGS.mode,
                 params.task.model.backbone.assemblenet.num_frames, shape)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16.
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
def main(_):
  """Runs the experiment with optional TPU model parallelism.

  Derives the input partition dims from the per-mode task config and
  builds a distribution strategy that honors them, then runs the
  experiment.

  Raises:
    ValueError: If train and eval partition dims imply different numbers
      of partitions in train_and_eval mode.
  """
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16.
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype,
        params.runtime.loss_scale,
        use_experimental_api=True)

  input_partition_dims = None
  if FLAGS.mode == 'train_and_eval':
    if np.prod(params.task.train_input_partition_dims) != np.prod(
        params.task.eval_input_partition_dims):
      # BUG FIX: the two implicitly-concatenated string literals lacked a
      # separating space, yielding "...can not bepartitioned...".
      raise ValueError('Train and eval input partition dims can not be '
                       'partitioned on the same node')
    else:
      input_partition_dims = get_computation_shape_for_model_parallelism(
          params.task.train_input_partition_dims)
  elif FLAGS.mode == 'train':
    if params.task.train_input_partition_dims:
      input_partition_dims = get_computation_shape_for_model_parallelism(
          params.task.train_input_partition_dims)
  elif FLAGS.mode == 'eval' or FLAGS.mode == 'continuous_eval':
    if params.task.eval_input_partition_dims:
      input_partition_dims = get_computation_shape_for_model_parallelism(
          params.task.eval_input_partition_dims)

  distribution_strategy = create_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      num_gpus=params.runtime.num_gpus,
      input_partition_dims=input_partition_dims,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)
def main(_):
  """Runs the experiment defined by FLAGS and gin bindings.

  NOTE(review): the original text contained an unresolved git merge-conflict
  marker (`<<<<<<< HEAD`) and appears truncated after the mixed-precision
  setup relative to the sibling `main` drivers in this file; the HEAD side
  (passing `loss_scale`) is kept here. Confirm the remainder of the function
  against upstream.
  """
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16.
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale)
def main(argv):
  """Runs SuperGLUE train/eval and/or prediction according to FLAGS.mode.

  In 'train_eval' the experiment config is built from `--config_file`; in
  'predict' (when no training ran in this process) it is reloaded from the
  params.yaml previously written to the model directory.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  superglue_flags.validate_flags(FLAGS, file_exists_fn=tf.io.gfile.exists)
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)

  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)

  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))

  with distribution_strategy.scope():
    task = None
    if 'train_eval' in FLAGS.mode:
      logging.info('Starting training and eval...')
      logging.info('Model dir: %s', FLAGS.model_dir)

      exp_config = _get_exp_config(
          input_meta_data=input_meta_data, exp_config_files=FLAGS.config_file)
      train_utils.serialize_config(exp_config, FLAGS.model_dir)
      task = task_factory.get_task(exp_config.task,
                                   logging_dir=FLAGS.model_dir)
      train_lib.run_experiment(
          distribution_strategy=distribution_strategy,
          task=task,
          mode='train_and_eval',
          params=exp_config,
          model_dir=FLAGS.model_dir)

    if 'predict' in FLAGS.mode:
      logging.info('Starting predict...')
      # When mode is `predict`, `task` will be None.
      if task is None:
        # Rebuild the config from the params.yaml written by a prior
        # training run in the same model_dir.
        exp_config = _get_exp_config(
            input_meta_data=input_meta_data,
            exp_config_files=[os.path.join(FLAGS.model_dir, 'params.yaml')])
        task = task_factory.get_task(exp_config.task,
                                     logging_dir=FLAGS.model_dir)
      _write_submission_file(task, input_meta_data['max_seq_length'])
def test_recovery(self, distribution_strategy, flag_mode):
  """Checks that the trainer restores weights when the loss bound trips.

  Forces every step's loss to equal the configured upper bound so the
  recovery path triggers, then asserts the model weights equal the
  reference checkpoint saved before training.
  """
  loss_threshold = 1.0
  model_dir = self.get_temp_dir()
  flags_dict = dict(
      experiment='mock',
      mode=flag_mode,
      model_dir=model_dir,
      params_override=json.dumps(self._test_config))
  with flagsaver.flagsaver(**flags_dict):
    params = train_utils.parse_configuration(flags.FLAGS)
    params.trainer.loss_upper_bound = loss_threshold
    params.trainer.recovery_max_trials = 1
    train_utils.serialize_config(params, model_dir)
    with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=model_dir)

    # Saves a checkpoint for reference.
    model = task.build_model()
    checkpoint = tf.train.Checkpoint(model=model)
    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint, self.get_temp_dir(), max_to_keep=2)
    checkpoint_manager.save()
    before_weights = model.get_weights()

    def build_losses(labels, model_outputs, aux_losses=None):
      del labels, model_outputs
      return tf.constant([loss_threshold], tf.float32) + aux_losses

    task.build_losses = build_losses

    model, _ = train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode=flag_mode,
        params=params,
        model_dir=model_dir)
    after_weights = model.get_weights()
    for left, right in zip(before_weights, after_weights):
      self.assertAllEqual(left, right)
def config_override(params, flags_obj):
  """Overrides an ExperimentConfig from command-line flags.

  Overrides are layered: first `runtime.tpu`, then each `--config_file`
  (a common per-experiment template), then `--params_override` (a final,
  run-specific override). The result is validated, locked, logged, and —
  for training modes — serialized into the model directory.
  """
  # Point runtime.tpu at the actual TPU given on the command line.
  params.override({'runtime': {
      'tpu': flags_obj.tpu,
  }})

  # First override layer: `--config_file` template(s).
  for cfg_path in flags_obj.config_file or []:
    params = hyperparams.override_params_dict(params, cfg_path,
                                              is_strict=True)

  # Second override layer: `--params_override` on top of the template,
  # e.g. different learning rates over a shared ResNet50/ImageNet config.
  if flags_obj.params_override:
    params = hyperparams.override_params_dict(params,
                                              flags_obj.params_override,
                                              is_strict=True)

  params.validate()
  params.lock()

  printer = pprint.PrettyPrinter()
  logging.info('Final experiment parameters: %s',
               printer.pformat(params.as_dict()))

  output_dir = flags_obj.model_dir
  if 'train' in flags_obj.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, output_dir)

  return params
def main(_) -> None:
  """Train and evaluate the Ranking model.

  Two execution paths exist: the orbit-based custom training loop
  (`params.trainer.use_orbit`) delegating to `train_lib.run_experiment`,
  and a Keras compile/fit path with checkpoint, timing, and optional
  TensorBoard callbacks.
  """
  params = train_utils.parse_configuration(FLAGS)
  mode = FLAGS.mode
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  if FLAGS.seed is not None:
    logging.info('Setting tf seed.')
    tf.random.set_seed(FLAGS.seed)

  task = RankingTask(
      params=params.task,
      optimizer_config=params.trainer.optimizer_config,
      logging_dir=model_dir,
      steps_per_execution=params.trainer.steps_per_loop,
      name='RankingTask')

  enable_tensorboard = params.trainer.callbacks.enable_tensorboard

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  with strategy.scope():
    model = task.build_model()

  def get_dataset_fn(params):
    return lambda input_context: task.build_inputs(params, input_context)

  train_dataset = None
  if 'train' in mode:
    train_dataset = strategy.distribute_datasets_from_function(
        get_dataset_fn(params.task.train_data),
        options=tf.distribute.InputOptions(
            experimental_fetch_to_device=False))

  validation_dataset = None
  if 'eval' in mode:
    validation_dataset = strategy.distribute_datasets_from_function(
        get_dataset_fn(params.task.validation_data),
        options=tf.distribute.InputOptions(
            experimental_fetch_to_device=False))

  if params.trainer.use_orbit:
    # Orbit path: custom trainer driven by train_lib.
    with strategy.scope():
      checkpoint_exporter = train_utils.maybe_create_best_ckpt_exporter(
          params, model_dir)
      trainer = RankingTrainer(
          config=params,
          task=task,
          model=model,
          optimizer=model.optimizer,
          train='train' in mode,
          evaluate='eval' in mode,
          train_dataset=train_dataset,
          validation_dataset=validation_dataset,
          checkpoint_exporter=checkpoint_exporter)

    train_lib.run_experiment(
        distribution_strategy=strategy,
        task=task,
        mode=mode,
        params=params,
        model_dir=model_dir,
        trainer=trainer)

  else:  # Compile/fit
    checkpoint = tf.train.Checkpoint(model=model, optimizer=model.optimizer)

    latest_checkpoint = tf.train.latest_checkpoint(model_dir)
    if latest_checkpoint:
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)

    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint,
        directory=model_dir,
        max_to_keep=params.trainer.max_to_keep,
        step_counter=model.optimizer.iterations,
        checkpoint_interval=params.trainer.checkpoint_interval)
    checkpoint_callback = keras_utils.SimpleCheckpoint(checkpoint_manager)

    time_callback = keras_utils.TimeHistory(
        params.task.train_data.global_batch_size,
        params.trainer.time_history.log_steps,
        logdir=model_dir if enable_tensorboard else None)
    callbacks = [checkpoint_callback, time_callback]

    if enable_tensorboard:
      tensorboard_callback = tf.keras.callbacks.TensorBoard(
          log_dir=model_dir,
          update_freq=min(1000, params.trainer.validation_interval),
          profile_batch=FLAGS.profile_steps)
      callbacks.append(tensorboard_callback)

    # One "epoch" here is one validation interval worth of steps.
    num_epochs = (params.trainer.train_steps //
                  params.trainer.validation_interval)
    current_step = model.optimizer.iterations.numpy()
    initial_epoch = current_step // params.trainer.validation_interval

    eval_steps = params.trainer.validation_steps if 'eval' in mode else None

    if mode in ['train', 'train_and_eval']:
      logging.info('Training started')
      history = model.fit(
          train_dataset,
          initial_epoch=initial_epoch,
          epochs=num_epochs,
          steps_per_epoch=params.trainer.validation_interval,
          validation_data=validation_dataset,
          validation_steps=eval_steps,
          callbacks=callbacks,
      )
      model.summary()
      logging.info('Train history: %s', history.history)
    elif mode == 'eval':
      logging.info('Evaluation started')
      validation_output = model.evaluate(validation_dataset,
                                         steps=eval_steps)
      logging.info('Evaluation output: %s', validation_output)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)
def export(input_type: str,
           batch_size: Optional[int],
           input_image_size: List[int],
           params: cfg.ExperimentConfig,
           checkpoint_path: str,
           export_dir: str,
           num_channels: Optional[int] = 3,
           export_module: Optional[export_base.ExportModule] = None,
           export_checkpoint_subdir: Optional[str] = None,
           export_saved_model_subdir: Optional[str] = None,
           save_options: Optional[tf.saved_model.SaveOptions] = None):
  """Exports the model specified in the exp config.

  Saved model is stored at export_dir/saved_model, checkpoint is saved
  at export_dir/checkpoint, and params is saved at export_dir/params.yaml.

  Args:
    input_type: One of `image_tensor`, `image_bytes`, `tf_example`.
    batch_size: 'int', or None.
    input_image_size: List or Tuple of height and width.
    params: Experiment params.
    checkpoint_path: Trained checkpoint path or directory.
    export_dir: Export directory path.
    num_channels: The number of input image channels.
    export_module: Optional export module to be used instead of using params
      to create one. If None, the params will be used to create an export
      module.
    export_checkpoint_subdir: Optional subdirectory under export_dir to store
      checkpoint.
    export_saved_model_subdir: Optional subdirectory under export_dir to store
      saved model.
    save_options: `SaveOptions` for `tf.saved_model.save`.
  """
  if export_checkpoint_subdir:
    output_checkpoint_directory = os.path.join(export_dir,
                                               export_checkpoint_subdir)
  else:
    output_checkpoint_directory = None

  if export_saved_model_subdir:
    output_saved_model_directory = os.path.join(export_dir,
                                                export_saved_model_subdir)
  else:
    output_saved_model_directory = export_dir

  # BUG FIX: previously the caller-provided `export_module` was
  # unconditionally overwritten here, contradicting the documented contract.
  # Build one from params only when none was supplied.
  if export_module is None:
    export_module = export_module_factory.get_export_module(
        params,
        input_type=input_type,
        batch_size=batch_size,
        input_image_size=input_image_size,
        num_channels=num_channels)

  export_base.export(
      export_module,
      function_keys=[input_type],
      export_savedmodel_dir=output_saved_model_directory,
      checkpoint_path=checkpoint_path,
      timestamped=False,
      save_options=save_options)

  if output_checkpoint_directory:
    ckpt = tf.train.Checkpoint(model=export_module.model)
    ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt'))
  train_utils.serialize_config(params, export_dir)
def export_inference_graph(
    input_type: str,
    batch_size: Optional[int],
    input_image_size: List[int],
    params: cfg.ExperimentConfig,
    checkpoint_path: str,
    export_dir: str,
    num_channels: Optional[int] = 3,
    export_module: Optional[export_base.ExportModule] = None,
    export_checkpoint_subdir: Optional[str] = None,
    export_saved_model_subdir: Optional[str] = None,
    save_options: Optional[tf.saved_model.SaveOptions] = None,
    log_model_flops_and_params: bool = False,
    checkpoint: Optional[tf.train.Checkpoint] = None,
    input_name: Optional[str] = None):
  """Exports inference graph for the model specified in the exp config.

  Saved model is stored at export_dir/saved_model, checkpoint is saved
  at export_dir/checkpoint, and params is saved at export_dir/params.yaml.

  Args:
    input_type: One of `image_tensor`, `image_bytes`, `tf_example` or
      `tflite`.
    batch_size: 'int', or None.
    input_image_size: List or Tuple of height and width.
    params: Experiment params.
    checkpoint_path: Trained checkpoint path or directory.
    export_dir: Export directory path.
    num_channels: The number of input image channels.
    export_module: Optional export module to be used instead of using params
      to create one. If None, the params will be used to create an export
      module.
    export_checkpoint_subdir: Optional subdirectory under export_dir to store
      checkpoint.
    export_saved_model_subdir: Optional subdirectory under export_dir to store
      saved model.
    save_options: `SaveOptions` for `tf.saved_model.save`.
    log_model_flops_and_params: If True, writes model FLOPs to model_flops.txt
      and model parameters to model_params.txt.
    checkpoint: An optional tf.train.Checkpoint. If provided, the export
      module will use it to read the weights.
    input_name: The input tensor name, default at `None` which produces input
      tensor name `inputs`.
  """
  if export_checkpoint_subdir:
    output_checkpoint_directory = os.path.join(export_dir,
                                               export_checkpoint_subdir)
  else:
    output_checkpoint_directory = None

  if export_saved_model_subdir:
    output_saved_model_directory = os.path.join(export_dir,
                                                export_saved_model_subdir)
  else:
    output_saved_model_directory = export_dir

  # TODO(arashwan): Offers a direct path to use ExportModule with Task objects.
  if not export_module:
    # Dispatch on the task type declared in the experiment config.
    if isinstance(params.task,
                  configs.image_classification.ImageClassificationTask):
      export_module = image_classification.ClassificationModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          input_type=input_type,
          num_channels=num_channels,
          input_name=input_name)
    elif isinstance(params.task, configs.retinanet.RetinaNetTask) or isinstance(
        params.task, configs.maskrcnn.MaskRCNNTask):
      export_module = detection.DetectionModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          input_type=input_type,
          num_channels=num_channels,
          input_name=input_name)
    elif isinstance(params.task,
                    configs.semantic_segmentation.SemanticSegmentationTask):
      export_module = semantic_segmentation.SegmentationModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          input_type=input_type,
          num_channels=num_channels,
          input_name=input_name)
    elif isinstance(params.task,
                    configs.video_classification.VideoClassificationTask):
      export_module = video_classification.VideoClassificationModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          input_type=input_type,
          num_channels=num_channels,
          input_name=input_name)
    else:
      raise ValueError('Export module not implemented for {} task.'.format(
          type(params.task)))

  export_base.export(
      export_module,
      function_keys=[input_type],
      export_savedmodel_dir=output_saved_model_directory,
      checkpoint=checkpoint,
      checkpoint_path=checkpoint_path,
      timestamped=False,
      save_options=save_options)

  if output_checkpoint_directory:
    ckpt = tf.train.Checkpoint(model=export_module.model)
    ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt'))
  train_utils.serialize_config(params, export_dir)

  if log_model_flops_and_params:
    inputs_kwargs = None
    if isinstance(
        params.task,
        (configs.retinanet.RetinaNetTask, configs.maskrcnn.MaskRCNNTask)):
      # We need to create inputs_kwargs argument to specify the input shapes
      # for subclass model that overrides model.call to take multiple inputs,
      # e.g., RetinaNet model.
      inputs_kwargs = {
          'images':
              tf.TensorSpec([1] + input_image_size + [num_channels],
                            tf.float32),
          'image_shape':
              tf.TensorSpec([1, 2], tf.float32)
      }
      dummy_inputs = {
          k: tf.ones(v.shape.as_list(), tf.float32)
          for k, v in inputs_kwargs.items()
      }
      # Must do forward pass to build the model.
      export_module.model(**dummy_inputs)
    else:
      logging.info(
          'Logging model flops and params not implemented for %s task.',
          type(params.task))
      return
    train_utils.try_count_flops(export_module.model, inputs_kwargs,
                                os.path.join(export_dir, 'model_flops.txt'))
    train_utils.write_model_params(export_module.model,
                                   os.path.join(export_dir,
                                                'model_params.txt'))
def export_inference_graph(
    input_type: str,
    batch_size: Optional[int],
    input_image_size: List[int],
    params: cfg.ExperimentConfig,
    checkpoint_path: str,
    export_dir: str,
    num_channels: Optional[int] = 3,
    export_module: Optional[export_base.ExportModule] = None,
    export_checkpoint_subdir: Optional[str] = None,
    export_saved_model_subdir: Optional[str] = None,
    save_options: Optional[tf.saved_model.SaveOptions] = None,
    argmax_outputs: bool = False,
    visualise_outputs: bool = False,
    class_present_outputs: bool = False):
  """Exports inference graph for the model specified in the exp config.

  Saved model is stored at export_dir/saved_model, checkpoint is saved
  at export_dir/checkpoint, and params is saved at export_dir/params.yaml.

  Args:
    input_type: One of `image_tensor`, `image_bytes`, `tf_example`.
    batch_size: 'int', or None.
    input_image_size: List or Tuple of height and width.
    params: Experiment params.
    checkpoint_path: Trained checkpoint path or directory.
    export_dir: Export directory path.
    num_channels: The number of input image channels.
    export_module: Optional export module to be used instead of using params
      to create one. If None, the params will be used to create an export
      module.
    export_checkpoint_subdir: Optional subdirectory under export_dir to store
      checkpoint.
    export_saved_model_subdir: Optional subdirectory under export_dir to store
      saved model.
    save_options: `SaveOptions` for `tf.saved_model.save`. (applicable for
      Segmentation and MultiTask export definitions only)
    argmax_outputs: Set true to argmax the last channel of all outputs.
    visualise_outputs: Set true to apply colormap to all single channel
      outputs.
    class_present_outputs: Set true to gather unique values of outputs.
  """
  if export_checkpoint_subdir:
    output_checkpoint_directory = os.path.join(
        export_dir, export_checkpoint_subdir)
  else:
    # NOTE: unlike some sibling export helpers, this variant defaults the
    # checkpoint output to export_dir (and always saves a checkpoint below).
    output_checkpoint_directory = export_dir

  if export_saved_model_subdir:
    output_saved_model_directory = os.path.join(
        export_dir, export_saved_model_subdir)
  else:
    output_saved_model_directory = export_dir

  # TODO(arashwan): Offers a direct path to use ExportModule with Task objects.
  if not export_module:
    # Dispatch on the task type declared in the experiment config.
    if isinstance(params.task,
                  configs.image_classification.ImageClassificationTask):
      export_module = image_classification.ClassificationModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels)
    elif isinstance(params.task, configs.retinanet.RetinaNetTask) or isinstance(
        params.task, configs.maskrcnn.MaskRCNNTask):
      export_module = detection.DetectionModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels)
    elif isinstance(params.task,
                    configs.semantic_segmentation.SemanticSegmentationTask):
      export_module = semantic_segmentation.SegmentationModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels,
          argmax_outputs=argmax_outputs,
          visualise_outputs=visualise_outputs)
    elif isinstance(params.task, configs.yolo.YoloTask):
      export_module = yolo.YoloModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels)
    elif isinstance(params.task, multi_cfg.MultiTaskConfig):
      export_module = multitask.MultitaskModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels,
          argmax_outputs=argmax_outputs,
          visualise_outputs=visualise_outputs,
          class_present_outputs=class_present_outputs)
    else:
      raise ValueError('Export module not implemented for {} task.'.format(
          type(params.task)))

  export_base.export(
      export_module,
      function_keys=[input_type],
      export_savedmodel_dir=output_saved_model_directory,
      checkpoint_path=checkpoint_path,
      timestamped=False,
      save_options=save_options)

  ckpt = tf.train.Checkpoint(model=export_module.model)
  ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt'))
  train_utils.serialize_config(params, export_dir)
def export_inference_graph(input_type,
                           batch_size,
                           input_image_size,
                           params,
                           checkpoint_path,
                           export_dir,
                           export_checkpoint_subdir=None,
                           export_saved_model_subdir=None):
  """Exports inference graph for the model specified in the exp config.

  Saved model is stored at export_dir/saved_model, checkpoint is saved
  at export_dir/checkpoint, and params is saved at export_dir/params.yaml.

  Args:
    input_type: One of `image_tensor`, `image_bytes`, `tf_example`.
    batch_size: 'int', or None.
    input_image_size: List or Tuple of height and width.
    params: Experiment params.
    checkpoint_path: Trained checkpoint path or directory.
    export_dir: Export directory path.
    export_checkpoint_subdir: Optional subdirectory under export_dir to store
      checkpoint.
    export_saved_model_subdir: Optional subdirectory under export_dir to store
      saved model.
  """
  if export_checkpoint_subdir:
    output_checkpoint_directory = os.path.join(export_dir,
                                               export_checkpoint_subdir)
  else:
    output_checkpoint_directory = export_dir

  if export_saved_model_subdir:
    output_saved_model_directory = os.path.join(export_dir,
                                                export_saved_model_subdir)
  else:
    output_saved_model_directory = export_dir

  # Build the export module for the configured task type.
  if isinstance(params.task,
                configs.image_classification.ImageClassificationTask):
    export_module = image_classification.ClassificationModule(
        params=params,
        batch_size=batch_size,
        input_image_size=input_image_size)
  elif isinstance(params.task, configs.retinanet.RetinaNetTask) or isinstance(
      params.task, configs.maskrcnn.MaskRCNNTask):
    export_module = detection.DetectionModule(
        params=params,
        batch_size=batch_size,
        input_image_size=input_image_size)
  else:
    raise ValueError('Export module not implemented for {} task.'.format(
        type(params.task)))

  model = export_module.build_model()
  ckpt = tf.train.Checkpoint(model=model)

  ckpt_dir_or_file = checkpoint_path
  if tf.io.gfile.isdir(ckpt_dir_or_file):
    ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
  status = ckpt.restore(ckpt_dir_or_file).expect_partial()

  # Pick the serving signature matching the requested input type.
  if input_type == 'image_tensor':
    input_signature = tf.TensorSpec(
        shape=[batch_size, input_image_size[0], input_image_size[1], 3],
        dtype=tf.uint8)
    signatures = {
        'serving_default':
            export_module.inference_from_image_tensors.get_concrete_function(
                input_signature)
    }
  elif input_type == 'image_bytes':
    input_signature = tf.TensorSpec(shape=[batch_size], dtype=tf.string)
    signatures = {
        'serving_default':
            export_module.inference_from_image_bytes.get_concrete_function(
                input_signature)
    }
  elif input_type == 'tf_example':
    input_signature = tf.TensorSpec(shape=[batch_size], dtype=tf.string)
    signatures = {
        'serving_default':
            export_module.inference_from_tf_example.get_concrete_function(
                input_signature)
    }
  else:
    raise ValueError('Unrecognized `input_type`')

  # Fail fast if the checkpoint did not actually match the built model.
  status.assert_existing_objects_matched()

  ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt'))
  tf.saved_model.save(export_module,
                      output_saved_model_directory,
                      signatures=signatures)
  train_utils.serialize_config(params, export_dir)
def main(_):
  """Runs the experiment on Habana HPUs with optional multi-worker setup.

  Loads the Habana TensorFlow module when HPUs are requested, optionally
  forces deterministic execution via environment seeds, configures the
  multi-worker cluster from MULTI_HLS_IPS/MPI rank, then runs the standard
  train/eval flow.
  """
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)

  if params.runtime.num_hpus > 0:
    import os  # TODO: remove when SW-49334 is fixed [SW-49404]
    os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
    from habana_frameworks.tensorflow import load_habana_module
    load_habana_module()

  if params.task.train_data.deterministic or params.task.validation_data.deterministic:
    # Seed every source of randomness for a deterministic run.
    import os
    os.environ['PYTHONHASHSEED'] = '0'
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    import numpy
    numpy.random.seed(0)
    import tensorflow as tf
    tf.random.set_seed(0)
    tf.compat.v1.set_random_seed(0)
    import random
    random.seed(0)

  if FLAGS.dtype == "bf16":
    print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
    os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

  hls_addresses = str(os.environ.get("MULTI_HLS_IPS",
                                     "127.0.0.1")).split(",")
  TF_BASE_PORT = 2410
  mpi_rank = comm_rank()
  mpi_size = comm_size()

  # Each worker gets its own model sub-directory in multi-HPU runs.
  if params.runtime.num_hpus > 1:
    model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
  else:
    model_dir = FLAGS.model_dir

  # Prepare a comma-separated list of device addresses.
  worker_list = []
  for address in hls_addresses:
    for rank in range(mpi_size // len(hls_addresses)):
      worker_list.append(address + ':' + str(TF_BASE_PORT + rank))
  worker_hosts = ",".join(worker_list)
  task_index = mpi_rank

  # Configures cluster spec for distribution strategy.
  distribution_utils.configure_cluster(worker_hosts, task_index)
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16.
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype)
  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      num_hpus=params.runtime.num_hpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)