def run_continuous_finetune( mode: str, params: config_definitions.ExperimentConfig, model_dir: str, run_post_eval: bool = False, pretrain_steps: Optional[int] = None, ) -> Mapping[str, Any]: """Run modes with continuous training. Currently only supports continuous_train_and_eval. Args: mode: A 'str', specifying the mode. continuous_train_and_eval - monitors a checkpoint directory. Once a new checkpoint is discovered, loads the checkpoint, finetune the model by training it (probably on another dataset or with another task), then evaluate the finetuned model. params: ExperimentConfig instance. model_dir: A 'str', a path to store model checkpoints and summaries. run_post_eval: Whether to run post eval once after training, metrics logs are returned. pretrain_steps: Optional, the number of total training steps for the pretraining job. Returns: eval logs: returns eval metrics logs when run_post_eval is set to True, othewise, returns {}. """ assert mode == 'continuous_train_and_eval', ( 'Only continuous_train_and_eval is supported by continuous_finetune. ' 'Got mode: {}'.format(mode)) # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16' # can have significant impact on model speeds by utilizing float16 in case of # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when # dtype is float16 if params.runtime.mixed_precision_dtype: performance.set_mixed_precision_policy( params.runtime.mixed_precision_dtype, params.runtime.loss_scale) distribution_strategy = distribute_utils.get_distribution_strategy( distribution_strategy=params.runtime.distribution_strategy, all_reduce_alg=params.runtime.all_reduce_alg, num_gpus=params.runtime.num_gpus, tpu_address=params.runtime.tpu) retry_times = 0 while not tf.io.gfile.isdir(params.task.init_checkpoint): # Wait for the init_checkpoint directory to be created. if retry_times >= 60: raise ValueError( 'ExperimentConfig.task.init_checkpoint must be a directory for ' 'continuous_train_and_eval mode.') retry_times += 1 time.sleep(60) summary_writer = tf.summary.create_file_writer( os.path.join(model_dir, 'eval')) global_step = 0 def timeout_fn(): if pretrain_steps and global_step < pretrain_steps: # Keeps waiting for another timeout period. logging.info( 'Continue waiting for new checkpoint as current pretrain ' 'global_step=%d and target is %d.', global_step, pretrain_steps) return False # Quits the loop. return True for pretrain_ckpt in tf.train.checkpoints_iterator( checkpoint_dir=params.task.init_checkpoint, min_interval_secs=10, timeout=params.trainer.continuous_eval_timeout, timeout_fn=timeout_fn): with distribution_strategy.scope(): global_step = train_utils.read_global_step_from_checkpoint( pretrain_ckpt) if params.trainer.best_checkpoint_export_subdir: best_ckpt_subdir = '{}_{}'.format( params.trainer.best_checkpoint_export_subdir, global_step) params_replaced = params.replace( task={'init_checkpoint': pretrain_ckpt}, trainer={'best_checkpoint_export_subdir': best_ckpt_subdir}) else: params_replaced = params.replace( task={'init_checkpoint': pretrain_ckpt}) params_replaced.lock() logging.info('Running finetuning with params: %s', params_replaced) with distribution_strategy.scope(): task = task_factory.get_task(params_replaced.task, logging_dir=model_dir) _, eval_metrics = train_lib.run_experiment( distribution_strategy=distribution_strategy, task=task, mode='train_and_eval', # replace params.task.init_checkpoint to make sure that we load # exactly this pretrain checkpoint. params=params_replaced, model_dir=model_dir, run_post_eval=True, save_summary=False) logging.info('Evaluation finished. Pretrain global_step: %d', global_step) train_utils.write_json_summary(model_dir, global_step, eval_metrics) if not os.path.basename(model_dir): # if model_dir.endswith('/') summary_grp = os.path.dirname( model_dir) + '_' + task.__class__.__name__ else: summary_grp = os.path.basename( model_dir) + '_' + task.__class__.__name__ summaries = {} for name, value in eval_metrics.items(): summaries[summary_grp + '/' + name] = value train_utils.write_summary(summary_writer, global_step, summaries) train_utils.remove_ckpts(model_dir) # In TF2, the resource life cycle is bound with the python object life # cycle. Force trigger python garbage collection here so those resources # can be deallocated in time, so it doesn't cause OOM when allocating new # objects. # TODO(b/169178664): Fix cycle reference in Keras model and revisit to see # if we need gc here. gc.collect() if run_post_eval: return eval_metrics return {}
def run(flags_obj): """Run ResNet ImageNet training and eval loop using custom training loops. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj)) # This only affects GPU. common.set_cudnn_batchnorm_mode() # TODO(anj-s): Set data_format without using Keras. data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu) per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations( flags_obj) steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps) logging.info( 'Training %d epochs, each epoch has %d steps, ' 'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps, train_epochs * per_epoch_steps, eval_steps) time_callback = keras_utils.TimeHistory( flags_obj.batch_size, flags_obj.log_steps, logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None) with distribution_utils.get_strategy_scope(strategy): runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback, per_epoch_steps) eval_interval = (flags_obj.epochs_between_evals * per_epoch_steps if not flags_obj.skip_eval else None) checkpoint_interval = (per_epoch_steps if flags_obj.enable_checkpoint_and_export else None) summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None checkpoint_manager = tf.train.CheckpointManager( runnable.checkpoint, directory=flags_obj.model_dir, max_to_keep=10, step_counter=runnable.global_step, checkpoint_interval=checkpoint_interval) resnet_controller = controller.Controller( strategy, runnable.train, runnable.evaluate, global_step=runnable.global_step, steps_per_loop=steps_per_loop, train_steps=per_epoch_steps * train_epochs, checkpoint_manager=checkpoint_manager, summary_interval=summary_interval, eval_steps=eval_steps, eval_interval=eval_interval) time_callback.on_train_begin() resnet_controller.train(evaluate=True) time_callback.on_train_end() stats = build_stats(runnable, time_callback) return stats
def run_train(flags_obj): keras_utils.set_session_config(enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj.datasets_num_private_threads ) common.set_cudnn_batchnorm_mode() performance.set_mixed_precision_policy( flags_core.get_tf_dtype(flags_obj), flags_core.get_loss_scale(flags_obj, default_for_fp16=128)) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) # Configures cluster spec for distribution strategy. _ = distribution_utils.configure_cluster(flags_obj.worker_hosts, flags_obj.task_index) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu) if strategy: # flags_obj.enable_get_next_as_optional controls whether enabling # get_next_as_optional behavior in DistributedIterator. If true, last # partial batch can be supported. strategy.extended.experimental_enable_get_next_as_optional = ( flags_obj.enable_get_next_as_optional) strategy_scope = distribution_utils.get_strategy_scope(strategy) distribution_utils.undo_set_up_synthetic_data() train_input_dataset, eval_input_dataset, tr_dataset, te_dataset = setup_datasets( flags_obj) lr_schedule = common.PiecewiseConstantDecayWithWarmup( batch_size=GB_OPTIONS.batch_size, epoch_size=tr_dataset.num_examples_per_epoch(), warmup_epochs=common.LR_SCHEDULE[0][1], boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]), multipliers=list(p[0] for p in common.LR_SCHEDULE), compute_lr_on_cpu=True) with strategy_scope: optimizer = common.get_optimizer(lr_schedule) model = build_model(tr_dataset.num_classes, mode='resnet50') if GB_OPTIONS.pretrained_filepath is not None: latest = tf.train.latest_checkpoint(GB_OPTIONS.pretrained_filepath) print(latest) model.load_weights(latest) #losses = ["sparse_categorical_crossentropy"] #lossWeights = [1.0] model.compile( optimizer=optimizer, loss="sparse_categorical_crossentropy", #loss_weights=lossWeights, metrics=['sparse_categorical_accuracy']) num_train_examples = tr_dataset.num_examples_per_epoch() steps_per_epoch = num_train_examples // GB_OPTIONS.batch_size train_epochs = GB_OPTIONS.num_epochs if not hasattr(tr_dataset, "n_poison"): n_poison = 0 n_cover = 0 else: n_poison = tr_dataset.n_poison n_cover = tr_dataset.n_cover callbacks = common.get_callbacks( steps_per_epoch=steps_per_epoch, pruning_method=flags_obj.pruning_method, enable_checkpoint_and_export=False, model_dir=GB_OPTIONS.checkpoint_folder) ckpt_full_path = os.path.join( GB_OPTIONS.checkpoint_folder, 'model.ckpt-{epoch:04d}-p%d-c%d' % (n_poison, n_cover)) callbacks.append( tf.keras.callbacks.ModelCheckpoint(ckpt_full_path, save_weights_only=True, save_best_only=True)) num_eval_examples = te_dataset.num_examples_per_epoch() num_eval_steps = num_eval_examples // GB_OPTIONS.batch_size if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None eval_input_dataset = None history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=eval_input_dataset, validation_freq=flags_obj.epochs_between_evals) export_path = os.path.join(GB_OPTIONS.checkpoint_folder, 'saved_model') model.save(export_path, include_optimizer=False) eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) stats = common.build_stats(history, eval_output, callbacks) return stats
def run_predict(flags_obj, datasets_override=None, strategy_override=None): keras_utils.set_session_config(enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj.datasets_num_private_threads ) common.set_cudnn_batchnorm_mode() performance.set_mixed_precision_policy( flags_core.get_tf_dtype(flags_obj), flags_core.get_loss_scale(flags_obj, default_for_fp16=128)) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.test.is_built_with_cuda() else 'channels_last') tf.keras.backend.set_image_data_format(data_format) # Configures cluster spec for distribution strategy. _ = distribution_utils.configure_cluster(flags_obj.worker_hosts, flags_obj.task_index) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu) if strategy: # flags_obj.enable_get_next_as_optional controls whether enabling # get_next_as_optional behavior in DistributedIterator. If true, last # partial batch can be supported. strategy.extended.experimental_enable_get_next_as_optional = ( flags_obj.enable_get_next_as_optional) strategy_scope = distribution_utils.get_strategy_scope(strategy) distribution_utils.undo_set_up_synthetic_data() train_input_dataset, eval_input_dataset, tr_dataset, te_dataset = setup_datasets( flags_obj, shuffle=False) pred_input_dataset, pred_dataset = eval_input_dataset, te_dataset with strategy_scope: #model = build_model(tr_dataset.num_classes, mode='resnet50_features') model = build_model(100, mode='resnet50_features') load_path = GB_OPTIONS.pretrained_filepath if load_path is None: load_path = GB_OPTIONS.checkpoint_folder latest = tf.train.latest_checkpoint(load_path) print(latest) model.load_weights(latest) num_eval_examples = pred_dataset.num_examples_per_epoch() num_eval_steps = num_eval_examples // GB_OPTIONS.batch_size print(GB_OPTIONS.batch_size) pred = model.predict(pred_input_dataset, batch_size=GB_OPTIONS.batch_size, steps=num_eval_steps) lab = np.asarray(pred_dataset.data[1]) if hasattr(pred_dataset, 'ori_labels'): ori_lab = pred_dataset.ori_labels else: ori_lab = lab print(pred.shape) np.save('out_X', pred) np.save('out_labels', lab) np.save('out_ori_labels', ori_lab) return 'good'
def run_bert(strategy, input_meta_data, model_config, train_input_fn=None, eval_input_fn=None, init_checkpoint=None, custom_callbacks=None): """Run BERT training.""" if FLAGS.mode == 'export_only': # As Keras ModelCheckpoint callback used with Keras compile/fit() API # internally uses model.save_weights() to save checkpoints, we must # use model.load_weights() when Keras compile/fit() is used. export_classifier(FLAGS.model_export_path, input_meta_data, FLAGS.use_keras_compile_fit, model_config, FLAGS.model_dir) return if FLAGS.mode != 'train_and_eval': raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode) # Enables XLA in Session Config. Should not be set for TPU. keras_utils.set_config_v2(FLAGS.enable_xla) performance.set_mixed_precision_policy(common_flags.dtype()) epochs = FLAGS.num_train_epochs train_data_size = input_meta_data['train_data_size'] steps_per_epoch = int(train_data_size / FLAGS.train_batch_size) warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size) eval_steps = int( math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size)) if not strategy: raise ValueError('Distribution strategy has not been specified.') if not custom_callbacks: custom_callbacks = [] if FLAGS.log_steps: custom_callbacks.append( keras_utils.TimeHistory(batch_size=FLAGS.train_batch_size, log_steps=FLAGS.log_steps, logdir=FLAGS.model_dir)) trained_model = run_bert_classifier( strategy, model_config, input_meta_data, FLAGS.model_dir, epochs, steps_per_epoch, FLAGS.steps_per_loop, eval_steps, warmup_steps, FLAGS.learning_rate, init_checkpoint or FLAGS.init_checkpoint, train_input_fn, eval_input_fn, run_eagerly=FLAGS.run_eagerly, use_keras_compile_fit=FLAGS.use_keras_compile_fit, custom_callbacks=custom_callbacks) if FLAGS.model_export_path: # As Keras ModelCheckpoint callback used with Keras compile/fit() API # internally uses model.save_weights() to save checkpoints, we must # use model.load_weights() when Keras compile/fit() is used. model_saving_utils.export_bert_model( FLAGS.model_export_path, model=trained_model, restore_model_using_load_weights=FLAGS.use_keras_compile_fit) return trained_model
def __init__(self, flags_obj): """Init function of TransformerMain. Args: flags_obj: Object containing parsed flag values, i.e., FLAGS. Raises: ValueError: if not using static batch for input data on TPU. """ self.flags_obj = flags_obj self.predict_model = None # Add flag-defined parameters to params object num_gpus = flags_core.get_num_gpus(flags_obj) self.params = params = misc.get_model_params(flags_obj.param_set, num_gpus) params["vocab_file"] = flags_obj.vocab_file params["data_train"] = flags_obj.data_train params["data_dev"] = flags_obj.data_dev params["num_gpus"] = num_gpus params["use_ctl"] = flags_obj.use_ctl # params["data_dir"] = flags_obj.data_dir params["model_dir"] = flags_obj.model_dir params["static_batch"] = flags_obj.static_batch # params["max_length"] = flags_obj.max_length params["decode_batch_size"] = flags_obj.decode_batch_size # params["decode_max_length"] = flags_obj.decode_max_length # params["padded_decode"] = flags_obj.padded_decode params["max_io_parallelism"] = (flags_obj.num_parallel_calls or tf.data.experimental.AUTOTUNE) params["use_synthetic_data"] = flags_obj.use_synthetic_data params["batch_size"] = flags_obj.batch_size or params[ "default_batch_size"] params["repeat_dataset"] = None params["dtype"] = flags_core.get_tf_dtype(flags_obj) params["enable_tensorboard"] = flags_obj.enable_tensorboard params["steps_between_evals"] = flags_obj.steps_between_evals params["enable_checkpointing"] = flags_obj.enable_checkpointing params["save_weights_only"] = flags_obj.save_weights_only self.distribution_strategy = distribute_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu or "") if self.use_tpu: params[ "num_replicas"] = self.distribution_strategy.num_replicas_in_sync else: logging.info("Running transformer with num_gpus = %d", num_gpus) if self.distribution_strategy: logging.info("For training, using distribution strategy: %s", self.distribution_strategy) else: logging.info("Not using any distribution strategy.") performance.set_mixed_precision_policy(params["dtype"])
def run(flags_obj): """Run ResNet ImageNet training and eval loop using custom training loops. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_xla=flags_obj.enable_xla) performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj), use_experimental_api=False) if tf.config.list_physical_devices('GPU'): if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj. datasets_num_private_threads) common.set_cudnn_batchnorm_mode() data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.config.list_physical_devices('GPU') else 'channels_last') tf.keras.backend.set_image_data_format(data_format) strategy = distribute_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu) per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations( flags_obj) if flags_obj.steps_per_loop is None: steps_per_loop = per_epoch_steps elif flags_obj.steps_per_loop > per_epoch_steps: steps_per_loop = per_epoch_steps logging.warn('Setting steps_per_loop to %d to respect epoch boundary.', steps_per_loop) else: steps_per_loop = flags_obj.steps_per_loop logging.info( 'Training %d epochs, each epoch has %d steps, ' 'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps, train_epochs * per_epoch_steps, eval_steps) time_callback = keras_utils.TimeHistory( flags_obj.batch_size, flags_obj.log_steps, logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None) with distribute_utils.get_strategy_scope(strategy): runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback, per_epoch_steps) eval_interval = flags_obj.epochs_between_evals * per_epoch_steps checkpoint_interval = (steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None) summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None checkpoint_manager = tf.train.CheckpointManager( runnable.checkpoint, directory=flags_obj.model_dir, max_to_keep=10, step_counter=runnable.global_step, checkpoint_interval=checkpoint_interval) resnet_controller = orbit.Controller( strategy, runnable, runnable if not flags_obj.skip_eval else None, global_step=runnable.global_step, steps_per_loop=steps_per_loop, checkpoint_manager=checkpoint_manager, summary_interval=summary_interval, summary_dir=flags_obj.model_dir, eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval')) time_callback.on_train_begin() if not flags_obj.skip_eval: resnet_controller.train_and_evaluate(train_steps=per_epoch_steps * train_epochs, eval_steps=eval_steps, eval_interval=eval_interval) else: resnet_controller.train(steps=per_epoch_steps * train_epochs) time_callback.on_train_end() stats = build_stats(runnable, time_callback) return stats
def train_squad(strategy, input_meta_data, bert_config, custom_callbacks=None, run_eagerly=False, init_checkpoint=None, sub_model_export_name=None): """Run bert squad training.""" if strategy: logging.info('Training using customized training loop with distribution' ' strategy.') # Enables XLA in Session Config. Should not be set for TPU. keras_utils.set_session_config(FLAGS.enable_xla) performance.set_mixed_precision_policy(common_flags.dtype()) epochs = FLAGS.num_train_epochs num_train_examples = input_meta_data['train_data_size'] max_seq_length = input_meta_data['max_seq_length'] steps_per_epoch = int(num_train_examples / FLAGS.train_batch_size) warmup_steps = int(epochs * num_train_examples * 0.1 / FLAGS.train_batch_size) train_input_fn = get_dataset_fn( FLAGS.train_data_path, max_seq_length, FLAGS.train_batch_size, is_training=True) def _get_squad_model(): """Get Squad model and optimizer.""" squad_model, core_model = bert_models.squad_model( bert_config, max_seq_length, hub_module_url=FLAGS.hub_module_url, hub_module_trainable=FLAGS.hub_module_trainable) optimizer = optimization.create_optimizer(FLAGS.learning_rate, steps_per_epoch * epochs, warmup_steps, FLAGS.end_lr, FLAGS.optimizer_type) squad_model.optimizer = performance.configure_optimizer( optimizer, use_float16=common_flags.use_float16(), use_graph_rewrite=common_flags.use_graph_rewrite()) return squad_model, core_model # Only when explicit_allreduce = True, post_allreduce_callbacks and # allreduce_bytes_per_pack will take effect. optimizer.apply_gradients() no # longer implicitly allreduce gradients, users manually allreduce gradient and # pass the allreduced grads_and_vars to apply_gradients(). # With explicit_allreduce = True, clip_by_global_norm is moved to after # allreduce. model_training_utils.run_customized_training_loop( strategy=strategy, model_fn=_get_squad_model, loss_fn=get_loss_fn(), model_dir=FLAGS.model_dir, steps_per_epoch=steps_per_epoch, steps_per_loop=FLAGS.steps_per_loop, epochs=epochs, train_input_fn=train_input_fn, init_checkpoint=init_checkpoint or FLAGS.init_checkpoint, sub_model_export_name=sub_model_export_name, run_eagerly=run_eagerly, custom_callbacks=custom_callbacks, explicit_allreduce=FLAGS.explicit_allreduce, pre_allreduce_callbacks=[ model_training_utils.clip_by_global_norm_callback ], allreduce_bytes_per_pack=FLAGS.allreduce_bytes_per_pack)
def run(flags_obj): """Run ResNet ImageNet training and eval loop using native Keras APIs. Args: flags_obj: An object containing parsed flag values. Raises: ValueError: If fp16 is passed as it is not currently supported. NotImplementedError: If some features are not currently supported. Returns: Dictionary of training and eval stats. """ keras_utils.set_session_config(enable_eager=flags_obj.enable_eager, enable_xla=flags_obj.enable_xla) # Execute flag override logic for better model performance if flags_obj.tf_gpu_thread_mode: keras_utils.set_gpu_thread_mode_and_count( per_gpu_thread_count=flags_obj.per_gpu_thread_count, gpu_thread_mode=flags_obj.tf_gpu_thread_mode, num_gpus=flags_obj.num_gpus, datasets_num_private_threads=flags_obj.datasets_num_private_threads ) common.set_cudnn_batchnorm_mode() dtype = flags_core.get_tf_dtype(flags_obj) performance.set_mixed_precision_policy( flags_core.get_tf_dtype(flags_obj), flags_core.get_loss_scale(flags_obj, default_for_fp16=128)) data_format = flags_obj.data_format if data_format is None: data_format = ('channels_first' if tf.config.list_physical_devices('GPU') else 'channels_last') tf.keras.backend.set_image_data_format(data_format) # Configures cluster spec for distribution strategy. _ = distribution_utils.configure_cluster(flags_obj.worker_hosts, flags_obj.task_index) strategy = distribution_utils.get_distribution_strategy( distribution_strategy=flags_obj.distribution_strategy, num_gpus=flags_obj.num_gpus, all_reduce_alg=flags_obj.all_reduce_alg, num_packs=flags_obj.num_packs, tpu_address=flags_obj.tpu) if strategy: # flags_obj.enable_get_next_as_optional controls whether enabling # get_next_as_optional behavior in DistributedIterator. If true, last # partial batch can be supported. strategy.extended.experimental_enable_get_next_as_optional = ( flags_obj.enable_get_next_as_optional) strategy_scope = distribution_utils.get_strategy_scope(strategy) # pylint: disable=protected-access if flags_obj.use_synthetic_data: distribution_utils.set_up_synthetic_data() input_fn = common.get_synth_input_fn( height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE, num_channels=imagenet_preprocessing.NUM_CHANNELS, num_classes=imagenet_preprocessing.NUM_CLASSES, dtype=dtype, drop_remainder=True) else: distribution_utils.undo_set_up_synthetic_data() input_fn = imagenet_preprocessing.input_fn # When `enable_xla` is True, we always drop the remainder of the batches # in the dataset, as XLA-GPU doesn't support dynamic shapes. drop_remainder = flags_obj.enable_xla # Current resnet_model.resnet50 input format is always channel-last. # We use keras_application mobilenet model which input format is depends on # the keras beckend image data format. # This use_keras_image_data_format flags indicates whether image preprocessor # output format should be same as the keras backend image data format or just # channel-last format. use_keras_image_data_format = (flags_obj.model == 'mobilenet') train_input_dataset = input_fn( is_training=True, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, parse_record_fn=imagenet_preprocessing.get_parse_record_fn( use_keras_image_data_format=use_keras_image_data_format), datasets_num_private_threads=flags_obj.datasets_num_private_threads, dtype=dtype, drop_remainder=drop_remainder, tf_data_experimental_slack=flags_obj.tf_data_experimental_slack, training_dataset_cache=flags_obj.training_dataset_cache, ) eval_input_dataset = None if not flags_obj.skip_eval: eval_input_dataset = input_fn( is_training=False, data_dir=flags_obj.data_dir, batch_size=flags_obj.batch_size, parse_record_fn=imagenet_preprocessing.get_parse_record_fn( use_keras_image_data_format=use_keras_image_data_format), dtype=dtype, drop_remainder=drop_remainder) lr_schedule = common.PiecewiseConstantDecayWithWarmup( batch_size=flags_obj.batch_size, epoch_size=imagenet_preprocessing.NUM_IMAGES['train'], warmup_epochs=common.LR_SCHEDULE[0][1], boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]), multipliers=list(p[0] for p in common.LR_SCHEDULE), compute_lr_on_cpu=True) steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size) with strategy_scope: if flags_obj.optimizer == 'resnet50_default': optimizer = common.get_optimizer(lr_schedule) elif flags_obj.optimizer == 'mobilenet_default': initial_learning_rate = \ flags_obj.initial_learning_rate_per_sample * flags_obj.batch_size optimizer = tf.keras.optimizers.SGD( learning_rate=tf.keras.optimizers.schedules.ExponentialDecay( initial_learning_rate, decay_steps=steps_per_epoch * flags_obj.num_epochs_per_decay, decay_rate=flags_obj.lr_decay_factor, staircase=True), momentum=0.9) if flags_obj.fp16_implementation == 'graph_rewrite': # Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as # determined by flags_core.get_tf_dtype(flags_obj) would be 'float32' # which will ensure tf.compat.v2.keras.mixed_precision and # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double # up. optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( optimizer) # TODO(hongkuny): Remove trivial model usage and move it to benchmark. if flags_obj.use_trivial_model: model = test_utils.trivial_model( imagenet_preprocessing.NUM_CLASSES) elif flags_obj.model == 'resnet50_v1.5': model = resnet_model.resnet50( num_classes=imagenet_preprocessing.NUM_CLASSES) elif flags_obj.model == 'mobilenet': # TODO(kimjaehong): Remove layers attribute when minimum TF version # support 2.0 layers by default. model = tf.keras.applications.mobilenet.MobileNet( weights=None, classes=imagenet_preprocessing.NUM_CLASSES, layers=tf.keras.layers) if flags_obj.pretrained_filepath: model.load_weights(flags_obj.pretrained_filepath) if flags_obj.pruning_method == 'polynomial_decay': if dtype != tf.float32: raise NotImplementedError( 'Pruning is currently only supported on dtype=tf.float32.') pruning_params = { 'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay( initial_sparsity=flags_obj.pruning_initial_sparsity, final_sparsity=flags_obj.pruning_final_sparsity, begin_step=flags_obj.pruning_begin_step, end_step=flags_obj.pruning_end_step, frequency=flags_obj.pruning_frequency), } model = tfmot.sparsity.keras.prune_low_magnitude( model, **pruning_params) elif flags_obj.pruning_method: raise NotImplementedError( 'Only polynomial_decay is currently supported.') model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=(['sparse_categorical_accuracy'] if flags_obj.report_accuracy_metrics else None), run_eagerly=flags_obj.run_eagerly) train_epochs = flags_obj.train_epochs callbacks = common.get_callbacks( steps_per_epoch=steps_per_epoch, pruning_method=flags_obj.pruning_method, enable_checkpoint_and_export=flags_obj.enable_checkpoint_and_export, model_dir=flags_obj.model_dir) # if mutliple epochs, ignore the train_steps flag. if train_epochs <= 1 and flags_obj.train_steps: steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch) train_epochs = 1 num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size) validation_data = eval_input_dataset if flags_obj.skip_eval: # Only build the training graph. This reduces memory usage introduced by # control flow ops in layers that have different implementations for # training and inference (e.g., batch norm). if flags_obj.set_learning_phase_to_train: # TODO(haoyuzhang): Understand slowdown of setting learning phase when # not using distribution strategy. tf.keras.backend.set_learning_phase(1) num_eval_steps = None validation_data = None if not strategy and flags_obj.explicit_gpu_placement: # TODO(b/135607227): Add device scope automatically in Keras training loop # when not using distribition strategy. no_dist_strat_device = tf.device('/device:GPU:0') no_dist_strat_device.__enter__() history = model.fit(train_input_dataset, epochs=train_epochs, steps_per_epoch=steps_per_epoch, callbacks=callbacks, validation_steps=num_eval_steps, validation_data=validation_data, validation_freq=flags_obj.epochs_between_evals, verbose=2) eval_output = None if not flags_obj.skip_eval: eval_output = model.evaluate(eval_input_dataset, steps=num_eval_steps, verbose=2) if flags_obj.pruning_method: model = tfmot.sparsity.keras.strip_pruning(model) if flags_obj.enable_checkpoint_and_export: if dtype == tf.bfloat16: logging.warning( 'Keras model.save does not support bfloat16 dtype.') else: # Keras model.save assumes a float32 input designature. export_path = os.path.join(flags_obj.model_dir, 'saved_model') model.save(export_path, include_optimizer=False) if not strategy and flags_obj.explicit_gpu_placement: no_dist_strat_device.__exit__() stats = common.build_stats(history, eval_output, callbacks) return stats