def main(_):
  """Entry point: configure, run the experiment and export SavedModels.

  Parses the gin files/bindings named by FLAGS, builds the experiment params,
  sets up the distribution strategy, creates the task and trainer inside the
  strategy scope and hands them to `train_lib.run_experiment`. When the mode
  contains "train", the resulting model is exported under
  `<model_dir>/saved_models/latest`; if a best-checkpoint exporter exists, the
  best checkpoint is restored and exported under `<model_dir>/saved_models/best`.

  Args:
    _: Unused positional argument.
  """
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir

  is_training = "train" in FLAGS.mode
  if is_training:
    train_utils.serialize_config(params, model_dir)

  precision_dtype = params.runtime.mixed_precision_dtype
  if precision_dtype:
    performance.set_mixed_precision_policy(precision_dtype)

  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu,
      **params.runtime.model_parallelism())

  with distribution_strategy.scope():
    # Pick the task flavour requested by the config.
    task_cls = (
        ap_parsing_task.APParsingTaskCRF
        if params.task.use_crf else ap_parsing_task.APParsingTaskBase)
    task = task_cls(params.task)

    ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter(
        params, model_dir)
    trainer = train_utils.create_trainer(
        params,
        task,
        train=is_training,
        evaluate=("eval" in FLAGS.mode),
        checkpoint_exporter=ckpt_exporter)

  model, _ = train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      trainer=trainer,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)

  # Nothing to export unless we trained.
  if not is_training:
    return

  # Export the model as it stands at the end of the run.
  latest_path = os.path.join(model_dir, "saved_models/latest")
  logging.info("Exporting SavedModel to %s", latest_path)
  tf.saved_model.save(model, latest_path)

  if not ckpt_exporter:
    return

  logging.info("Loading best checkpoint for export")
  trainer.checkpoint.restore(ckpt_exporter.best_ckpt_path)
  best_path = os.path.join(model_dir, "saved_models/best")

  # Make sure restored and not re-initialized.
  if trainer.global_step > 0:
    logging.info(
        "Exporting best saved model by %s (from global step: %d) to %s",
        params.trainer.best_checkpoint_eval_metric,
        trainer.global_step.numpy(), best_path)
    tf.saved_model.save(trainer.model, best_path)
def _build_trainer(self, task: base_task.Task, train: bool,
                   evaluate: bool) -> base_trainer.Trainer:
  """Creates the trainer for `task` inside this object's strategy scope.

  Args:
    task: The task the trainer will run.
    train: Forwarded to `train_utils.create_trainer` as `train`.
    evaluate: Forwarded to `train_utils.create_trainer` as `evaluate`.

  Returns:
    The trainer produced by `train_utils.create_trainer`.
  """
  with self.strategy.scope():
    # The exporter is built inside the scope as well, same as the trainer.
    best_ckpt_exporter = self._build_best_checkpoint_exporter()
    return train_utils.create_trainer(
        self.params,
        task,
        train=train,
        evaluate=evaluate,
        checkpoint_exporter=best_ckpt_exporter)
def run_experiment(
    distribution_strategy: tf.distribute.Strategy,
    task: base_task.Task,
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True
) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
  """Runs train/eval configured by the experiment params.

  A trainer is created under the strategy scope, optionally paired with a
  `tf.train.CheckpointManager`, and driven by an `orbit.Controller` according
  to `mode`.

  Args:
    distribution_strategy: The `tf.distribute.Strategy` to run under.
    task: A Task instance.
    mode: A 'str', specifying the mode. Can be 'train', 'eval',
      'train_and_eval' or 'continuous_eval'.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.
    save_summary: Whether to save train and validation summary.

  Returns:
    A 2-tuple of (model, eval_logs).
      model: `tf.keras.Model` instance.
      eval_logs: returns eval metrics logs when run_post_eval is set to True,
        otherwise, returns {}.

  Raises:
    NotImplementedError: If `mode` is not one of the supported modes.
  """
  with distribution_strategy.scope():
    is_training = 'train' in mode
    trainer = train_utils.create_trainer(
        params,
        task,
        model_dir=model_dir,
        train=is_training,
        evaluate=('eval' in mode) or run_post_eval,
        checkpoint_exporter=maybe_create_best_ckpt_exporter(params, model_dir))

  trainer_config = params.trainer

  # Only manage checkpoints when the trainer actually exposes one.
  checkpoint_manager = None
  if trainer.checkpoint:
    checkpoint_manager = tf.train.CheckpointManager(
        trainer.checkpoint,
        directory=model_dir,
        max_to_keep=trainer_config.max_to_keep,
        step_counter=trainer.global_step,
        checkpoint_interval=trainer_config.checkpoint_interval,
        init_fn=trainer.initialize)

  # Summary settings are all-or-nothing, controlled by `save_summary`.
  if save_summary:
    train_summary_dir = os.path.join(model_dir, 'train')
    eval_summary_dir = os.path.join(model_dir, 'validation')
    summary_interval = trainer_config.summary_interval
  else:
    train_summary_dir = None
    eval_summary_dir = None
    summary_interval = None

  controller = orbit.Controller(
      distribution_strategy,
      trainer=trainer if is_training else None,
      evaluator=trainer,
      global_step=trainer.global_step,
      steps_per_loop=trainer_config.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=train_summary_dir,
      eval_summary_dir=eval_summary_dir,
      summary_interval=summary_interval)

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=trainer_config.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=trainer_config.train_steps,
          eval_steps=trainer_config.validation_steps,
          eval_interval=trainer_config.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=trainer_config.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        # Stop waiting for new checkpoints once training has reached its end.
        return bool(trainer.global_step.numpy() >= trainer_config.train_steps)

      controller.evaluate_continuously(
          steps=trainer_config.validation_steps,
          timeout=trainer_config.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)

  if not run_post_eval:
    return trainer.model, {}

  with distribution_strategy.scope():
    eval_logs = trainer.evaluate(
        tf.convert_to_tensor(trainer_config.validation_steps))
    return trainer.model, eval_logs
def run_experiment(
    distribution_strategy: tf.distribute.Strategy,
    task: base_task.Task,
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True,
    trainer: Optional[base_trainer.Trainer] = None,
    controller_cls=orbit.Controller
) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
  """Runs train/eval configured by the experiment params.

  Reuses the given trainer or creates one under the strategy scope, optionally
  pairs it with a `tf.train.CheckpointManager`, and drives a `controller_cls`
  instance according to `mode`. Parameter and FLOP counts are logged when the
  corresponding `train_utils.try_count_*` helper returns a value.

  Args:
    distribution_strategy: The `tf.distribute.Strategy` to run under.
    task: A Task instance.
    mode: A 'str', specifying the mode. Can be 'train', 'eval',
      'train_and_eval' or 'continuous_eval'.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.
    save_summary: Whether to save train and validation summary.
    trainer: the base_trainer.Trainer instance. It should be created within
      the strategy.scope().
    controller_cls: The controller class to manage the train and eval process.
      Must be an orbit.Controller subclass.

  Returns:
    A 2-tuple of (model, eval_logs).
      model: `tf.keras.Model` instance.
      eval_logs: returns eval metrics logs when run_post_eval is set to True,
        otherwise, returns {}.

  Raises:
    ValueError: If the trainer has a checkpoint but `model_dir` is None.
    NotImplementedError: If `mode` is not one of the supported modes.
  """
  is_training = 'train' in mode

  with distribution_strategy.scope():
    if not trainer:
      trainer = train_utils.create_trainer(
          params,
          task,
          train=is_training,
          evaluate=('eval' in mode) or run_post_eval,
          checkpoint_exporter=maybe_create_best_ckpt_exporter(
              params, model_dir))

  trainer_config = params.trainer

  # Only manage checkpoints when the trainer actually exposes one.
  checkpoint_manager = None
  if trainer.checkpoint:
    if model_dir is None:
      raise ValueError('model_dir must be specified, but got None')
    checkpoint_manager = tf.train.CheckpointManager(
        trainer.checkpoint,
        directory=model_dir,
        max_to_keep=trainer_config.max_to_keep,
        step_counter=trainer.global_step,
        checkpoint_interval=trainer_config.checkpoint_interval,
        init_fn=trainer.initialize)

  # Summary settings are all-or-nothing, controlled by `save_summary`.
  if save_summary:
    train_summary_dir = os.path.join(model_dir, 'train')
    eval_summary_dir = os.path.join(
        model_dir, trainer_config.validation_summary_subdir)
    summary_interval = trainer_config.summary_interval
  else:
    train_summary_dir = None
    eval_summary_dir = None
    summary_interval = None

  controller = controller_cls(
      strategy=distribution_strategy,
      trainer=trainer if is_training else None,
      evaluator=trainer,
      global_step=trainer.global_step,
      steps_per_loop=trainer_config.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=train_summary_dir,
      eval_summary_dir=eval_summary_dir,
      summary_interval=summary_interval,
      train_actions=actions.get_train_actions(
          params, trainer, model_dir, checkpoint_manager=checkpoint_manager),
      eval_actions=actions.get_eval_actions(params, trainer, model_dir))

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    if mode == 'train':
      controller.train(steps=trainer_config.train_steps)
    elif mode == 'train_and_eval':
      controller.train_and_evaluate(
          train_steps=trainer_config.train_steps,
          eval_steps=trainer_config.validation_steps,
          eval_interval=trainer_config.validation_interval)
    elif mode == 'eval':
      controller.evaluate(steps=trainer_config.validation_steps)
    elif mode == 'continuous_eval':

      def timeout_fn():
        # Stop waiting for new checkpoints once training has reached its end.
        return bool(trainer.global_step.numpy() >= trainer_config.train_steps)

      controller.evaluate_continuously(
          steps=trainer_config.validation_steps,
          timeout=trainer_config.continuous_eval_timeout,
          timeout_fn=timeout_fn)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)

  # Both counters may return None; in that case nothing is logged.
  num_params = train_utils.try_count_params(trainer.model)
  if num_params is not None:
    logging.info('Number of trainable params in model: %f Millions.',
                 num_params / 10.**6)

  flops = train_utils.try_count_flops(trainer.model)
  if flops is not None:
    logging.info('FLOPs (multi-adds) in model: %f Billions.',
                 flops / 10.**9 / 2)

  if not run_post_eval:
    return trainer.model, {}

  with distribution_strategy.scope():
    eval_logs = trainer.evaluate(
        tf.convert_to_tensor(trainer_config.validation_steps))
    return trainer.model, eval_logs
def run_experiment(
    distribution_strategy: tf.distribute.Strategy,
    task: base_task.Task,
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    save_summary: bool = True,
    trainer: Optional[base_trainer.Trainer] = None
) -> Tuple[tf.keras.Model, Mapping[str, Any]]:
  """Runs train/eval configured by the experiment params.

  Reuses the given trainer or creates one under the strategy scope, optionally
  pairs it with a `tf.train.CheckpointManager` (registering recovery handling
  on the trainer), and drives an `orbit.Controller` according to `mode`. When
  `params.runtime.dump_config` is set, the run is wrapped in `dump_callback`.

  Args:
    distribution_strategy: The `tf.distribute.Strategy` to run under.
    task: A Task instance.
    mode: A 'str', specifying the mode. Can be 'train', 'eval',
      'train_and_eval' or 'continuous_eval'.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.
    save_summary: Whether to save train and validation summary.
    trainer: the base_trainer.Trainer instance. It should be created within
      the strategy.scope().

  Returns:
    A 2-tuple of (model, eval_logs).
      model: `tf.keras.Model` instance.
      eval_logs: returns eval metrics logs when run_post_eval is set to True,
        otherwise, returns {}.

  Raises:
    NotImplementedError: If `mode` is not one of the supported modes.
  """
  is_training = 'train' in mode

  with distribution_strategy.scope():
    if not trainer:
      trainer = train_utils.create_trainer(
          params,
          task,
          train=is_training,
          evaluate=('eval' in mode) or run_post_eval,
          checkpoint_exporter=maybe_create_best_ckpt_exporter(
              params, model_dir))

  trainer_config = params.trainer

  # Only manage checkpoints when the trainer actually exposes one.
  checkpoint_manager = None
  if trainer.checkpoint:
    checkpoint_manager = tf.train.CheckpointManager(
        trainer.checkpoint,
        directory=model_dir,
        max_to_keep=trainer_config.max_to_keep,
        step_counter=trainer.global_step,
        checkpoint_interval=trainer_config.checkpoint_interval,
        init_fn=trainer.initialize)
    # Adds recovery handling.
    trainer.add_recovery(trainer_config, checkpoint_manager=checkpoint_manager)

  # Create logs matching tensorboard log parser format,
  # see tensorboard_for_parser.md.
  hparams = {
      "batch_size": params.task.train_data.global_batch_size,
      "precision": params.runtime.mixed_precision_dtype
  }

  # Summary settings are all-or-nothing, controlled by `save_summary`.
  if save_summary:
    train_summary_dir = model_dir
    eval_summary_dir = os.path.join(
        model_dir, trainer_config.validation_summary_subdir)
    summary_interval = trainer_config.summary_interval
    summary_hparams = hparams
  else:
    train_summary_dir = None
    eval_summary_dir = None
    summary_interval = None
    summary_hparams = None

  controller = orbit.Controller(
      strategy=distribution_strategy,
      trainer=trainer if is_training else None,
      evaluator=trainer,
      global_step=trainer.global_step,
      steps_per_loop=trainer_config.steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_dir=train_summary_dir,
      eval_summary_dir=eval_summary_dir,
      summary_interval=summary_interval,
      hparams=summary_hparams,
      train_actions=None,
      eval_actions=actions.get_eval_actions(params, trainer, model_dir))

  logging.info('Starts to execute mode: %s', mode)
  with distribution_strategy.scope():
    # Wrap the run in the dump callback only when a dump config is given;
    # otherwise use a no-op context manager.
    if params.runtime.dump_config:
      from TensorFlow.common.debug import dump_callback
      run_context = dump_callback(params.runtime.dump_config)
    else:
      run_context = contextlib.ExitStack()

    with run_context:
      if mode == 'train':
        controller.train(steps=trainer_config.train_steps)
      elif mode == 'train_and_eval':
        controller.train_and_evaluate(
            train_steps=trainer_config.train_steps,
            eval_steps=trainer_config.validation_steps,
            eval_interval=trainer_config.validation_interval)
      elif mode == 'eval':
        controller.evaluate(steps=trainer_config.validation_steps)
      elif mode == 'continuous_eval':

        def timeout_fn():
          # Stop waiting for new checkpoints once training has reached its end.
          return bool(
              trainer.global_step.numpy() >= trainer_config.train_steps)

        controller.evaluate_continuously(
            steps=trainer_config.validation_steps,
            timeout=trainer_config.continuous_eval_timeout,
            timeout_fn=timeout_fn)
      else:
        raise NotImplementedError('The mode is not implemented: %s' % mode)

  # The counter may return None; in that case nothing is logged.
  num_params = train_utils.try_count_params(trainer.model)
  if num_params is not None:
    logging.info('Number of trainable params in model: %f Millions.',
                 num_params / 10.**6)

  if not run_post_eval:
    return trainer.model, {}

  with distribution_strategy.scope():
    eval_logs = trainer.evaluate(
        tf.convert_to_tensor(trainer_config.validation_steps))
    return trainer.model, eval_logs