def test_end_to_end_multi_eval(self, distribution_strategy, flag_mode):
  model_dir = self.get_temp_dir()
  experiment_config = configs.MultiEvalExperimentConfig(
      task=test_utils.FooConfig(),
      eval_tasks=(configs.TaskRoutine(
          task_name='foo', task_config=test_utils.FooConfig(), eval_steps=2),
                  configs.TaskRoutine(
                      task_name='bar',
                      task_config=test_utils.BarConfig(),
                      eval_steps=3)))
  experiment_config = params_dict.override_params_dict(
      experiment_config, self._test_config, is_strict=False)
  with distribution_strategy.scope():
    train_task = task_factory.get_task(experiment_config.task)
    eval_tasks = [
        task_factory.get_task(config.task_config, name=config.task_name)
        for config in experiment_config.eval_tasks
    ]
    train_lib.run_experiment_with_multitask_eval(
        distribution_strategy=distribution_strategy,
        train_task=train_task,
        eval_tasks=eval_tasks,
        mode=flag_mode,
        params=experiment_config,
        model_dir=model_dir)
def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval):
  model_dir = self.get_temp_dir()
  flags_dict = dict(
      experiment='mock',
      mode=flag_mode,
      model_dir=model_dir,
      params_override=json.dumps(self._test_config))
  with flagsaver.flagsaver(**flags_dict):
    params = train_utils.parse_configuration(flags.FLAGS)
    train_utils.serialize_config(params, model_dir)
    with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=model_dir)

    logs = train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode=flag_mode,
        params=params,
        model_dir=model_dir,
        run_post_eval=run_post_eval)

    if run_post_eval:
      self.assertNotEmpty(logs)
    else:
      self.assertEmpty(logs)
    self.assertNotEmpty(
        tf.io.gfile.glob(os.path.join(model_dir, 'params.yaml')))
    if flag_mode != 'eval':
      self.assertNotEmpty(
          tf.io.gfile.glob(os.path.join(model_dir, 'checkpoint')))
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)

  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
def load_model(experiment="yolo_custom", config_path=[], model_dir=""):
  CFG = train_utils.ParseConfigOptions(
      experiment=experiment, config_file=config_path)
  params = train_utils.parse_configuration(CFG)

  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale)

  task = task_factory.get_task(params.task, logging_dir=model_dir)
  model = task.build_model()

  if model_dir is not None and model_dir != "":
    optimizer = task.create_optimizer(params.trainer.optimizer_config,
                                      params.runtime)
    # optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
    #     tf.keras.optimizers.SGD(), dynamic=True)
    ckpt = tf.train.Checkpoint(model=model, optimizer=optimizer)
    status = ckpt.restore(tf.train.latest_checkpoint(model_dir))
    status.expect_partial().assert_existing_objects_matched()
    print(dir(status), status)
  else:
    task.initialize(model)

  return task, model
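# Usage sketch (an assumption, not part of the original snippet): restoring a
# trained model for inference with `load_model` above. The experiment name,
# config path, and checkpoint directory below are hypothetical placeholders.
def _example_restore_for_inference():
  # `config_path` takes a list of YAML config files; the path is illustrative.
  task, model = load_model(
      experiment="yolo_custom",
      config_path=["configs/yolo_custom.yaml"],  # hypothetical config file
      model_dir="/tmp/yolo_ckpts")  # hypothetical checkpoint directory
  return task, model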
def build_experiment_model(experiment_type):
  """Builds model from experiment type configuration."""
  params = exp_factory.get_exp_config(experiment_type)
  params.validate()
  params.lock()
  task = task_factory.get_task(params.task)
  return task.build_model()
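# Usage sketch (an assumption, not from the original source): `build_experiment_model`
# only needs the name of an experiment registered with `exp_factory`. The
# experiment name below is illustrative; substitute any registered experiment.
def _example_build_and_inspect():
  model = build_experiment_model('resnet_imagenet')  # hypothetical experiment name
  model.summary()  # prints the Keras model architecture
  return model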
def main(argv):
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  superglue_flags.validate_flags(FLAGS, file_exists_fn=tf.io.gfile.exists)

  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)

  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)

  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))

  with distribution_strategy.scope():
    task = None
    if 'train_eval' in FLAGS.mode:
      logging.info('Starting training and eval...')
      logging.info('Model dir: %s', FLAGS.model_dir)

      exp_config = _get_exp_config(
          input_meta_data=input_meta_data, exp_config_files=FLAGS.config_file)
      train_utils.serialize_config(exp_config, FLAGS.model_dir)
      task = task_factory.get_task(exp_config.task, logging_dir=FLAGS.model_dir)
      train_lib.run_experiment(
          distribution_strategy=distribution_strategy,
          task=task,
          mode='train_and_eval',
          params=exp_config,
          model_dir=FLAGS.model_dir)

    if 'predict' in FLAGS.mode:
      logging.info('Starting predict...')
      # When mode is `predict`, `task` will be None.
      if task is None:
        exp_config = _get_exp_config(
            input_meta_data=input_meta_data,
            exp_config_files=[os.path.join(FLAGS.model_dir, 'params.yaml')])
        task = task_factory.get_task(
            exp_config.task, logging_dir=FLAGS.model_dir)
      _write_submission_file(task, input_meta_data['max_seq_length'])
def testContinuousFinetune(self):
  pretrain_steps = 1
  src_model_dir = self.get_temp_dir()
  flags_dict = dict(
      experiment='mock',
      mode='continuous_train_and_eval',
      model_dir=self._model_dir,
      params_override={
          'task': {
              'init_checkpoint': src_model_dir,
          },
          'trainer': {
              'continuous_eval_timeout': 1,
              'steps_per_loop': 1,
              'train_steps': 1,
              'validation_steps': 1,
              'best_checkpoint_export_subdir': 'best_ckpt',
              'best_checkpoint_eval_metric': 'acc',
              'optimizer_config': {
                  'optimizer': {
                      'type': 'sgd'
                  },
                  'learning_rate': {
                      'type': 'constant'
                  }
              }
          }
      })

  with flagsaver.flagsaver(**flags_dict):
    # Train and save some checkpoints.
    params = train_utils.parse_configuration(flags.FLAGS)
    distribution_strategy = tf.distribute.get_strategy()
    with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=src_model_dir)
    _ = train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode='train',
        params=params,
        model_dir=src_model_dir)

    params = train_utils.parse_configuration(FLAGS)
    eval_metrics = continuous_finetune_lib.run_continuous_finetune(
        FLAGS.mode,
        params,
        FLAGS.model_dir,
        run_post_eval=True,
        pretrain_steps=pretrain_steps)
    self.assertIn('best_acc', eval_metrics)

    self.assertFalse(
        tf.io.gfile.exists(os.path.join(FLAGS.model_dir, 'checkpoint')))
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  if 'train_and_eval' in FLAGS.mode:
    assert (params.task.train_data.feature_shape ==
            params.task.validation_data.feature_shape), (
                f'train {params.task.train_data.feature_shape} != validate '
                f'{params.task.validation_data.feature_shape}')

  if 'assemblenet' in FLAGS.experiment:
    if 'eval' in FLAGS.mode:
      # Use the feature shape in validation_data for all jobs. The number of
      # frames in train_data will be used to construct the Assemblenet model.
      params.task.model.backbone.assemblenet.num_frames = (
          params.task.validation_data.feature_shape[0])
      shape = params.task.validation_data.feature_shape
    else:
      params.task.model.backbone.assemblenet.num_frames = (
          params.task.train_data.feature_shape[0])
      shape = params.task.train_data.feature_shape
    logging.info('mode %r num_frames %r feature shape %r', FLAGS.mode,
                 params.task.model.backbone.assemblenet.num_frames, shape)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype)

  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None):
  tasks = {}
  task_eval_steps = {}
  task_weights = {}
  for task_routine in config.task_routines:
    task_name = task_routine.task_name
    tasks[task_name] = task_factory.get_task(
        task_routine.task_config, logging_dir=logging_dir)
    task_eval_steps[task_name] = task_routine.eval_steps
    task_weights[task_name] = task_routine.task_weight
  return cls(
      tasks, task_eval_steps=task_eval_steps, task_weights=task_weights)
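# Usage sketch (an assumption): building the multi-task container from a
# MultiTaskConfig with two routines. `FooConfig`/`BarConfig` mirror the test
# configs used elsewhere in this collection, and `multitask.MultiTask` is the
# class that defines `from_config` above; the logging directory is a placeholder.
def _example_multitask_from_config():
  multi_task_config = configs.MultiTaskConfig(
      task_routines=(
          configs.TaskRoutine(
              task_name='foo',
              task_config=test_utils.FooConfig(),
              eval_steps=2,
              task_weight=1.0),
          configs.TaskRoutine(
              task_name='bar',
              task_config=test_utils.BarConfig(),
              eval_steps=3,
              task_weight=0.5),
      ))
  return multitask.MultiTask.from_config(
      multi_task_config, logging_dir='/tmp/multitask')  # hypothetical dir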
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale,
                                           use_experimental_api=True)

  input_partition_dims = None
  if FLAGS.mode == 'train_and_eval':
    if np.prod(params.task.train_input_partition_dims) != np.prod(
        params.task.eval_input_partition_dims):
      raise ValueError('Train and eval input partition dims can not be '
                       'partitioned on the same node')
    else:
      input_partition_dims = get_computation_shape_for_model_parallelism(
          params.task.train_input_partition_dims)
  elif FLAGS.mode == 'train':
    if params.task.train_input_partition_dims:
      input_partition_dims = get_computation_shape_for_model_parallelism(
          params.task.train_input_partition_dims)
  elif FLAGS.mode == 'eval' or FLAGS.mode == 'continuous_eval':
    if params.task.eval_input_partition_dims:
      input_partition_dims = get_computation_shape_for_model_parallelism(
          params.task.eval_input_partition_dims)

  distribution_strategy = create_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      num_gpus=params.runtime.num_gpus,
      input_partition_dims=input_partition_dims,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)
def _build_experiment_model(experiment_type):
  """Builds model from experiment type configuration w/o loading checkpoint.

  To reduce test latency and avoid unexpected errors (e.g. checkpoint files
  do not exist in the dedicated path), we skip the checkpoint loading for
  the tests.

  Args:
    experiment_type: model type for the experiment.

  Returns:
    TF/Keras model for the task.
  """
  params = exp_factory.get_exp_config(experiment_type)
  if 'deeplabv3plus_mobilenet_edgetpuv2' in experiment_type:
    params.task.model.backbone.mobilenet_edgetpu.pretrained_checkpoint_path = None
  if 'autoseg_edgetpu' in experiment_type:
    params.task.model.model_params.model_weights_path = None
  params.validate()
  params.lock()
  task = task_factory.get_task(params.task)
  return task.build_model()
def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval):
  model_dir = self.get_temp_dir()
  experiment_config = cfg.ExperimentConfig(
      trainer=prog_trainer_lib.ProgressiveTrainerConfig(),
      task=ProgTaskConfig())
  experiment_config = params_dict.override_params_dict(
      experiment_config, self._test_config, is_strict=False)

  with distribution_strategy.scope():
    task = task_factory.get_task(experiment_config.task, logging_dir=model_dir)

  _, logs = train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=flag_mode,
      params=experiment_config,
      model_dir=model_dir,
      run_post_eval=run_post_eval)

  if run_post_eval:
    self.assertNotEmpty(logs)
  else:
    self.assertEmpty(logs)

  if flag_mode == 'eval':
    return
  self.assertNotEmpty(
      tf.io.gfile.glob(os.path.join(model_dir, 'checkpoint')))
  # Tests continuous evaluation.
  _, logs = train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode='continuous_eval',
      params=experiment_config,
      model_dir=model_dir,
      run_post_eval=run_post_eval)
  print(logs)
def test_recovery(self, distribution_strategy, flag_mode):
  loss_threshold = 1.0
  model_dir = self.get_temp_dir()
  flags_dict = dict(
      experiment='mock',
      mode=flag_mode,
      model_dir=model_dir,
      params_override=json.dumps(self._test_config))
  with flagsaver.flagsaver(**flags_dict):
    params = train_utils.parse_configuration(flags.FLAGS)
    params.trainer.loss_upper_bound = loss_threshold
    params.trainer.recovery_max_trials = 1
    train_utils.serialize_config(params, model_dir)
    with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=model_dir)

      # Saves a checkpoint for reference.
      model = task.build_model()
      checkpoint = tf.train.Checkpoint(model=model)
      checkpoint_manager = tf.train.CheckpointManager(
          checkpoint, self.get_temp_dir(), max_to_keep=2)
      checkpoint_manager.save()
      before_weights = model.get_weights()

      def build_losses(labels, model_outputs, aux_losses=None):
        del labels, model_outputs
        return tf.constant([loss_threshold], tf.float32) + aux_losses

      task.build_losses = build_losses

      model, _ = train_lib.run_experiment(
          distribution_strategy=distribution_strategy,
          task=task,
          mode=flag_mode,
          params=params,
          model_dir=model_dir)
      after_weights = model.get_weights()
      for left, right in zip(before_weights, after_weights):
        self.assertAllEqual(left, right)
def run_continuous_finetune(
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
) -> Mapping[str, Any]:
  """Run modes with continuous training.

  Currently only supports continuous_train_and_eval.

  Args:
    mode: A 'str', specifying the mode. continuous_train_and_eval - monitors a
      checkpoint directory. Once a new checkpoint is discovered, loads the
      checkpoint, finetune the model by training it (probably on another
      dataset or with another task), then evaluate the finetuned model.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.

  Returns:
    eval logs: returns eval metrics logs when run_post_eval is set to True,
      otherwise, returns {}.
  """
  assert mode == 'continuous_train_and_eval', (
      'Only continuous_train_and_eval is supported by continuous_finetune. '
      'Got mode: {}'.format(mode))

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale)
  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  retry_times = 0
  while not tf.io.gfile.isdir(params.task.init_checkpoint):
    # Wait for the init_checkpoint directory to be created.
    if retry_times >= 60:
      raise ValueError(
          'ExperimentConfig.task.init_checkpoint must be a directory for '
          'continuous_train_and_eval mode.')
    retry_times += 1
    time.sleep(60)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'eval'))

  for pretrain_ckpt in tf.train.checkpoints_iterator(
      checkpoint_dir=params.task.init_checkpoint,
      min_interval_secs=10,
      timeout=params.trainer.continuous_eval_timeout):
    with distribution_strategy.scope():
      global_step = train_utils.read_global_step_from_checkpoint(pretrain_ckpt)

    if params.trainer.best_checkpoint_export_subdir:
      best_ckpt_subdir = '{}_{}'.format(
          params.trainer.best_checkpoint_export_subdir, global_step)
      params_replaced = params.replace(
          task={'init_checkpoint': pretrain_ckpt},
          trainer={'best_checkpoint_export_subdir': best_ckpt_subdir})
    else:
      params_replaced = params.replace(task={'init_checkpoint': pretrain_ckpt})
    params_replaced.lock()
    logging.info('Running finetuning with params: %s', params_replaced)

    with distribution_strategy.scope():
      task = task_factory.get_task(params_replaced.task, logging_dir=model_dir)
      _, eval_metrics = train_lib.run_experiment(
          distribution_strategy=distribution_strategy,
          task=task,
          mode='train_and_eval',
          # replace params.task.init_checkpoint to make sure that we load
          # exactly this pretrain checkpoint.
          params=params_replaced,
          model_dir=model_dir,
          run_post_eval=True,
          save_summary=False)
    logging.info('Evaluation finished. Pretrain global_step: %d', global_step)
    train_utils.write_json_summary(model_dir, global_step, eval_metrics)

    if not os.path.basename(model_dir):  # if model_dir.endswith('/')
      summary_grp = os.path.dirname(model_dir) + '_' + task.__class__.__name__
    else:
      summary_grp = os.path.basename(model_dir) + '_' + task.__class__.__name__
    summaries = {}
    for name, value in eval_metrics.items():
      summaries[summary_grp + '/' + name] = value
    train_utils.write_summary(summary_writer, global_step, summaries)

    train_utils.remove_ckpts(model_dir)

  if run_post_eval:
    return eval_metrics
  return {}
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)

  if params.runtime.num_hpus > 0:
    import os
    # TODO: remove when SW-49334 is fixed [SW-49404]
    os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
    from habana_frameworks.tensorflow import load_habana_module
    load_habana_module()

  if (params.task.train_data.deterministic or
      params.task.validation_data.deterministic):
    import os
    os.environ['PYTHONHASHSEED'] = '0'
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    import numpy
    numpy.random.seed(0)
    import tensorflow as tf
    tf.random.set_seed(0)
    tf.compat.v1.set_random_seed(0)
    import random
    random.seed(0)

  if FLAGS.dtype == "bf16":
    print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
    os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

  hls_addresses = str(os.environ.get("MULTI_HLS_IPS", "127.0.0.1")).split(",")
  TF_BASE_PORT = 2410
  mpi_rank = comm_rank()
  mpi_size = comm_size()

  if params.runtime.num_hpus > 1:
    model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
  else:
    model_dir = FLAGS.model_dir

  # Prepare a comma-separated list of device addresses.
  worker_list = []
  for address in hls_addresses:
    for rank in range(mpi_size // len(hls_addresses)):
      worker_list.append(address + ':' + str(TF_BASE_PORT + rank))
  worker_hosts = ",".join(worker_list)
  task_index = mpi_rank

  # Configures cluster spec for distribution strategy.
  distribution_utils.configure_cluster(worker_hosts, task_index)

  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(
        params.runtime.mixed_precision_dtype)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      num_hpus=params.runtime.num_hpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)
def test_task_factory(self):
  config = exp_factory.get_exp_config('video_ssl_pretrain_kinetics600')
  task = task_factory.get_task(config.task)
  self.assertIs(type(task), pretrain.VideoSSLPretrainTask)
        (name, self._task_weights.get(name, 1.0)) for name in self.tasks
    ])

  @classmethod
  def from_config(cls, config: configs.MultiTaskConfig, logging_dir=None):
    tasks = {}
    task_eval_steps = {}
    task_mixing_steps = {}
    task_weights = {}
    for task_routine in config.task_routines:
      task_name = task_routine.task_name
      tasks[task_name] = task_factory.get_task(
          task_routine.task_config, logging_dir=logging_dir)
      task_eval_steps[task_name] = task_routine.eval_steps
      task_mixing_steps[task_name] = task_routine.mixing_steps
      task_weights[task_name] = task_routine.task_weight
    return cls(
        tasks,
        task_mixing_steps=task_mixing_steps,
        task_eval_steps=task_eval_steps,
        task_weights=task_weights)
def run_continuous_finetune(
    mode: str,
    params: config_definitions.ExperimentConfig,
    model_dir: str,
    run_post_eval: bool = False,
    pretrain_steps: Optional[int] = None,
) -> Mapping[str, Any]:
  """Run modes with continuous training.

  Currently only supports continuous_train_and_eval.

  Args:
    mode: A 'str', specifying the mode. continuous_train_and_eval - monitors a
      checkpoint directory. Once a new checkpoint is discovered, loads the
      checkpoint, finetune the model by training it (probably on another
      dataset or with another task), then evaluate the finetuned model.
    params: ExperimentConfig instance.
    model_dir: A 'str', a path to store model checkpoints and summaries.
    run_post_eval: Whether to run post eval once after training, metrics logs
      are returned.
    pretrain_steps: Optional, the number of total training steps for the
      pretraining job.

  Returns:
    eval logs: returns eval metrics logs when run_post_eval is set to True,
      otherwise, returns {}.
  """
  assert mode == 'continuous_train_and_eval', (
      'Only continuous_train_and_eval is supported by continuous_finetune. '
      'Got mode: {}'.format(mode))

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  retry_times = 0
  while not tf.io.gfile.isdir(params.task.init_checkpoint):
    # Wait for the init_checkpoint directory to be created.
    if retry_times >= 60:
      raise ValueError(
          'ExperimentConfig.task.init_checkpoint must be a directory for '
          'continuous_train_and_eval mode.')
    retry_times += 1
    time.sleep(60)

  summary_writer = tf.summary.create_file_writer(
      os.path.join(model_dir, 'eval'))

  global_step = 0

  def timeout_fn():
    if pretrain_steps and global_step < pretrain_steps:
      # Keeps waiting for another timeout period.
      logging.info(
          'Continue waiting for new checkpoint as current pretrain '
          'global_step=%d and target is %d.', global_step, pretrain_steps)
      return False
    # Quits the loop.
    return True

  for pretrain_ckpt in tf.train.checkpoints_iterator(
      checkpoint_dir=params.task.init_checkpoint,
      min_interval_secs=10,
      timeout=params.trainer.continuous_eval_timeout,
      timeout_fn=timeout_fn):

    with distribution_strategy.scope():
      global_step = train_utils.read_global_step_from_checkpoint(pretrain_ckpt)
    # Replaces params.task.init_checkpoint to make sure that we load
    # exactly this pretrain checkpoint.
    if params.trainer.best_checkpoint_export_subdir:
      best_ckpt_subdir = '{}_{}'.format(
          params.trainer.best_checkpoint_export_subdir, global_step)
      params_replaced = params.replace(
          task={'init_checkpoint': pretrain_ckpt},
          trainer={'best_checkpoint_export_subdir': best_ckpt_subdir})
    else:
      params_replaced = params.replace(task={'init_checkpoint': pretrain_ckpt})
    params_replaced.lock()
    logging.info('Running finetuning with params: %s', params_replaced)

    with distribution_strategy.scope():
      if isinstance(params, configs.MultiEvalExperimentConfig):
        task = task_factory.get_task(params_replaced.task)
        eval_tasks = multitask.MultiTask.from_config(params_replaced.eval_tasks)
        (_, eval_metrics
        ) = multitask_train_lib.run_experiment_wtih_multitask_eval(
            distribution_strategy=distribution_strategy,
            train_task=task,
            eval_tasks=eval_tasks,
            mode='train_and_eval',
            params=params_replaced,
            model_dir=model_dir,
            run_post_eval=True,
            save_summary=False)
      else:
        task = task_factory.get_task(params_replaced.task,
                                     logging_dir=model_dir)
        _, eval_metrics = train_lib.run_experiment(
            distribution_strategy=distribution_strategy,
            task=task,
            mode='train_and_eval',
            params=params_replaced,
            model_dir=model_dir,
            run_post_eval=True,
            save_summary=False)
    logging.info('Evaluation finished. Pretrain global_step: %d', global_step)
    train_utils.write_json_summary(model_dir, global_step, eval_metrics)

    if not os.path.basename(model_dir):  # if model_dir.endswith('/')
      summary_grp = os.path.dirname(model_dir) + '_' + task.name
    else:
      summary_grp = os.path.basename(model_dir) + '_' + task.name
    summaries = {}
    for name, value in _flatten_dict(eval_metrics).items():
      summaries[summary_grp + '/' + '-'.join(name)] = value
    train_utils.write_summary(summary_writer, global_step, summaries)

    train_utils.remove_ckpts(model_dir)
    # In TF2, the resource life cycle is bound with the python object life
    # cycle. Force trigger python garbage collection here so those resources
    # can be deallocated in time, so it doesn't cause OOM when allocating new
    # objects.
    # TODO(b/169178664): Fix cycle reference in Keras model and revisit to see
    # if we need gc here.
    gc.collect()

  if run_post_eval:
    return eval_metrics
  return {}
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale,
                                           use_experimental_api=True)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)


if __name__ == '__main__':
  tfm_flags.define_flags()