def test_mwms(self):
    """Checks multi-worker mirrored strategy creation and argument validation."""
    distribute_utils.configure_cluster(worker_hosts=None, task_index=-1)

    # A recognised all-reduce algorithm must produce the multi-worker strategy.
    strategy = distribute_utils.get_distribution_strategy(
        'multi_worker_mirrored', all_reduce_alg='nccl')
    expected_type = tf.distribute.experimental.MultiWorkerMirroredStrategy
    self.assertIsInstance(strategy, expected_type)

    # An unrecognised all-reduce algorithm must be rejected with ValueError.
    expected_message = 'When used with `multi_worker_mirrored`, valid values.*'
    with self.assertRaisesRegex(ValueError, expected_message):
        distribute_utils.get_distribution_strategy(
            'multi_worker_mirrored', all_reduce_alg='dummy')
def main(_):
    """Runs the SQuAD stages selected by ``FLAGS.mode``.

    Reads the input metadata JSON from ``FLAGS.input_meta_data_path``. In
    ``export_only`` mode the model is exported and nothing else runs.
    Otherwise a distribution strategy is built and each of the train, predict
    and eval stages runs when its name appears in ``FLAGS.mode``. The eval
    stage logs the F1 score, records it as a summary scalar and writes all
    eval metrics to ``eval_metrics.json`` in the eval summary directory.
    """
    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
        input_meta_data = json.loads(meta_file.read().decode('utf-8'))

    if FLAGS.mode == 'export_only':
        export_squad(FLAGS.model_export_path, input_meta_data)
        return

    # Configures cluster spec for multi-worker distribution strategy.
    if FLAGS.num_gpus > 0:
        distribute_utils.configure_cluster(FLAGS.worker_hosts,
                                           FLAGS.task_index)
    strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        all_reduce_alg=FLAGS.all_reduce_alg,
        tpu_address=FLAGS.tpu)

    if 'train' in FLAGS.mode:
        train_squad(strategy, input_meta_data, run_eagerly=FLAGS.run_eagerly)
    if 'predict' in FLAGS.mode:
        predict_squad(strategy, input_meta_data)
    if 'eval' not in FLAGS.mode:
        return

    eval_metrics = eval_squad(strategy, input_meta_data)
    f1_score = eval_metrics['final_f1']
    logging.info('SQuAD eval F1-score: %f', f1_score)

    summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval')
    writer = tf.summary.create_file_writer(summary_dir)
    with writer.as_default():
        # TODO(lehou): write to the correct step number.
        tf.summary.scalar('F1-score', f1_score, step=0)
        writer.flush()

    # Also write eval_metrics to json file.
    metrics_path = os.path.join(summary_dir, 'eval_metrics.json')
    squad_lib_sp.write_to_json_files(eval_metrics, metrics_path)
    # NOTE(review): the sleep is treated as part of the eval stage - confirm.
    time.sleep(60)
def main(_):
    """Parses gin settings, builds the distribution strategy, runs pretraining."""
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)

    # Fall back to a fixed directory when no model directory was supplied.
    if not FLAGS.model_dir:
        FLAGS.model_dir = '/tmp/bert20/'

    # Configures cluster spec for multi-worker distribution strategy.
    if FLAGS.num_gpus > 0:
        distribute_utils.configure_cluster(FLAGS.worker_hosts,
                                           FLAGS.task_index)

    strategy_kwargs = dict(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        all_reduce_alg=FLAGS.all_reduce_alg,
        tpu_address=FLAGS.tpu)
    strategy = distribute_utils.get_distribution_strategy(**strategy_kwargs)

    if strategy:
        print('***** Number of cores used : ', strategy.num_replicas_in_sync)

    run_bert_pretrain(strategy)
def __init__(self, strategy_type=None, strategy_config=None):
    """Constructor.

    Configures the cluster from `strategy_config` and then builds the
    distribution strategy, which is stored on `self._strategy`.

    Args:
      strategy_type: string. One of 'tpu', 'mirrored', 'multi_worker_mirrored'.
        If None, the user is responsible to set the strategy before calling
        build_executor(...).
      strategy_config: necessary config for constructing the proper Strategy.
        Check strategy_flags_dict() for examples of the structure. Despite
        the `None` default, its `worker_hosts`, `task_index`, `num_gpus`,
        `all_reduce_alg`, `num_packs` and `tpu` attributes are read
        unconditionally, so a real config object must be supplied.
    """
    # The docstring must be the first statement to be picked up as `__doc__`;
    # cluster configuration still happens before the strategy is created.
    _ = distribute_utils.configure_cluster(strategy_config.worker_hosts,
                                           strategy_config.task_index)
    self._strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=strategy_type,
        num_gpus=strategy_config.num_gpus,
        all_reduce_alg=strategy_config.all_reduce_alg,
        num_packs=strategy_config.num_packs,
        tpu_address=strategy_config.tpu)
def main(_):
    """Parses the experiment configuration and runs it under a strategy.

    Serializes the configuration when the mode includes training, applies the
    mixed-precision policy if one is configured, configures the cluster when
    worker hosts are given, then creates the task inside the strategy scope
    and hands everything to ``train_lib.run_experiment``.
    """
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    print(FLAGS.experiment)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir

    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval
        # job may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    runtime = params.runtime

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case
    # of GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only
    # when dtype is float16
    if runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(runtime.mixed_precision_dtype,
                                               runtime.loss_scale)

    # The cluster is only configured for a non-empty, non-None host list.
    worker_hosts = runtime.worker_hosts
    if not (worker_hosts == '' or worker_hosts is None):
        num_workers = distribute_utils.configure_cluster(
            worker_hosts=worker_hosts,
            task_index=runtime.task_index)
        print(num_workers)

    distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=runtime.distribution_strategy,
        all_reduce_alg=runtime.all_reduce_alg,
        num_gpus=runtime.num_gpus,
        tpu_address=runtime.tpu)

    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode=FLAGS.mode,
        params=params,
        model_dir=model_dir)
def run_executor(params,
                 mode,
                 checkpoint_path=None,
                 train_input_fn=None,
                 eval_input_fn=None,
                 callbacks=None,
                 prebuilt_strategy=None):
    """Trains or evaluates the detection model under a distribution strategy.

    Args:
      params: experiment parameters; read for architecture, strategy, train
        and eval settings as well as `model_dir`.
      mode: one of 'train', 'eval' or 'eval_once'.
      checkpoint_path: checkpoint file or directory, required for 'eval_once'.
      train_input_fn: input function used in 'train' mode.
      eval_input_fn: input function used in 'eval' and 'eval_once' modes.
      callbacks: custom callbacks forwarded to training.
      prebuilt_strategy: strategy to use as-is; when None, one is built from
        `params.strategy_type` and `params.strategy_config`.

    Returns:
      The result of `dist_executor.train(...)` in 'train' mode, otherwise the
      evaluation results mapping.

    Raises:
      ValueError: if `mode` is not recognised, or if `checkpoint_path` is
        empty in 'eval_once' mode.
    """
    if params.architecture.use_bfloat16:
        policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
            'mixed_bfloat16')
        tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

    model_builder = model_factory.model_generator(params)

    strategy = prebuilt_strategy
    if strategy is None:
        strategy_config = params.strategy_config
        distribute_utils.configure_cluster(strategy_config.worker_hosts,
                                           strategy_config.task_index)
        strategy = distribute_utils.get_distribution_strategy(
            distribution_strategy=params.strategy_type,
            num_gpus=strategy_config.num_gpus,
            all_reduce_alg=strategy_config.all_reduce_alg,
            num_packs=strategy_config.num_packs,
            tpu_address=strategy_config.tpu)

    # Ceiling division of the replica count by 8 gives the worker count.
    num_workers = int(strategy.num_replicas_in_sync + 7) // 8
    is_multi_host = (int(num_workers) >= 2)

    def _make_executor(model_fn):
        # Both modes build the executor identically apart from the model fn.
        return DetectionDistributedExecutor(
            strategy=strategy,
            params=params,
            model_fn=model_fn,
            loss_fn=model_builder.build_loss_fn,
            is_multi_host=is_multi_host,
            predict_post_process_fn=model_builder.post_processing,
            trainable_variables_filter=(
                model_builder.make_filter_trainable_variables_fn()))

    if mode == 'train':

        def _model_fn(params):
            return model_builder.build_model(params, mode=ModeKeys.TRAIN)

        logging.info(
            'Train num_replicas_in_sync %d num_workers %d is_multi_host %s',
            strategy.num_replicas_in_sync, num_workers, is_multi_host)

        dist_executor = _make_executor(_model_fn)

        if is_multi_host:
            # Each replica gets its share of the global training batch.
            per_replica_batch = (
                params.train.batch_size // strategy.num_replicas_in_sync)
            train_input_fn = functools.partial(
                train_input_fn, batch_size=per_replica_batch)

        return dist_executor.train(
            train_input_fn=train_input_fn,
            model_dir=params.model_dir,
            iterations_per_loop=params.train.iterations_per_loop,
            total_steps=params.train.total_steps,
            init_checkpoint=model_builder.make_restore_checkpoint_fn(),
            custom_callbacks=callbacks,
            save_config=True)

    if mode not in ('eval', 'eval_once'):
        raise ValueError('Mode not found: %s.' % mode)

    def _model_fn(params):
        return model_builder.build_model(params, mode=ModeKeys.PREDICT_WITH_GT)

    logging.info(
        'Eval num_replicas_in_sync %d num_workers %d is_multi_host %s',
        strategy.num_replicas_in_sync, num_workers, is_multi_host)

    if is_multi_host:
        # Each replica gets its share of the global evaluation batch.
        per_replica_batch = (
            params.eval.batch_size // strategy.num_replicas_in_sync)
        eval_input_fn = functools.partial(
            eval_input_fn, batch_size=per_replica_batch)

    dist_executor = _make_executor(_model_fn)

    if mode == 'eval':
        results = dist_executor.evaluate_from_model_dir(
            model_dir=params.model_dir,
            eval_input_fn=eval_input_fn,
            eval_metric_fn=model_builder.eval_metrics,
            eval_timeout=params.eval.eval_timeout,
            min_eval_interval=params.eval.min_eval_interval,
            total_steps=params.train.total_steps)
    else:
        # Run evaluation once for a single checkpoint.
        if not checkpoint_path:
            raise ValueError('checkpoint_path cannot be empty.')
        if tf.io.gfile.isdir(checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(checkpoint_path)
        summary_writer = executor.SummaryWriter(params.model_dir, 'eval')
        results, _ = dist_executor.evaluate_checkpoint(
            checkpoint_path=checkpoint_path,
            eval_input_fn=eval_input_fn,
            eval_metric_fn=model_builder.eval_metrics,
            summary_writer=summary_writer)

    for metric_name, metric_value in results.items():
        logging.info('Final eval metric %s: %f', metric_name, metric_value)
    return results
def train_and_eval(
    params: base_configs.ExperimentConfig,
    strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]:
    """Runs the train and eval path using compile/fit.

    Args:
      params: experiment configuration. This function reads its `runtime`,
        `model`, `train`, `evaluation` and `model_dir` fields.
      strategy_override: distribution strategy to use instead of building one
        from `params.runtime`. A falsy value causes a strategy to be built.

    Returns:
      The value produced by
      `common.build_stats(history, validation_output, callbacks)`.
    """
    logging.info('Running train and eval.')

    distribute_utils.configure_cluster(params.runtime.worker_hosts,
                                       params.runtime.task_index)

    # Note: for TPUs, strategy and scope should be created before the dataset
    strategy = strategy_override or distribute_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        tpu_address=params.runtime.tpu)

    strategy_scope = distribute_utils.get_strategy_scope(strategy)

    # A falsy strategy is reported as a single device.
    logging.info('Detected %d devices.',
                 strategy.num_replicas_in_sync if strategy else 1)

    # `one_hot` is truthy only when label smoothing is set and positive; it
    # later selects between the categorical and sparse categorical losses.
    label_smoothing = params.model.loss.label_smoothing
    one_hot = label_smoothing and label_smoothing > 0

    builders = _get_dataset_builders(params, strategy, one_hot)
    # A missing (falsy) builder yields a `None` dataset in the same position.
    datasets = [
        builder.build(strategy) if builder else None for builder in builders
    ]

    # Unpack datasets and builders based on train/val/test splits
    train_builder, validation_builder = builders  # pylint: disable=unbalanced-tuple-unpacking
    train_dataset, validation_dataset = datasets

    # Explicit step counts from the config win over the builders' own counts.
    train_epochs = params.train.epochs
    train_steps = params.train.steps or train_builder.num_steps
    validation_steps = params.evaluation.steps or validation_builder.num_steps

    initialize(params, train_builder)

    logging.info('Global batch size: %d', train_builder.global_batch_size)

    # NOTE(review): the extent of this `with` block is assumed to end after
    # the callbacks are created - confirm against the original layout.
    with strategy_scope:
        model_params = params.model.model_params.as_dict()
        model = get_models()[params.model.name](**model_params)
        learning_rate = optimizer_factory.build_learning_rate(
            params=params.model.learning_rate,
            batch_size=train_builder.global_batch_size,
            train_epochs=train_epochs,
            train_steps=train_steps)
        optimizer = optimizer_factory.build_optimizer(
            optimizer_name=params.model.optimizer.name,
            base_learning_rate=learning_rate,
            params=params.model.optimizer.as_dict(),
            model=model)
        optimizer = performance.configure_optimizer(
            optimizer,
            use_float16=train_builder.dtype == 'float16',
            loss_scale=get_loss_scale(params))

        metrics_map = _get_metrics(one_hot)
        metrics = [metrics_map[metric] for metric in params.train.metrics]
        # With `set_epoch_loop`, one execution covers all steps of an epoch.
        steps_per_loop = train_steps if params.train.set_epoch_loop else 1

        if one_hot:
            loss_obj = tf.keras.losses.CategoricalCrossentropy(
                label_smoothing=params.model.loss.label_smoothing)
        else:
            loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
        model.compile(
            optimizer=optimizer,
            loss=loss_obj,
            metrics=metrics,
            steps_per_execution=steps_per_loop)

        # Training starts at epoch 0 unless resuming from a checkpoint.
        initial_epoch = 0
        if params.train.resume_checkpoint:
            initial_epoch = resume_from_checkpoint(
                model=model, model_dir=params.model_dir, train_steps=train_steps)

        callbacks = custom_callbacks.get_callbacks(
            model_checkpoint=params.train.callbacks.enable_checkpoint_and_export,
            include_tensorboard=params.train.callbacks.enable_tensorboard,
            time_history=params.train.callbacks.enable_time_history,
            track_lr=params.train.tensorboard.track_lr,
            write_model_weights=params.train.tensorboard.write_model_weights,
            initial_step=initial_epoch * train_steps,
            batch_size=train_builder.global_batch_size,
            log_steps=params.train.time_history.log_steps,
            model_dir=params.model_dir,
            backup_and_restore=params.train.callbacks.enable_backup_and_restore)

    serialize_config(params=params, model_dir=params.model_dir)

    # Validation arguments are passed to `fit` only when eval is not skipped.
    if params.evaluation.skip_eval:
        validation_kwargs = {}
    else:
        validation_kwargs = {
            'validation_data': validation_dataset,
            'validation_steps': validation_steps,
            'validation_freq': params.evaluation.epochs_between_evals,
        }

    history = model.fit(
        train_dataset,
        epochs=train_epochs,
        steps_per_epoch=train_steps,
        initial_epoch=initial_epoch,
        callbacks=callbacks,
        verbose=2,
        **validation_kwargs)

    # Stays `None` when evaluation is skipped.
    validation_output = None
    if not params.evaluation.skip_eval:
        validation_output = model.evaluate(
            validation_dataset, steps=validation_steps, verbose=2)

    # TODO(dankondratyuk): eval and save final test accuracy
    stats = common.build_stats(history, validation_output, callbacks)
    return stats
def main(_):
    """Sets up the HPU / multi-worker environment and runs the experiment.

    Steps, in order:
      1. Parse gin settings and the experiment configuration.
      2. Load the Habana module when `params.runtime.num_hpus > 0`.
      3. Seed the random generators and set the determinism environment
         variables when either dataset config requests deterministic mode.
      4. Export the bf16 config path when `FLAGS.dtype == "bf16"`.
      5. Build the worker host list from the `MULTI_HLS_IPS` environment
         variable (default "127.0.0.1"), assigning
         `mpi_size // len(addresses)` consecutive ports per address starting
         at 2410, and configure the cluster with the MPI rank as task index.
      6. Create the task under the distribution strategy scope, run the
         experiment and save the gin config.

    With more than one HPU each worker uses its own
    `<FLAGS.model_dir>/worker_<rank>` directory.
    """
    # `os` is needed on every path below, so it is imported unconditionally.
    # Importing it only inside a conditional branch would bind a function-local
    # name that stays unbound (UnboundLocalError) whenever that branch is
    # skipped.
    import os

    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)

    if params.runtime.num_hpus > 0:
        # TODO: remove when SW-49334 is fixed [SW-49404]
        os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

    if (params.task.train_data.deterministic
            or params.task.validation_data.deterministic):
        os.environ['PYTHONHASHSEED'] = '0'
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        import numpy
        numpy.random.seed(0)
        import tensorflow as tf
        tf.random.set_seed(0)
        tf.compat.v1.set_random_seed(0)
        import random
        random.seed(0)

    if FLAGS.dtype == "bf16":
        print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
        os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    hls_addresses = str(os.environ.get("MULTI_HLS_IPS", "127.0.0.1")).split(",")
    TF_BASE_PORT = 2410
    mpi_rank = comm_rank()
    mpi_size = comm_size()

    if params.runtime.num_hpus > 1:
        model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
    else:
        model_dir = FLAGS.model_dir

    # Prepare a comma-separated list of device addresses: every host address
    # gets the same number of consecutive ports, starting at TF_BASE_PORT.
    workers_per_host = mpi_size // len(hls_addresses)
    worker_hosts = ",".join(
        address + ':' + str(TF_BASE_PORT + rank)
        for address in hls_addresses
        for rank in range(workers_per_host))
    task_index = mpi_rank

    # Configures cluster spec for distribution strategy.
    distribution_utils.configure_cluster(worker_hosts, task_index)

    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval
        # job may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case
    # of GPUs, and bfloat16 in the case of TPUs. No loss scale is passed here.
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        num_hpus=params.runtime.num_hpus,
        tpu_address=params.runtime.tpu)

    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode=FLAGS.mode,
        params=params,
        model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
def test_mwms(self):
    """Checks that 'multi_worker_mirrored' yields a multi-worker strategy."""
    distribute_utils.configure_cluster(worker_hosts=None, task_index=-1)

    strategy = distribute_utils.get_distribution_strategy(
        'multi_worker_mirrored', all_reduce_alg='nccl')

    expected_type = tf.distribute.experimental.MultiWorkerMirroredStrategy
    self.assertIsInstance(strategy, expected_type)