def main(_):
    """Validate the scale-out flags, initialise the device and run training.

    Raises:
        RuntimeError: if Horovod is combined with a distribution strategy, or
            if the requested distribution strategy is neither "off" nor "hpu".
    """
    # NOTE(review): the original indentation was lost; load_habana_module() is
    # assumed to belong to the non-Horovod branch, and the final logging call
    # to sit inside dump_callback() - confirm against the upstream file.
    common.initialize_preloading()

    horovod_requested = flags.FLAGS.use_horovod
    strategy_name = flags.FLAGS.distribution_strategy

    if horovod_requested and strategy_name != "off":
        raise RuntimeError(
            "Horovod and distribution strategy cannot be used together. Please select one of the scaleout methods."
        )
    if strategy_name not in ["off", "hpu"]:
        raise RuntimeError(
            "Currently HPU supports only HPUStrategy, please set --distribution_strategy=hpu or use horovod"
        )

    if not horovod_requested:
        synapse_logger_init()
        load_habana_module()
    else:
        if flags.FLAGS.horovod_hierarchical_allreduce:
            os.environ['HOROVOD_HIERARCHICAL_ALLREDUCE'] = "1"
        hvd_init()

    seed = flags.FLAGS.global_seed
    if seed:
        tf.random.set_seed(seed)

    with dump_callback():
        model_helpers.apply_clean(flags.FLAGS)
        with logger.benchmark_context(flags.FLAGS):
            stats = run(flags.FLAGS)
        logging.info('Run stats:\n%s', stats)
def main():
    """Run an interactive question-answering session with a saved T5 model.

    Loads the model from ``<args.model_dir>/checkpoints`` and the tokenizer
    from ``<args.data_dir>/t5_base/tokenizer``, answers one built-in example
    and then keeps prompting for a context and a question.

    The prompt loop ends cleanly (no traceback) on Ctrl+C or when stdin is
    closed.
    """
    if not args.no_hpu:
        # Load Habana module in order to do inference on HPU (Gaudi)
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

    checkpoint_path = os.path.join(args.model_dir, 'checkpoints')
    model = T5.from_pretrained(checkpoint_path)
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        os.path.join(args.data_dir, 't5_base', 'tokenizer'))

    print('\nProvide context and ask model a question, for example:')
    context = (
        'In 2019 Habana Labs announced its first AI accelerator. Gaudi, '
        'named after famous Catalan architect, was designed to accelerate '
        'training of deep neural networks in data centers.')
    question = 'What is the name of the chip?'
    print('Context:', context)
    print('Question:', question)
    print("Answer: ", answer(model, tokenizer, context, question))

    try:
        while True:
            print(
                '\nProvide context and ask model a question (to exit use Ctrl+C)')
            context = input('Context: ')
            question = input('Question: ')
            print("Answer: ", answer(model, tokenizer, context, question))
    except (KeyboardInterrupt, EOFError):
        # Ctrl+C is the documented way out; EOF happens when stdin is piped
        # or closed. Finish the current prompt line and leave quietly.
        print()
def _horovod_init(framework):
    """Prepare the environment, load the Habana module and initialise Horovod.

    Args:
        framework: a ``Framework`` member selecting which Horovod binding to
            import (``Framework.TENSORFLOW`` or ``Framework.KERAS``).

    Raises:
        Exception: if ``framework`` is not one of the supported members.
        AssertionError: if the MPI rank differs from the Horovod rank.
    """
    size = comm_size()
    rank = comm_rank()

    hcl_config = get_hcl_config()
    hcl_type = get_hcl_type(hcl_config)

    if hcl_type != "HLS1-H":
        # All env variables should be set before loading_habana_modules
        if is_hierarchical():
            module_id = str(comm_local_rank())
            os.environ["HLS1_MODULE_ID"] = module_id
            os.environ["ID"] = module_id
        elif size > 1:
            module_id = str(get_hw_module_id(rank))
            os.environ["HLS1_MODULE_ID"] = module_id
            os.environ["ID"] = module_id

    # Make sure every rank logging to different file
    # Only important on the same machine - so pretty much every scenarios
    if size > 1:
        rank_prefix = "rank_{}_".format(rank)
        HorovodHelpers._set_env_prefix("TF_RANK_PREFIX", rank_prefix, False)
        HorovodHelpers._set_env_prefix("HBN_TF_GRAPH_PREFIX", rank_prefix, False)
        HorovodHelpers._set_env_prefix("TF_DUMP_GRAPH_PREFIX", rank_prefix, True)
        HorovodHelpers._hvd_rank_prefix = rank_prefix

    # Init synapse logger (if required)
    synapse_logger_init()
    # Init TF Module (for CPU Allocator)
    load_habana_module()

    # Temporary WA to support both paths: with and without habana_frameworks package installed
    try:
        from habana_frameworks.tensorflow.lib_utils import libraries_location
        tf.load_library(
            os.path.join(libraries_location,
                         "libsynapse_helpers.so." + tf.__version__))
    except Exception as exc:
        # Best effort by design, but do not swallow KeyboardInterrupt or
        # SystemExit, and keep the reason visible in the log.
        logging.warning(
            "Can't import habana_frameworks, trying to run anyway: %s", exc)

    if framework == Framework.TENSORFLOW:
        import horovod.tensorflow as hvd
    elif framework == Framework.KERAS:
        import horovod.tensorflow.keras as hvd
    else:
        raise Exception(
            "Specified framework: {} is not supported by horovod_helpers".
            format(framework))

    hvd.init()
    assert rank == hvd.rank(
    ), "There is possible rank mismatch between mpi and horovod"
    HorovodHelpers._hvd = hvd
def main(_):
    """Initialise the device (through Horovod or directly) and run training."""
    # NOTE(review): the original indentation was lost; load_habana_module() is
    # assumed to belong to the non-Horovod branch - confirm.
    common.initialize_preloading()

    parsed_flags = flags.FLAGS
    if not parsed_flags.use_horovod:
        synapse_logger_init()
        load_habana_module()
    else:
        hvd_init()

    with dump_callback():
        model_helpers.apply_clean(parsed_flags)
        with logger.benchmark_context(parsed_flags):
            stats = run(parsed_flags)
        logging.info('Run stats:\n%s', stats)
def main(_):
    """Decode (or score a file) with a trained model.

    When ``--score_file`` is given, the scores are written one per line to
    ``--decode_to_file`` and the function returns; otherwise an estimator is
    built and passed to ``decode``.

    Raises:
        ValueError: if the file to score does not exist, or if
            ``--score_file`` is used without ``--decode_to_file``.
    """
    tf.disable_v2_behavior()
    tf.enable_resource_variables()
    tf.logging.set_verbosity(tf.logging.INFO)
    trainer_lib.set_random_seed(FLAGS.random_seed)
    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

    # NOTE(review): the original indentation was lost; everything up to and
    # including prepare_recipe_cache() is assumed to be HPU-only - confirm.
    if FLAGS.use_hpu:
        if FLAGS.use_bf16:
            if not is_workaround_enabled('FORCE_FP32'):
                os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path
            else:
                print("Warning! BF16 precision is not supported in inference mode. Switching back to fp32...")
        if is_workaround_enabled('DISABLE_DYNAMIC_SHAPES'):
            os.environ['TF_ENABLE_DYNAMIC_SHAPES'] = 'false'
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        prepare_recipe_cache()

    if FLAGS.score_file:
        filename = os.path.expanduser(FLAGS.score_file)
        if not tf.gfile.Exists(filename):
            raise ValueError("The file to score doesn't exist: %s" % filename)
        # Validate the output flag before scoring so a missing flag does not
        # throw away a finished scoring run.
        if not FLAGS.decode_to_file:
            raise ValueError("To score a file, specify --decode_to_file for results.")
        results = score_file(filename)
        # The context manager closes the file even if a write fails.
        with tf.gfile.Open(os.path.expanduser(FLAGS.decode_to_file), "w") as write_file:
            for score in results:
                write_file.write("%.6f\n" % score)
        return

    hp = create_hparams()
    hp.add_hparam("use_hpu", FLAGS.use_hpu)
    decode_hp = create_decode_hparams()
    run_config = trainer.create_run_config(hp)
    if FLAGS.disable_grappler_optimizations:
        run_config.session_config.graph_options.rewrite_options.disable_meta_optimizer = True

    # summary-hook in tf.estimator.EstimatorSpec requires
    # hparams.model_dir to be set.
    hp.add_hparam("model_dir", run_config.model_dir)

    estimator = trainer_lib.create_estimator(
        FLAGS.model,
        hp,
        run_config,
        decode_hparams=decode_hp,
        use_tpu=FLAGS.use_tpu)

    decode(estimator, hp, decode_hp)
def set_flags(params):
    """Export environment variables for the selected device.

    Args:
        params: object exposing ``tf_verbosity``, ``no_hpu``, ``dtype`` and
            ``bf16_config_path`` attributes.

    With ``no_hpu`` set, a fixed group of CUDA/Horovod/TensorFlow variables is
    exported. Otherwise the Habana module is loaded and, for ``dtype ==
    'bf16'``, ``TF_BF16_CONVERSION`` is pointed at ``bf16_config_path``.
    """
    if params.tf_verbosity:
        # os.environ only accepts strings; the verbosity may be an int.
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(params.tf_verbosity)

    if params.no_hpu:
        os.environ['CUDA_CACHE_DISABLE'] = '1'
        os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
        os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
        os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '0'
        os.environ['TF_ADJUST_HUE_FUSED'] = '1'
        os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
        os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
        os.environ['TF_SYNC_ON_FINISH'] = '0'
    else:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        # NOTE(review): the original indentation was lost; the bf16 setting is
        # assumed to be part of the HPU branch - confirm.
        if params.dtype == 'bf16':
            os.environ['TF_BF16_CONVERSION'] = params.bf16_config_path
def run_imagenet(flags_obj):
    """Run ResNet ImageNet training and eval loop.

    Args:
        flags_obj: An object containing parsed flag values.

    Returns:
        Dict of results of the run.  Contains the keys `eval_results` and
        `train_hooks`. `eval_results` contains accuracy (top_1) and
        accuracy_top_5. `train_hooks` is a list the instances of hooks used
        during training.

    Raises:
        AssertionError: if Horovod is requested together with ``no_hpu``.
    """
    # A conditional expression instead of `cond and a or b`, which silently
    # falls through to `b` whenever `a` happens to be falsy.
    input_function = (
        get_synth_input_fn(flags_core.get_tf_dtype(flags_obj))
        if flags_obj.use_synthetic_data else input_fn)

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    if flags.FLAGS.dtype == 'bf16':
        os.environ['TF_BF16_CONVERSION'] = flags.FLAGS.bf16_config_path

    # Disabling dynamic shapes is a workaround. Dynamic shapes support for ResNeXt shall be investigated
    os.environ["TF_ENABLE_DYNAMIC_SHAPES"] = "false"
    os.environ.setdefault("TF_DISABLE_MKL", "1")
    os.environ.setdefault("TF_ALLOW_CONTROL_EDGES_IN_HABANA_OPS", "1")

    # NOTE(review): the original indentation was lost; the explicit
    # load_habana_module() call is assumed to belong to the non-Horovod
    # branch - confirm.
    if flags_obj.use_horovod:
        assert not flags_obj.no_hpu, "Horovod without HPU is not supported in helpers."
        hvd_init()
    else:
        synapse_logger_init()
        if not flags_obj.no_hpu:
            load_habana_module()

    result = resnet_run_loop.resnet_main(
        flags_obj, imagenet_model_fn, input_function, DATASET_NAME,
        shape=[DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS])

    return result
def set_flags(params):
    """Configure logging level, device module, seeds, XLA and TF threading.

    Args:
        params: object exposing ``tf_verbosity``, ``no_hpu``, ``dtype``,
            ``bf16_config_path``, ``seed`` and ``use_xla`` attributes.
    """
    verbosity = params.tf_verbosity
    if verbosity:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(verbosity)

    # NOTE(review): the original indentation was lost; the bf16 setting is
    # assumed to be nested in the HPU branch - confirm.
    running_on_hpu = not params.no_hpu
    if running_on_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        if params.dtype == 'bf16':
            os.environ['TF_BF16_CONVERSION'] = params.bf16_config_path

    # Seed both NumPy and TensorFlow from the same value.
    np.random.seed(params.seed)
    tf.random.set_seed(params.seed)

    if params.use_xla:
        tf.config.optimizer.set_jit(True)

    threads_per_hpu = 1
    hpu_count = 1
    if horovod_enabled():
        hpu_count = hvd_size()
    host_cpus = multiprocessing.cpu_count()
    reserved_threads = threads_per_hpu * hpu_count

    threading_config = tf.config.threading
    threading_config.set_intra_op_parallelism_threads(0)
    threading_config.set_inter_op_parallelism_threads(host_cpus - reserved_threads)
def main(argv):
    """Build the run configuration from flags, validate it and launch the run.

    Raises:
        RuntimeError: if deterministic mode is requested without a seed, if
            `--use_tf_distributed` is combined with an MPI launch, or if a
            file pattern / annotation source required by the mode is missing.
        FileNotFoundError: if the validation JSON file is needed but absent.
    """
    del argv  # Unused.

    # ============================ Configure parameters ============================ #
    run_config = mask_rcnn_params.default_config()
    flag_values = FLAGS.flag_values_dict()

    if flag_values['device'] == 'HPU' and not MPI_is_distributed():
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

    decay_levels = [
        float(decay) for decay in flag_values['learning_rate_decay_levels']
    ]
    flag_values['learning_rate_decay_levels'] = decay_levels
    flag_values['learning_rate_levels'] = [
        decay * flag_values['init_learning_rate'] for decay in decay_levels
    ]
    flag_values['learning_rate_steps'] = [
        int(step) for step in flag_values['learning_rate_steps']
    ]

    run_config = params_io.override_hparams(run_config, flag_values)

    # NOTE(review): the original indentation was lost; the seed handling is
    # assumed to be nested in the deterministic branch - confirm.
    if flag_values['deterministic']:
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        if not flag_values['seed']:
            raise RuntimeError("Set seed to run in deterministic mode")
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        tf.compat.v1.reset_default_graph()
        tf.random.set_seed(flag_values['seed'])
        if flag_values['device'] == "GPU":
            os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

    # ============================ Configure parameters ============================ #

    if run_config.use_tf_distributed and MPI_is_distributed():
        raise RuntimeError(
            "Incompatible Runtime. Impossible to use `--use_tf_distributed` with MPIRun Horovod"
        )

    training_modes = ('train', 'train_and_eval')
    evaluation_modes = ('eval', 'train_and_eval')

    if run_config.mode in training_modes and not run_config.training_file_pattern:
        raise RuntimeError(
            'You must specify `training_file_pattern` for training.')

    if run_config.mode in evaluation_modes:
        if not run_config.validation_file_pattern:
            raise RuntimeError(
                'You must specify `validation_file_pattern` for evaluation.')

        if run_config.val_json_file == "" and not run_config.include_groundtruth_in_features:
            raise RuntimeError(
                'You must specify `val_json_file` or include_groundtruth_in_features=True for evaluation.'
            )

        if not run_config.include_groundtruth_in_features and not os.path.isfile(
                run_config.val_json_file):
            raise FileNotFoundError("Validation JSON File not found: %s" %
                                    run_config.val_json_file)

    json_backend = dllogger.JSONStreamBackend(
        verbosity=dllogger.Verbosity.VERBOSE, filename=run_config.log_path)
    dllogger.init(backends=[json_backend])

    train_input_fn = None
    if run_config.mode in training_modes:
        train_input_fn = dataloader.InputReader(
            file_pattern=run_config.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            num_examples=None,
            use_fake_data=run_config.use_fake_data,
            use_instance_mask=run_config.include_mask,
            seed=run_config.seed)

    eval_input_fn = None
    if run_config.mode in evaluation_modes or (
            run_config.mode == 'train' and run_config.eval_after_training):
        eval_input_fn = dataloader.InputReader(
            file_pattern=run_config.validation_file_pattern,
            mode=tf.estimator.ModeKeys.PREDICT,
            num_examples=run_config.eval_samples,
            use_fake_data=False,
            use_instance_mask=run_config.include_mask,
            seed=run_config.seed)

    # Profile the run only when requested; otherwise use a no-op context.
    if run_config.profile:
        run_context = tf.profiler.experimental.Profile(run_config.model_dir)
    else:
        run_context = suppress()
    with run_context:
        run_executer(run_config, train_input_fn, eval_input_fn)
def main():
    """Train or evaluate DenseNet according to the command-line arguments.

    Three paths exist after the model is built:
      * ``--evaluate_checkpoint_path``: load weights, evaluate on the
        validation subset and exit.
      * ``--deterministic``: fit on pickled inputs inside ``dump_callback``.
      * default: fit on the training subset with per-epoch validation.

    Raises:
        ValueError: if deterministic mode lacks inputs or a checkpoint.
        FileNotFoundError: if deterministic mode has no valid dump config.
    """
    parser = DenseNetArgumentParser(
        description=(
            "train.py is the main training/evaluation script for DenseNet. "
            "In order to run training on multiple Gaudi cards, use demo_densenet.py or run "
            "train.py with mpirun."))
    args, _ = parser.parse_known_args()

    strategy = None
    verbose = 1

    os.environ['ENABLE_EXPERIMENTAL_FLAGS'] = 'true'
    os.environ['RUN_TPC_FUSER'] = '******'

    if args.deterministic:
        if args.inputs is None:
            raise ValueError("Must provide inputs for deterministic mode")
        if args.resume_from_checkpoint_path is None:
            raise ValueError("Must provide checkpoint for deterministic mode")

    if args.dtype == 'bf16':
        os.environ['TF_BF16_CONVERSION'] = '1'

    # NOTE(review): the original indentation was lost; the `else` is assumed
    # to pair with `if args.run_on_hpu` (so HPU runs without --use_hpu_strategy
    # keep strategy=None) - confirm.
    if args.run_on_hpu:
        load_habana_module()
        if args.use_hpu_strategy:
            hls_addresses = str(os.environ.get(
                "MULTI_HLS_IPS", "127.0.0.1")).split(",")
            TF_BASE_PORT = 2410
            mpi_rank = comm_rank()
            mpi_size = comm_size()
            if mpi_rank > 0:
                verbose = 0
            # worker_hosts: comma-separated list of worker ip:port pairs.
            # Joined once over all hosts so consecutive hosts are separated by
            # a comma as well (per-host joins glued the groups together).
            ranks_per_host = mpi_size // len(hls_addresses)
            worker_hosts = ",".join(
                address + ':' + str(TF_BASE_PORT + rank)
                for address in hls_addresses
                for rank in range(ranks_per_host))
            task_index = mpi_rank
            # Configures cluster spec for distribution strategy.
            _ = distribution_utils.configure_cluster(worker_hosts, task_index)
            strategy = HPUStrategy()
            print('Number of devices: {}'.format(
                strategy.num_replicas_in_sync))
    else:
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    if args.seed is not None:
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    img_rows, img_cols = 224, 224  # Resolution of inputs
    channel = 3
    num_classes = 1000
    batch_size = args.batch_size
    nb_epoch = args.epochs
    dataset_dir = args.dataset_dir
    resume_from_checkpoint_path = args.resume_from_checkpoint_path
    resume_from_epoch = args.resume_from_epoch
    dropout_rate = args.dropout_rate
    weight_decay = args.weight_decay
    optim_name = args.optimizer
    initial_lr = args.initial_lr
    model_name = args.model
    save_summary_steps = args.save_summary_steps

    if model_name == "densenet121":
        growth_rate = 32
        nb_filter = 64
        nb_layers = [6, 12, 24, 16]
    elif model_name == "densenet161":
        growth_rate = 48
        nb_filter = 96
        nb_layers = [6, 12, 36, 24]
    elif model_name == "densenet169":
        growth_rate = 32
        nb_filter = 64
        nb_layers = [6, 12, 32, 32]
    else:
        print("model is not supported")
        exit(1)

    def build_compiled_model():
        """Create the DenseNet model and compile it with the chosen optimizer."""
        net = densenet_model(img_rows=img_rows, img_cols=img_cols, color_type=channel,
                             dropout_rate=dropout_rate, weight_decay=weight_decay,
                             num_classes=num_classes, growth_rate=growth_rate,
                             nb_filter=nb_filter, nb_layers=nb_layers)
        optimizer = get_optimizer(
            model_name, optim_name, initial_lr, epsilon=1e-2)
        net.compile(optimizer=optimizer,
                    loss='categorical_crossentropy', metrics=['accuracy'])
        return net

    # Load our model
    if strategy:
        with strategy.scope():
            model = build_compiled_model()
    else:
        model = build_compiled_model()

    # Start training
    steps_per_epoch = 1281167 // batch_size
    if args.steps_per_epoch is not None:
        steps_per_epoch = args.steps_per_epoch
    validation_steps = 50000 // batch_size
    if args.validation_steps is not None:
        validation_steps = args.validation_steps
    warmup_steps = args.warmup_epochs * steps_per_epoch
    lr_sched = {0: 1, 30: 0.1, 60: 0.01, 80: 0.001}
    lr_sched_steps = {
        epoch * steps_per_epoch: multiplier for (epoch, multiplier) in lr_sched.items()}

    lrate = StepLearningRateScheduleWithWarmup(initial_lr=initial_lr,
                                               initial_global_step=0,
                                               warmup_steps=warmup_steps,
                                               decay_schedule=lr_sched_steps,
                                               verbose=0)

    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]

    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(args.model_dir, config.SAVE_DIR,
                     save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='train_loss')

    callbacks = [lrate, model_ckpt]

    if save_summary_steps is not None and save_summary_steps > 0:
        log_dir = os.path.join(args.model_dir, config.LOG_DIR)
        local_batch_size = batch_size

        if args.use_hpu_strategy:
            log_dir = os.path.join(log_dir, 'worker_' + str(comm_rank()))
            local_batch_size = batch_size // strategy.num_replicas_in_sync

        callbacks += [
            TensorBoardWithHParamsV2(
                args.__dict__, log_dir=log_dir,
                update_freq=save_summary_steps, profile_batch=0),
            ExamplesPerSecondKerasHookV2(
                save_summary_steps, output_dir=log_dir,
                batch_size=local_batch_size),
        ]

    if (args.evaluate_checkpoint_path is not None):
        model.load_weights(args.evaluate_checkpoint_path)
        # The validation dataset has to exist before it is evaluated; it was
        # previously referenced here before any assignment.
        ds_valid = get_dataset(dataset_dir, args.val_subset, batch_size)
        results = model.evaluate(x=ds_valid, steps=validation_steps)
        print("Test loss, Test acc:", results)
        exit()

    if ((resume_from_epoch is not None) and (resume_from_checkpoint_path is not None)):
        model.load_weights(resume_from_checkpoint_path)

    if args.deterministic:
        set_deterministic()
        if not os.path.isfile(args.dump_config):
            raise FileNotFoundError("wrong dump config path")

        import pickle
        x_path = os.path.join(args.inputs, "input")
        y_path = os.path.join(args.inputs, "target")
        # NOTE: pickle executes arbitrary code on load; these files must come
        # from a trusted source.
        with open(x_path, 'rb') as x_file:
            x = pickle.load(x_file)
        with open(y_path, 'rb') as y_file:
            y = pickle.load(y_file)

        with dump_callback(args.dump_config):
            model.fit(x=x, y=y,
                      steps_per_epoch=steps_per_epoch,
                      callbacks=callbacks,
                      initial_epoch=resume_from_epoch,
                      epochs=nb_epoch,
                      shuffle=False,
                      verbose=verbose,
                      validation_data=None,
                      validation_steps=0,
                      )
    else:
        ds_train = get_dataset(dataset_dir, args.train_subset, batch_size)
        ds_valid = get_dataset(dataset_dir, args.val_subset, batch_size)

        model.fit(x=ds_train, y=None,
                  steps_per_epoch=steps_per_epoch,
                  callbacks=callbacks,
                  initial_epoch=resume_from_epoch,
                  epochs=nb_epoch,
                  shuffle=True,
                  verbose=verbose,
                  validation_data=(ds_valid, None),
                  validation_steps=validation_steps,
                  validation_freq=1,
                  )
def main(argv):
    """Set up Horovod/HPU, build hparams and execute the training schedule.

    Args:
        argv: command-line arguments; everything after the program name is
            passed to ``set_hparams_from_args``.
    """
    tf.disable_v2_behavior()
    tf.enable_resource_variables()

    if FLAGS.use_hpu and FLAGS.recipe_cache:
        prepare_recipe_cache()

    if FLAGS.use_horovod:
        if FLAGS.use_hpu:
            from TensorFlow.common.horovod_helpers import hvd_init, horovod_enabled, hvd
            hvd_init()
            assert horovod_enabled()
            if FLAGS.recipe_cache:
                # Other ranks should wait for recipe cache to be removed.
                # This operation can't be done before hvd_init.
                from mpi4py import MPI
                MPI.COMM_WORLD.Barrier()
        else:
            import horovod.tensorflow as hvd
            hvd.init()
            assert hvd.size() > 1
            os.environ['CUDA_VISIBLE_DEVICES'] = str(hvd.local_rank())

    if FLAGS.use_hpu:
        if FLAGS.use_bf16:
            os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

        # Disable dynamic shapes unless the user already chose a value.
        os.environ.setdefault('TF_ENABLE_DYNAMIC_SHAPES', 'false')

        from habana_frameworks.tensorflow import load_habana_module  # noqa
        load_habana_module()

    usr_dir.import_usr_dir(FLAGS.t2t_usr_dir)

    # If we just have to print the registry, do that and exit early.
    maybe_log_registry_and_exit()

    # Create HParams.
    if argv:
        set_hparams_from_args(argv[1:])

    # hparams only exists for schedules other than "run_std_server", so every
    # statement that touches it lives under the same guard; touching it
    # unconditionally raised UnboundLocalError before the server could start.
    if FLAGS.schedule != "run_std_server":
        hparams = create_hparams()
        if FLAGS.gpu_automatic_mixed_precision:
            setattr(hparams, "gpu_automatic_mixed_precision", True)
        if FLAGS.deterministic_dataset:
            hparams.add_hparam("deterministic_dataset", True)

        hparams.add_hparam("use_horovod", FLAGS.use_horovod)
        hparams.add_hparam("use_hpu", FLAGS.use_hpu)
        if FLAGS.use_horovod:
            hparams.add_hparam("hvd_worker_id", hvd.rank())
            hparams.add_hparam("hvd_size", hvd.size())

    if FLAGS.schedule == "run_std_server":
        # NOTE(review): presumably this call does not return (the code below
        # needs hparams) - confirm against run_std_server.
        run_std_server()

    trainer_lib.set_random_seed(FLAGS.random_seed)

    if FLAGS.generate_data:
        generate_data()

    exp_fn = create_experiment_fn()
    exp = exp_fn(create_run_config(hparams), hparams)
    if is_chief():
        save_metadata(hparams)

    with dump_callback():
        execute_schedule(exp)
def train(model,
          train_images,
          train_annotations,
          input_height=None,
          input_width=None,
          n_classes=None,
          verify_dataset=True,
          checkpoints_path=None,
          epochs=5,
          batch_size=2,
          validate=False,
          val_images=None,
          val_annotations=None,
          auto_resume_checkpoint=False,
          load_weights=None,
          steps_per_epoch=None,
          val_steps_per_epoch=None,
          gen_use_multiprocessing=False,
          ignore_zero_class=False,
          optimizer_name='adam',
          do_augment=False,
          augmentation_name="aug_all",
          data_type='fp32',
          tb_location=None,
          deterministic=False,
          model_dir=None,
          dump_config=None,
          distributed=False,
          use_upsampling=False,
          loss_type=0,
          train_engine='hpu',
          not_cached=False):
    """Train a segmentation model, optionally distributed with Horovod.

    ``model`` may be a model object or a name looked up in
    ``model_from_name``. A single warm-up step is fitted first so that the
    compile time can be reported separately from the per-batch time.

    Without ``validate``, rank 0 evaluates on the validation and training
    data after training (skipped for ``loss_type == 1``). With ``validate``,
    validation runs inside ``fit`` and only a single shard is supported.

    NOTE(review): the original indentation was lost; the block nesting below
    was reconstructed from the statement order - confirm against upstream.
    """
    if train_engine == 'hpu':
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        print("Loaded HPU modules")
        from TensorFlow.common.debug import dump_callback
        # For Habana Model runner hooks
        from TensorFlow.common.tb_utils import (TensorBoardWithHParamsV2,
                                                ExamplesPerSecondKerasHookV2)
    else:
        class dump_callback(object):
            """No-op stand-in used when the Habana helper is not imported."""

            def __init__(self, file_name):
                pass

            def __enter__(self):
                pass

            def __exit__(self, type, value, traceback):
                pass

    if data_type == 'bf16' and train_engine == 'hpu':
        bf16_json = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 '../bf16_segnet.json')
        # Keep a value the user already exported; otherwise use the default.
        os.environ['TF_BF16_CONVERSION'] = os.environ.get(
            'TF_BF16_CONVERSION', bf16_json)
        print("Setting BF16:", os.getenv('TF_BF16_CONVERSION'))

    shard_id = 0
    num_shards = 1

    if distributed:
        import horovod.tensorflow.keras as hvd
        print("hvd init")
        hvd.init()
        if train_engine == 'gpu':
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')
            print("Set memory growth for GPUS")

        shard_id = hvd.rank()
        num_shards = hvd.size()
        if num_shards == 1:
            print(
                "Distributed training requested but horovod init not success")
            exit()

    print("num_shards: " + str(num_shards) + " shard_id: " + str(shard_id))

    from keras_segmentation.models.all_models import model_from_name
    # check if user gives model name instead of the model object
    if isinstance(model, six.string_types):
        # create the model from the name
        assert (n_classes is not None), "Please provide the n_classes"
        if (input_height is not None) and (input_width is not None):
            model = model_from_name[model](n_classes,
                                           input_height=input_height,
                                           input_width=input_width,
                                           batch_size=batch_size,
                                           use_upsampling=use_upsampling,
                                           loss_type=loss_type)
        else:
            model = model_from_name[model](n_classes,
                                           batch_size=batch_size,
                                           use_upsampling=use_upsampling,
                                           loss_type=loss_type)

    #model.save('my_segnet_model.h5')
    n_classes = model.n_classes
    input_height = model.input_height
    input_width = model.input_width
    output_height = model.output_height
    output_width = model.output_width

    if steps_per_epoch is None:
        steps_per_epoch = len(
            os.listdir(train_images)) // (batch_size * num_shards)
    if val_steps_per_epoch is None:
        val_steps_per_epoch = len(os.listdir(val_images)) // batch_size

    print("Steps per epoch: " + str(steps_per_epoch))

    def optimized_xent_loss_custom_grad(ytrue, ypred):
        """Softmax cross-entropy with a hand-written gradient, averaged."""

        @tf.custom_gradient
        def loss_without_mean(ytrue, ypred):
            with tf.name_scope("softmax_cross_entropy"):
                logits_t = tf.transpose(ypred, perm=(0, 1, 3, 2),
                                        name="logits_t")  # BS H N W
                reduce_max = tf.reduce_max(logits_t, 2,
                                           name="reduce_max")  # BS H W
                max_logits = tf.expand_dims(reduce_max, 3)  # BS H W 1
                shifted_logits = tf.subtract(ypred, max_logits,
                                             name="shifted_logits")  # BS H W N
                exp_shifted_logits = tf.math.exp(
                    shifted_logits, name="exp_shifted_logits")  # BS H W N
                reduce_sum_filter = tf.fill([1, 1, n_classes, 1], 1.0)
                sum_exp = tf.nn.conv2d(exp_shifted_logits,
                                       reduce_sum_filter,
                                       strides=1,
                                       padding="VALID",
                                       name="sum_exp")  # BS H W 1
                log_sum_exp = tf.math.log(sum_exp,
                                          name="log_sum_exp")  # BS H W 1
                shifted_logits2 = tf.nn.conv2d(
                    shifted_logits * ytrue,
                    reduce_sum_filter,
                    strides=1,
                    padding="VALID",
                    name="shifted_logits2")  # BS H W 1
                loss = tf.subtract(log_sum_exp, shifted_logits2,
                                   name="loss/sub")  # BS H W 1

                def custom_grad(dy):  # dy is BS H W 1
                    with tf.name_scope("gradients/softmax_cross_entropy"):
                        div = tf.math.truediv(exp_shifted_logits, sum_exp,
                                              name="div")  # BS H W N
                        sub = tf.math.subtract(div, ytrue,
                                               name="sub")  # BS H W N
                        ret = tf.math.multiply(sub, dy, name="mul")
                    return -dy * shifted_logits, ret

                return loss, custom_grad

        return tf.math.reduce_mean(loss_without_mean(ytrue, ypred))

    if validate:
        assert val_images is not None
        assert val_annotations is not None

    if optimizer_name is not None:
        if ignore_zero_class:
            loss_k = masked_categorical_crossentropy
        elif loss_type == 1:
            loss_k = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)
        elif loss_type == 2:
            loss_k = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        else:
            loss_k = optimized_xent_loss_custom_grad

        print(optimizer_name)
        if num_shards > 1:
            optimizer = Adam(lr=LearningRate)
            optimizer_name = hvd.DistributedOptimizer(optimizer)

        model.compile(loss=loss_k,
                      optimizer=optimizer_name,
                      metrics=['accuracy'])

    if checkpoints_path is not None:
        with open(checkpoints_path + "_config.json", "w") as f:
            json.dump(
                {
                    "model_class": model.model_name,
                    "n_classes": n_classes,
                    "input_height": input_height,
                    "input_width": input_width,
                    "output_height": output_height,
                    "output_width": output_width
                }, f)

    if load_weights is not None and len(load_weights) > 0:
        print("Loading weights from ", load_weights)
        status = model.load_weights(load_weights)
        print(status)

    if auto_resume_checkpoint and (checkpoints_path is not None):
        latest_checkpoint = find_latest_checkpoint(checkpoints_path)
        if latest_checkpoint is not None:
            print("Loading the weights from latest checkpoint ",
                  latest_checkpoint)
            model.load_weights(latest_checkpoint)

    if verify_dataset:
        print("Verifying training dataset")
        verified = verify_segmentation_dataset(train_images,
                                               train_annotations, n_classes,
                                               deterministic)
        assert verified
        if validate:
            print("Verifying validation dataset")
            verified = verify_segmentation_dataset(val_images,
                                                   val_annotations, n_classes,
                                                   deterministic)
            assert verified

    if not_cached:
        train_gen = image_segmentation_generator(
            train_images,
            train_annotations,
            batch_size,
            n_classes,
            input_height,
            input_width,
            output_height,
            output_width,
            deterministic,
            do_augment=do_augment,
            augmentation_name=augmentation_name,
            num_shards=num_shards,
            shard_id=shard_id,
            loss_type=loss_type)
    else:
        train_gen = image_segmentation_generator(
            train_images,
            train_annotations,
            1,
            n_classes,
            input_height,
            input_width,
            output_height,
            output_width,
            deterministic,
            do_augment=do_augment,
            augmentation_name=augmentation_name,
            num_shards=num_shards,
            shard_id=shard_id,
            loss_type=loss_type)
        train_gen = cached_image_generator(train_gen, num_shards, shard_id,
                                           batch_size,
                                           len(os.listdir(train_images)),
                                           deterministic)

    callbacks = []
    if num_shards > 1:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())

    callbacks.append(CheckpointsCallback(checkpoints_path))
    #if shard_id == 0:
    #    callbacks.append(ModelCheckpoint( self.checkpoints_path, monitor='loss', verbose=2, mode='min', save_best_only=True, save_weights_only=True))

    if model_dir is not None:
        hparams = {
            "model_name": model,
            "optimizer": optimizer_name,
            "batch_size": batch_size
        }

        if train_engine == 'hpu':
            callbacks += [
                TensorBoardWithHParamsV2(hparams,
                                         log_dir=model_dir,
                                         update_freq=5),
                ExamplesPerSecondKerasHookV2(5,
                                             batch_size=batch_size,
                                             output_dir=model_dir)
            ]

    # Truthiness check: the default is None, which the previous `!= ''` test
    # let through and handed to TensorBoard as its log directory.
    if tb_location:
        tensorboard_callback = TensorBoard(log_dir=tb_location,
                                           histogram_freq=1)
        callbacks.append(tensorboard_callback)
        print("TB:", tb_location)

    if not validate:
        with dump_callback(dump_config):
            start_compilation = time.time()
            # One warm-up step so compilation time is measured separately.
            model.fit(train_gen, steps_per_epoch=1, epochs=1)
            stop_compilation = time.time()
            history = model.fit(train_gen,
                                steps_per_epoch=steps_per_epoch,
                                epochs=epochs,
                                callbacks=callbacks,
                                verbose=1 if shard_id == 0 else 0)
            stop_training = time.time()

        with open('./trainHistoryDict_' + str(shard_id), 'wb') as file_pi:
            pickle.dump(history.history, file_pi)

        avg_time_per_batch = (stop_training -
                              stop_compilation) / (steps_per_epoch * epochs)
        print('Compile time in seconds:',
              (stop_compilation - start_compilation))
        print('Average time per batch in seconds (leaving out compilation):',
              avg_time_per_batch)
        print('Average time per image in seconds (leaving out compilation)',
              avg_time_per_batch / batch_size)
        print('Average images per sec (leaving out compilation):',
              batch_size / avg_time_per_batch)

        if loss_type == 1:
            print('Eval for LOSS_FUNC_TYPE=1 is WIP')
            exit()

        if shard_id == 0:
            if not_cached:
                val_gen = image_segmentation_generator(val_images,
                                                       val_annotations,
                                                       batch_size,
                                                       n_classes,
                                                       input_height,
                                                       input_width,
                                                       output_height,
                                                       output_width,
                                                       deterministic,
                                                       num_shards=1,
                                                       shard_id=shard_id,
                                                       loss_type=loss_type)
            else:
                val_gen = image_segmentation_generator(val_images,
                                                       val_annotations,
                                                       1,
                                                       n_classes,
                                                       input_height,
                                                       input_width,
                                                       output_height,
                                                       output_width,
                                                       deterministic,
                                                       num_shards=1,
                                                       shard_id=shard_id,
                                                       loss_type=loss_type)
                val_gen = cached_image_generator(val_gen, 1, 0, batch_size,
                                                 len(os.listdir(val_images)))

            f1_metric = FBetaScore(num_classes=n_classes)
            model.compile(loss=model.loss,
                          metrics=[
                              tf.keras.metrics.CategoricalAccuracy(
                                  name="categorical_accuracy", dtype=None),
                              f1_metric
                          ])
            test_loss, test_acc, test_f1 = model.evaluate(
                val_gen, steps=(len(os.listdir(val_images)) // batch_size))
            train_loss, train_acc, train_f1 = model.evaluate(
                train_gen, steps=(len(os.listdir(train_images)) // batch_size))
            print(
                f'test loss : {test_loss}, test accuracy : {test_acc}, test f1 : {test_f1}'
            )
            print(
                f'train loss : {train_loss}, train accuracy : {train_acc}, train f1 : {train_f1}'
            )
    else:
        # Value comparison; `is` on an int literal tests object identity.
        assert (
            num_shards == 1), "Only support training with validation with single HPU setup"
        if not_cached:
            val_gen = image_segmentation_generator(val_images,
                                                   val_annotations,
                                                   batch_size,
                                                   n_classes,
                                                   input_height,
                                                   input_width,
                                                   output_height,
                                                   output_width,
                                                   deterministic,
                                                   num_shards=num_shards,
                                                   shard_id=shard_id,
                                                   loss_type=loss_type)
        else:
            val_gen = image_segmentation_generator(val_images,
                                                   val_annotations,
                                                   1,
                                                   n_classes,
                                                   input_height,
                                                   input_width,
                                                   output_height,
                                                   output_width,
                                                   deterministic,
                                                   num_shards=num_shards,
                                                   shard_id=shard_id,
                                                   loss_type=loss_type)
            val_gen = cached_image_generator(val_gen, num_shards, shard_id,
                                             batch_size,
                                             len(os.listdir(val_images)),
                                             deterministic)

        start_compilation = time.time()
        model.fit(train_gen, steps_per_epoch=1, epochs=1)
        stop_compilation = time.time()
        model.fit(train_gen,
                  steps_per_epoch=steps_per_epoch,
                  validation_data=val_gen,
                  validation_steps=val_steps_per_epoch,
                  epochs=epochs,
                  callbacks=callbacks,
                  use_multiprocessing=gen_use_multiprocessing,
                  verbose=1 if shard_id == 0 else 0)
        stop_training = time.time()

        avg_time_per_batch = (stop_training -
                              stop_compilation) / (steps_per_epoch * epochs)
        print('Compile time in seconds:',
              (stop_compilation - start_compilation))
        print('Average time per batch in seconds (leaving out compilation):',
              avg_time_per_batch)
        print('Average time per image in seconds (leaving out compilation)',
              avg_time_per_batch / batch_size)
action='store_true', help='disables evaluation') parser.add_argument('--dump_config', type=str, default=None, help='Side-by-side config file. Internal, do not use.') params = parser.parse_args() print( f"Using TF {tf.__version__}, datasets {datasets.__version__}, transformers {transformers.__version__}" ) # Load Habana module in order to train on HPU (Gaudi) if not params.no_hpu: from habana_frameworks.tensorflow import load_habana_module load_habana_module() # Load dataset assert os.path.exists(params.data_dir), ( f'"{params.data_dir}" does not exist! Use "prepare_data.py" to create required data.' ) train_ds = datasets.load_from_disk( os.path.join(params.data_dir, 'squad', 'train')) valid_ds = datasets.load_from_disk( os.path.join(params.data_dir, 'squad', 'valid')) print("Example data from the mapped dataset: \n", next(iter(train_ds))) tf_train_ds = dataset.to_tf_dataset(train_ds) tf_valid_ds = dataset.to_tf_dataset(valid_ds)
def train_mnist(use_hpu: bool, batch_size: int, use_bfloat: bool, num_epochs: int):
    """Train and evaluate a small CNN on MNIST under a multi-worker strategy.

    Args:
        use_hpu: When true, load the Habana module and build the model under
            ``HPUStrategy``; otherwise ``MultiWorkerMirroredStrategy`` is used.
        batch_size: Per-replica batch size. The dataset batch size is this
            value multiplied by ``strategy.num_replicas_in_sync``.
        use_bfloat: When true (and ``use_hpu`` is true), export
            ``TF_BF16_CONVERSION=full`` before the Habana module is loaded.
        num_epochs: Number of epochs passed to ``model.fit``.
    """
    # Set TF_CONFIG through the file's own helper.
    set_tf_config()

    # Pick the distribution strategy.
    if not use_hpu:
        strategy = tf.distribute.MultiWorkerMirroredStrategy()
    else:
        if use_bfloat:
            # Optional automatic bfloat16 conversion of operations.
            os.environ["TF_BF16_CONVERSION"] = "full"
            print(
                f"TF_BF16_CONVERSION = {os.environ['TF_BF16_CONVERSION']}")

        # Load Habana device support, then use HPUStrategy in place of
        # MultiWorkerMirroredStrategy.
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        from habana_frameworks.tensorflow.distribute import HPUStrategy
        strategy = HPUStrategy()

    # The global batch is the per-replica batch times the replica count.
    batch_size_per_replica = batch_size
    total_batch_size = batch_size_per_replica * strategy.num_replicas_in_sync
    print(
        f"total_batch_size = {batch_size_per_replica} * {strategy.num_replicas_in_sync} workers = {total_batch_size}")

    # tfds.load() may download the dataset when it is not cached, so the load
    # happens in two turns separated by barriers: worker 0 loads in turn 0,
    # every other worker loads in turn 1.
    for turn in range(2):
        is_first_turn = turn == 0
        is_first_worker = worker_index == 0
        if is_first_turn == is_first_worker:
            print("Loading MNIST dataset...")
            mnist_splits, _info = tfds.load(
                name="mnist", with_info=True, as_supervised=True)
        MPI.COMM_WORLD.barrier()

    def preprocess(image, label):
        """Scale pixels by 1/255 as float32 and cast labels to int32."""
        scaled = tf.cast(image, tf.float32) / 255.0
        return scaled, tf.cast(label, tf.int32)

    def _with_shard_policy(split, policy):
        """Return `split` carrying tf.data options with the given auto-shard policy."""
        opts = tf.data.Options()
        opts.experimental_distribute.auto_shard_policy = policy
        return split.with_options(opts)

    shard_policy = tf.data.experimental.AutoShardPolicy

    # Training split: DATA auto-sharding, cached and shuffled.
    train_dataset = (
        _with_shard_policy(mnist_splits["train"], shard_policy.DATA)
        .map(preprocess)
        .cache()
        .shuffle(SHUFFLE_BUFFER_SIZE)
        .batch(total_batch_size)
    )

    # Test split: auto-sharding switched OFF.
    test_dataset = (
        _with_shard_policy(mnist_splits["test"], shard_policy.OFF)
        .map(preprocess)
        .batch(total_batch_size)
    )

    # Create and compile the CNN inside the strategy scope.
    with strategy.scope():
        layers = tf.keras.layers
        model = tf.keras.Sequential([
            layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)),
            layers.MaxPooling2D(),
            layers.Flatten(),
            layers.Dense(64, activation="relu"),
            layers.Dense(10),
        ])
        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True),
            optimizer=tf.keras.optimizers.Adam(),
            metrics=["accuracy"])

    # Train, then evaluate and report.
    print("Calling model.fit()...")
    model.fit(train_dataset, epochs=num_epochs, verbose=2)
    print("Calling model.evaluate()...")
    eval_results = model.evaluate(test_dataset, verbose=2)
    print(f"Evaluation results: {eval_results}")
def main(_):
    """Configure the runtime from flags/gin and run a training experiment.

    Steps, in order: parse gin files and the experiment configuration, load
    the Habana module when HPUs are requested, optionally seed every random
    source for deterministic runs, export the bf16 conversion config, build
    the multi-worker cluster spec from ``MULTI_HLS_IPS`` and the MPI rank and
    size, then create the task under the distribution strategy and hand it
    to ``train_lib.run_experiment``.

    Args:
        _: Unused positional argument supplied by the caller.
    """
    # `os` is imported unconditionally. A function-level import binds a
    # *local* name, so importing it only inside the conditional branches
    # below left `os` unbound (UnboundLocalError) whenever neither branch
    # ran, e.g. num_hpus == 0 without any deterministic data setting.
    import os

    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)

    if params.runtime.num_hpus > 0:
        #TODO: remove when SW-49334 is fixed [SW-49404]
        os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

    if params.task.train_data.deterministic or params.task.validation_data.deterministic:
        # Pin every random source used by the run to a fixed seed.
        os.environ['PYTHONHASHSEED'] = '0'
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        import numpy
        numpy.random.seed(0)
        import tensorflow as tf
        tf.random.set_seed(0)
        tf.compat.v1.set_random_seed(0)
        import random
        random.seed(0)

    if FLAGS.dtype == "bf16":
        print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
        os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    hls_addresses = str(os.environ.get("MULTI_HLS_IPS", "127.0.0.1")).split(",")
    TF_BASE_PORT = 2410
    mpi_rank = comm_rank()
    mpi_size = comm_size()

    # With several HPUs every worker writes into its own sub-directory.
    if params.runtime.num_hpus > 1:
        model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
    else:
        model_dir = FLAGS.model_dir

    # Prepare a comma-separated list of device addresses: for every host,
    # one consecutive port per rank hosted there, starting at TF_BASE_PORT.
    ranks_per_host = mpi_size // len(hls_addresses)
    worker_hosts = ",".join(
        address + ':' + str(TF_BASE_PORT + rank)
        for address in hls_addresses
        for rank in range(ranks_per_host))
    task_index = mpi_rank

    # Configures cluster spec for distribution strategy.
    distribution_utils.configure_cluster(worker_hosts, task_index)

    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        num_hpus=params.runtime.num_hpus,
        tpu_address=params.runtime.tpu)

    # NOTE(review): only task creation is placed under the strategy scope;
    # the nesting was reconstructed from whitespace-collapsed source, confirm.
    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
def main():
    """Parse command-line options, then train or evaluate a ViT model.

    The function reads all of its inputs from the command line. After
    parsing, it configures (in this order) the weights directory, the mixed
    precision policy, the Habana module and recipe cache, determinism and
    seeding, and the distribution strategy. It then builds the datasets and
    the selected ViT variant, and either evaluates a checkpoint (and exits)
    or runs ``model.fit`` and saves the final model on rank 0.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; places where more than one nesting is
    syntactically possible are marked individually.
    """
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument('--dataset', '--dataset_dir', metavar='PATH',
                        default=config.DEFAULT_DATASET_DIR, help='Dataset directory.')
    parser.add_argument('--optimizer', default='sgd',
                        choices=['sgd', 'adam', 'rmsprop'], help='Optimizer.')
    parser.add_argument('-d', '--dtype', default='fp32',
                        choices=['fp32', 'bf16'], help='Data type.')
    parser.add_argument('--batch_size', type=int,
                        default=32, help='Global batch size.')
    parser.add_argument('--lr_sched', default='WarmupCosine', choices=[
                        'linear', 'exp', 'steps', 'constant', 'WarmupCosine'], help='Learning rate scheduler.')
    parser.add_argument('--initial_lr', type=float,
                        default=6e-2, help='Initial learning rate.')
    parser.add_argument('--final_lr', type=float,
                        default=1e-5, help='Final learning rate.')
    parser.add_argument('--warmup_steps', type=int,
                        default=4000, help='Warmup steps.')
    parser.add_argument('--epochs', type=int, default=10,
                        help='Total number of epochs for training.')
    parser.add_argument('--steps_per_epoch', type=int,
                        help='Number of steps for training per epoch, overrides default value.')
    parser.add_argument('--validation_steps', type=int,
                        help='Number of steps for validation, overrides default value.')
    parser.add_argument('--model', default='ViT-B_16',
                        choices=['ViT-B_16', 'ViT-L_16', 'ViT-B_32', 'ViT-L_32'], help='Model.')
    parser.add_argument('--train_subset', default='train',
                        help='Pattern to detect train subset in dataset directory.')
    parser.add_argument('--val_subset', default='validation',
                        help='Pattern to detect validation subset in dataset directory.')
    parser.add_argument('--grad_accum_steps', type=int,
                        default=8, help='Gradient accumulation steps.')
    parser.add_argument('--resume_from_checkpoint_path',
                        metavar='PATH', help='Path to checkpoint to start from.')
    parser.add_argument('--resume_from_epoch', metavar='EPOCH_INDEX',
                        type=int, default=0, help='Initial epoch index.')
    parser.add_argument('--evaluate_checkpoint_path', metavar='PATH',
                        help='Checkpoint path for evaluating the model on --val_subset')
    parser.add_argument('--weights_path', metavar='PATH',
                        help='Path to weights cache directory. ~/.keras is used if not set.')
    parser.add_argument('--deterministic', action='store_true', default=False,
                        help='Enable deterministic behavior, this will also disable data augmentation. --seed must be set.')
    parser.add_argument('--seed', type=int,
                        help='Seed to be used by random functions.')
    parser.add_argument('--device', default='HPU',
                        choices=['CPU', 'HPU'], help='Device type.')
    parser.add_argument('--distributed', action='store_true',
                        default=False, help='Enable distributed training.')
    parser.add_argument('--base_tf_server_port', type=int,
                        default=7850, help='Rank 0 port used by tf.distribute.')
    parser.add_argument('--save_summary_steps', type=int, default=0,
                        help='Steps between saving summaries to TensorBoard.')
    parser.add_argument('--recipe_cache', default='/tmp/vit_recipe_cache',
                        help='Path to recipe cache directory. Set to empty to disable recipe cache. Externally set \'TF_RECIPE_CACHE_PATH\' will override this setting.')
    parser.add_argument(
        '--dump_config', help='Side-by-side config file. Internal, do not use.')
    args = parser.parse_args()

    # Redirect the pretrained-weights cache when requested.
    if args.weights_path is not None:
        config.WEIGHTS_DIR = args.weights_path

    if args.dtype == 'bf16':
        tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

    if args.device == 'HPU':
        if args.distributed:
            os.environ['TF_HCCL_MEMORY_ALLOWANCE_MB'] = '500'
        from habana_frameworks.tensorflow import load_habana_module
        from habana_frameworks.tensorflow.ops.layer_norm import HabanaLayerNormalization
        load_habana_module()
        # Replace the Keras layer class attribute so later lookups of
        # tf.keras.layers.LayerNormalization resolve to the Habana class.
        tf.keras.layers.LayerNormalization = HabanaLayerNormalization

        # NOTE(review): the recipe-cache section is placed inside the HPU
        # branch; it could equally sit at function level, confirm against
        # the original layout.
        # Handle recipe caching.
        recipe_cache = args.recipe_cache
        # An externally set TF_RECIPE_CACHE_PATH wins over --recipe_cache.
        if 'TF_RECIPE_CACHE_PATH' not in os.environ.keys() and recipe_cache:
            os.environ['TF_RECIPE_CACHE_PATH'] = recipe_cache

        # Clear previous recipe cache.
        if not args.distributed or comm_rank() == 0:
            if os.path.exists(recipe_cache) and os.path.isdir(recipe_cache):
                import shutil
                shutil.rmtree(recipe_cache)
        # Wait for rank 0 to remove cache.
        if args.distributed:
            from mpi4py import MPI
            MPI.COMM_WORLD.Barrier()

    # Handle determinism.
    config.DETERMINISTIC = args.deterministic
    config.SEED = args.seed
    if args.deterministic:
        assert args.seed is not None, "Deterministic behavior require seed to be set."
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        config.DATA_AUGMENTATION = False
    # Seeding applies whenever a seed is given, deterministic or not.
    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)
        tf.random.set_seed(args.seed)

    # Handle distribution strategy.
    if args.distributed:
        tf_distribute_config(args.base_tf_server_port)
        if args.device == 'HPU':
            os.environ['HBN_TF_REGISTER_DATASETOPS'] = '1'
            from habana_frameworks.tensorflow.distribute import HPUStrategy
            strategy = HPUStrategy()
        else:
            strategy = tf.distribute.MultiWorkerMirroredStrategy()
    else:
        strategy = tf.distribute.OneDeviceStrategy(f'device:{args.device}:0')

    if not args.distributed or comm_rank() == 0:
        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

    # Local aliases for the parsed options used below.
    num_classes = 1000
    batch_size = args.batch_size
    nb_epoch = args.epochs
    dataset = args.dataset
    resume_from_checkpoint_path = args.resume_from_checkpoint_path
    resume_from_epoch = args.resume_from_epoch
    optim_name = args.optimizer
    initial_lr = args.initial_lr
    final_lr = args.final_lr
    lr_sched = args.lr_sched
    warmup_steps = args.warmup_steps
    model_name = args.model
    grad_accum_steps = args.grad_accum_steps

    ds_train = get_dataset(dataset, args.train_subset, batch_size,
                           is_training=True, distributed=args.distributed)
    ds_valid = get_dataset(dataset, args.val_subset,
                           batch_size, False, distributed=args.distributed)

    # Dropout is zeroed in both model configs when a dump config is given.
    if args.dump_config is not None:
        vit.CONFIG_B['dropout'] = 0.0
        vit.CONFIG_L['dropout'] = 0.0

    # NOTE(review): the scope is taken to cover model creation and compile
    # only; confirm how far the `with` block extends in the original.
    # Load our model
    with strategy.scope():
        image_size = 384
        if model_name == 'ViT-B_16':
            model = vit.vit_b16(
                image_size=image_size,
                activation='softmax',
                pretrained=True,
                include_top=True,
                pretrained_top=False,
                classes=num_classes,
                weights="imagenet21k")
        elif model_name == 'ViT-L_16':
            model = vit.vit_l16(
                image_size=image_size,
                activation='softmax',
                pretrained=True,
                include_top=True,
                pretrained_top=False,
                classes=num_classes,
                weights="imagenet21k")
        elif model_name == 'ViT-B_32':
            model = vit.vit_b32(
                image_size=image_size,
                activation='softmax',
                pretrained=True,
                include_top=True,
                pretrained_top=False,
                classes=num_classes,
                weights="imagenet21k")
        elif model_name == 'ViT-L_32':
            model = vit.vit_l32(
                image_size=image_size,
                activation='softmax',
                pretrained=True,
                include_top=True,
                pretrained_top=False,
                classes=num_classes,
                weights="imagenet21k")
        else:
            # Not reachable through the CLI: argparse `choices` on --model
            # already restricts the value to the four names handled above.
            print(
                "Model is not supported, please use either ViT-B_16 or ViT-L_16 or ViT-B_32 or ViT-L_32")
            exit(0)

        optimizer = get_optimizer(
            optim_name, initial_lr, accumulation_steps=grad_accum_steps, epsilon=1e-2)
        model.compile(optimizer=optimizer, loss='categorical_crossentropy',
                      metrics=['accuracy'], run_eagerly=False)

    # Start training
    # Defaults below are overridden by --steps_per_epoch / --validation_steps.
    # Presumably 1281167 and 50000 are the ImageNet-1k train/validation
    # image counts; confirm.
    steps_per_epoch = 1281167 // batch_size
    if args.steps_per_epoch is not None:
        steps_per_epoch = args.steps_per_epoch
    validation_steps = 50000 // batch_size
    if args.validation_steps is not None:
        validation_steps = args.validation_steps

    total_steps = nb_epoch * steps_per_epoch
    resume_step = resume_from_epoch * steps_per_epoch

    lrate = get_lr_func(nb_epoch, lr_sched, initial_lr,
                        final_lr, warmup_steps, resume_step, total_steps)

    # When the model name ends in '.h5', keep only the file stem up to the
    # first '.' and '-' as the checkpoint prefix.
    save_name = model_name if not model_name.endswith('.h5') else \
        os.path.split(model_name)[-1].split('.')[0].split('-')[0]
    # NOTE(review): monitor='train_loss' is not one of the metric names
    # compiled above (loss / accuracy); confirm the intended key.
    model_ckpt = tf.keras.callbacks.ModelCheckpoint(
        os.path.join(config.SAVE_DIR, save_name) + '-ckpt-{epoch:03d}.h5',
        monitor='train_loss')

    callbacks = [lrate, model_ckpt]

    # NOTE(review): both summary callbacks are taken to be guarded by the
    # same --save_summary_steps check; confirm.
    if args.save_summary_steps > 0:
        callbacks += [TensorBoardWithHParamsV2(
            vars(args), log_dir=config.LOG_DIR, update_freq=args.save_summary_steps)]
        callbacks += [ExamplesPerSecondKerasHookV2(
            output_dir=config.LOG_DIR, every_n_steps=args.save_summary_steps, batch_size=args.batch_size)]

    # Evaluation-only path: load the checkpoint, evaluate, print, and exit
    # without training.
    if (args.evaluate_checkpoint_path is not None):
        model.load_weights(args.evaluate_checkpoint_path)
        results = model.evaluate(x=ds_valid, steps=validation_steps)
        print("Test loss, Test acc:", results)
        exit()

    # --resume_from_epoch defaults to 0, so in practice this is gated by
    # --resume_from_checkpoint_path being set.
    if ((resume_from_epoch is not None) and (resume_from_checkpoint_path is not None)):
        model.load_weights(resume_from_checkpoint_path)

    with dump_callback(args.dump_config):
        model.fit(x=ds_train, y=None,
                  steps_per_epoch=steps_per_epoch,
                  callbacks=callbacks,
                  initial_epoch=resume_from_epoch,
                  epochs=nb_epoch,
                  shuffle=not args.deterministic,
                  # In distributed runs `verbose` receives a bool that is
                  # True only on rank 0.
                  verbose=1 if not args.distributed else comm_rank() == 0,
                  validation_data=(ds_valid, None),
                  validation_steps=validation_steps,
                  )

    # Only rank 0 (or the single process) writes the final model file.
    if not args.distributed or comm_rank() == 0:
        model.save(f'{config.SAVE_DIR}/{save_name}-model-final.h5')
def main(argv):
    """Run EfficientDet in 'train', 'eval' or 'train_and_eval' mode.

    Reads every setting from ``FLAGS``. Before dispatching on ``FLAGS.mode``
    it initialises Horovod and the Habana module when requested, applies
    environment and determinism settings, validates the data-path flags,
    builds the detection config, and derives the shard count and the
    estimator params and config.

    Args:
        argv: Unused; deleted immediately.

    Raises:
        RuntimeError: A required file-pattern or validation flag is missing,
            or ``--num_cores_per_replica`` does not match the product of
            ``--input_partition_dims``.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source; ambiguous spots are marked individually.
    """
    del argv  # Unused.

    # if given an efficientdet ckpt don't use default backbone ckpt
    if FLAGS.backbone_ckpt == BACKBONE_CKPT_DEFAULT_DIR and FLAGS.ckpt is not None:
        print("Using ckpt flag: {}, ignoring default backbone_ckpt: {}".format(
            FLAGS.ckpt, FLAGS.backbone_ckpt))
        FLAGS.backbone_ckpt = None

    if FLAGS.use_horovod is not None:
        # NOTE(review): hvd.rank() is read here before hvd_init() is called
        # a few lines below; confirm the rank is available at this point.
        if FLAGS.dump_all_ranks:
            FLAGS.model_dir += "/worker_" + str(hvd.rank())
        # Environment defaults are only applied when not already set.
        if not 'HOROVOD_CYCLE_TIME' in os.environ:
            os.environ['HOROVOD_CYCLE_TIME'] = '0.5'
        if not 'HABANA_HCCL_COMM_API' in os.environ:
            os.environ['HABANA_HCCL_COMM_API'] = '0'
        hvd_init()

    if not FLAGS.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

        # NOTE(review): this check is placed inside the HPU branch; it
        # could equally sit at function level, confirm.
        if FLAGS.use_horovod:
            assert (horovod_enabled())

    set_env(use_amp=FLAGS.use_amp)

    # deterministic setting
    if FLAGS.sbs_test or FLAGS.deterministic:
        set_deterministic()

    # Check data path
    if FLAGS.mode in (
            'train',
            'train_and_eval') and FLAGS.training_file_pattern is None:
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('eval', 'train_and_eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if not FLAGS.val_json_file and not FLAGS.testdev_dir:
            raise RuntimeError(
                'You must specify --val_json_file or --testdev for evaluation.'
            )

    # Parse and override hparams
    config = hparams_config.get_detection_config(FLAGS.model_name)
    config.override(FLAGS.hparams)

    # The following is for spatial partitioning. `features` has one tensor while
    # `labels` had 4 + (`max_level` - `min_level` + 1) * 2 tensors. The input
    # partition is performed on `features` and all partitionable tensors of
    # `labels`, see the partition logic below.
    # In the TPUEstimator context, the meaning of `shard` and `replica` is the
    # same; following the API, here has mixed use of both.
    if FLAGS.use_spatial_partition:
        # Checks input_partition_dims agrees with num_cores_per_replica.
        if FLAGS.num_cores_per_replica != np.prod(FLAGS.input_partition_dims):
            # NOTE(review): the two adjacent literals concatenate without a
            # space, so the message reads "...product of arrayelements...".
            raise RuntimeError(
                '--num_cores_per_replica must be a product of array'
                'elements in --input_partition_dims.')

        labels_partition_dims = {
            'mean_num_positives': None,
            'source_ids': None,
            'groundtruth_data': None,
            'image_scales': None,
        }
        # The Input Partition Logic: We partition only the partition-able tensors.
        # Spatial partition requires that the to-be-partitioned tensors must have a
        # dimension that is a multiple of `partition_dims`. Depending on the
        # `partition_dims` and the `image_size` and the `max_level` in config, some
        # high-level anchor labels (i.e., `cls_targets` and `box_targets`) cannot
        # be partitioned. For example, when `partition_dims` is [1, 4, 2, 1], image
        # size is 1536, `max_level` is 9, `cls_targets_8` has a shape of
        # [batch_size, 6, 6, 9], which cannot be partitioned (6 % 4 != 0). In this
        # case, the level-8 and level-9 target tensors are not partition-able, and
        # the highest partition-able level is 7.
        image_size = config.get('image_size')
        for level in range(config.get('min_level'),
                           config.get('max_level') + 1):

            def _can_partition(spatial_dim):
                # True when spatial_dim is divisible by every entry of
                # FLAGS.input_partition_dims.
                partitionable_index = np.where(
                    spatial_dim % np.array(FLAGS.input_partition_dims) == 0)
                return len(partitionable_index[0]) == len(
                    FLAGS.input_partition_dims)

            # Spatial size at this pyramid level: image_size halved `level` times.
            spatial_dim = image_size // (2**level)
            if _can_partition(spatial_dim):
                labels_partition_dims['box_targets_%d' %
                                      level] = FLAGS.input_partition_dims
                labels_partition_dims['cls_targets_%d' %
                                      level] = FLAGS.input_partition_dims
            else:
                labels_partition_dims['box_targets_%d' % level] = None
                labels_partition_dims['cls_targets_%d' % level] = None
        num_cores_per_replica = FLAGS.num_cores_per_replica
        input_partition_dims = [
            FLAGS.input_partition_dims, labels_partition_dims
        ]
        num_shards = FLAGS.num_cores // num_cores_per_replica
    else:
        num_cores_per_replica = None
        input_partition_dims = None
        num_shards = FLAGS.num_cores

    # NOTE(review): num_shards computed in either branch above is always
    # overwritten here: Horovod world size when enabled, otherwise 1.
    if horovod_enabled():
        num_shards = hvd.size()
    else:
        num_shards = 1

    params = build_estimator_params('train', config, num_shards)
    # disabling input data scaling/flip manipulations.
    if FLAGS.sbs_test:
        sbs_params = dict(input_rand_hflip=False,
                          train_scale_min=1,
                          train_scale_max=1,
                          dropout_rate=0.0)
        params.update(sbs_params)

    # NOTE(review): tf_random_seed is assigned but not used again in this
    # function.
    tf_random_seed = 0 if FLAGS.deterministic else None
    run_config = build_estimator_config('train', config, num_shards,
                                        num_cores_per_replica,
                                        input_partition_dims)
    # Record the flag values (plus batch_size) alongside the model.
    write_hparams_v1(FLAGS.model_dir, {
        'batch_size': FLAGS.train_batch_size,
        **FLAGS.flag_values_dict()
    })

    model_fn_instance = det_model_fn.get_model_fn(FLAGS.model_name)

    # TPU Estimator
    logging.info(params)
    if FLAGS.mode == 'train':
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=run_config,
                                           params=params)

        # for deterministic input, we pass to dataloader False for not manipulating input data
        is_training = not FLAGS.deterministic
        use_fake_data = FLAGS.use_fake_data or FLAGS.deterministic

        input_fn = dataloader.InputReader(FLAGS.training_file_pattern,
                                          is_training=is_training,
                                          params=params,
                                          use_fake_data=use_fake_data,
                                          is_deterministic=FLAGS.deterministic)
        # Total examples over all epochs divided by the global batch
        # (per-shard batch * shards), plus one step.
        max_steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                        (FLAGS.train_batch_size * num_shards)) + 1

        # for sbs test, train under sbs callbacks
        if FLAGS.sbs_test:
            from TensorFlow.common.debug import dump_callback
            SBS_TEST_CONFIG = os.path.join(
                os.environ['TF_TESTS_ROOT'],
                "tests/tf_training_tests/side_by_side/topologies/efficientdet/dump_config.json"
            )
            with dump_callback(SBS_TEST_CONFIG):
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)
        else:
            # With --ckpt the count is passed as `steps`, otherwise as
            # `max_steps`.
            if FLAGS.ckpt is not None:
                train_estimator.train(input_fn=input_fn, steps=max_steps)
            else:
                train_estimator.train(input_fn=input_fn, max_steps=max_steps)

    elif FLAGS.mode == 'eval':
        eval_params = build_estimator_params('eval', config, num_shards)
        eval_config = build_estimator_config('eval', config, num_shards,
                                             num_cores_per_replica,
                                             input_partition_dims)

        # Eval only runs on CPU or GPU host with batch_size = 1.
        # Override the default options: disable randomization in the input pipeline
        # and don't run on the TPU.
        # Also, disable use_bfloat16 for eval on CPU/GPU.
        eval_estimator = tf.estimator.tpu.TPUEstimator(
            model_fn=model_fn_instance,
            use_tpu=False,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            config=eval_config,
            params=eval_params)

        def terminate_eval():
            # Passed as timeout_fn to checkpoints_iterator below; logs and
            # always returns True.
            logging.info('Terminating eval after %d seconds of no checkpoints',
                         FLAGS.eval_timeout)
            return True

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(
                FLAGS.model_dir,
                min_interval_secs=FLAGS.min_eval_interval,
                timeout=FLAGS.eval_timeout,
                timeout_fn=terminate_eval):

            logging.info('Starting to evaluate.')
            try:
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)
                logging.info('Eval results: %s', eval_results)

                # Terminate eval job when final checkpoint is reached.
                try:
                    # The step is the text after the first '-' in the
                    # checkpoint basename.
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                write_summary(eval_results, ckpt, current_step)
                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
                total_step = int(
                    (FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                    FLAGS.train_batch_size)
                if current_step >= total_step:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'train_and_eval':
        train_params = build_estimator_params('train', config, num_shards)
        train_config = build_estimator_config('train', config, num_shards,
                                              num_cores_per_replica,
                                              input_partition_dims)
        train_estimator = HorovodEstimator(model_fn=model_fn_instance,
                                           model_dir=FLAGS.model_dir,
                                           config=train_config,
                                           params=train_params)
        # The eval estimator is built lazily on the first cycle and reused.
        eval_estimator = None
        for cycle in range(FLAGS.num_epochs):
            logging.info('Starting training cycle, epoch: %d.', cycle)
            train_estimator.train(
                input_fn=dataloader.InputReader(
                    FLAGS.training_file_pattern,
                    is_training=True,
                    use_fake_data=FLAGS.use_fake_data),
                max_steps=(cycle + 1) * int(FLAGS.num_examples_per_epoch /
                                            FLAGS.train_batch_size))

            # synchronization point for all ranks
            if horovod_enabled():
                hvd.allreduce(tf.constant(0))

            logging.info('Starting evaluation cycle, epoch: %d.', cycle)
            # Run evaluation after every epoch.
            if eval_estimator is None:
                eval_params = build_estimator_params('eval', config,
                                                     num_shards)
                eval_config = build_estimator_config('eval', config,
                                                     num_shards,
                                                     num_cores_per_replica,
                                                     input_partition_dims)
                eval_estimator = tf.estimator.tpu.TPUEstimator(
                    model_fn=model_fn_instance,
                    use_tpu=False,
                    train_batch_size=FLAGS.train_batch_size,
                    eval_batch_size=FLAGS.eval_batch_size,
                    config=eval_config,
                    params=eval_params)

            # Evaluation, summary writing and archiving run on rank 0 only.
            if is_rank0():
                eval_results = eval_estimator.evaluate(
                    input_fn=dataloader.InputReader(
                        FLAGS.validation_file_pattern, is_training=False),
                    steps=FLAGS.eval_samples // FLAGS.eval_batch_size)

                checkpoint_path = Path(FLAGS.model_dir)
                last_ckpt = tf.train.latest_checkpoint(str(checkpoint_path),
                                                       latest_filename=None)
                current_step = int(os.path.basename(last_ckpt).split('-')[1])
                write_summary(eval_results, FLAGS.model_dir, current_step)
                logging.info('Evaluation results: %s', eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results, eval_results['AP'], ckpt)
        # NOTE(review): this `pass` has no effect wherever it is nested.
        pass

    else:
        # An unknown mode is only logged; no exception is raised.
        logging.info('Mode not found.')
def setup_module():
    """Load the Habana module via ``htf.load_habana_module()``.

    NOTE(review): the name matches pytest's module-level setup hook, so this
    presumably runs once before the tests in this module; confirm the file
    is collected by pytest.
    """
    htf.load_habana_module()
def main():
    """Entry point: build the horse2zebra pipelines and train/test CycleGAN.

    Everything is driven by ``CycleGANArgParser``: device setup, optional
    bf16 policy and fixed seed, optional Horovod, dataset loading (the local
    master loads first, the other ranks after a synchronisation), model
    construction, and finally the ``train`` and/or ``eval`` steps.
    """
    args = CycleGANArgParser(is_demo=False).parse_args()

    if not args.no_hpu:
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        # NOTE(review): the instance-norm swap is placed inside the HPU
        # branch; the nesting was reconstructed from collapsed source.
        if args.habana_instance_norm:
            tfa.layers.InstanceNormalization = HabanaInstanceNormalization

    if args.data_type == 'bf16':
        tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')

    if args.run_deterministic:
        tf.random.set_seed(12345)

    crop_size = (args.crop, args.crop)
    input_image_shape = crop_size + (3,)
    input_transformation = TrasformInputs(
        orig_img_size=(args.resize, args.resize),
        input_img_size=crop_size)

    horovod = None
    if args.use_horovod:
        from TensorFlow.common.horovod_helpers import hvd as horovod
        horovod.init()
        if args.log_all_workers:
            # Give every worker its own log directory.
            args.logdir = os.path.join(
                args.logdir, f"worker_{horovod.rank()}")

    tfds.disable_progress_bar()

    def _sync_ranks():
        """Broadcast a dummy value so all Horovod ranks meet at this point."""
        if args.use_horovod:
            horovod.broadcast(0, 0)  # nodes synchronization

    # Load the horse-zebra dataset using tensorflow-datasets. The local
    # master loads (and may download) first; the other ranks wait for the
    # synchronisation before loading.
    dataset_name = "cycle_gan/horse2zebra"
    if is_local_master(args.use_horovod, horovod):
        dataset, _ = tfds.load(dataset_name,
                               data_dir=args.dataset_dir,
                               with_info=True,
                               as_supervised=True,
                               download=True)
        _sync_ranks()
    else:
        _sync_ranks()
        dataset, _ = tfds.load(dataset_name,
                               data_dir=args.dataset_dir,
                               with_info=True,
                               as_supervised=True)

    train_horses, train_zebras = dataset["trainA"], dataset["trainB"]
    test_horses, test_zebras = dataset["testA"], dataset["testB"]

    def _training_pipeline(split):
        """Preprocess, cache, shuffle and batch one training split."""
        mapped = split.map(
            input_transformation.preprocess_train_image,
            num_parallel_calls=1 if args.run_deterministic else autotune)
        shuffled = mapped.cache().shuffle(args.buffer)
        return shuffled.batch(args.batch_size, drop_remainder=True)

    # Apply the preprocessing operations to the training data
    train_horses = _training_pipeline(train_horses)
    train_zebras = _training_pipeline(train_zebras)

    train_ds = tf.data.Dataset.zip((train_horses, train_zebras))
    test_ds = (test_horses, test_zebras)

    # Two discriminators and two generators, one per image domain.
    disc_X = get_discriminator(input_image_shape, name="discriminator_X")
    disc_Y = get_discriminator(input_image_shape, name="discriminator_Y")
    gen_X = get_resnet_generator(input_image_shape, name="generator_X")
    gen_Y = get_resnet_generator(input_image_shape, name="generator_Y")

    # Create cycle gan model
    cycle_gan_model = CycleGan(generator_X=gen_X,
                               generator_Y=gen_Y,
                               discriminator_X=disc_X,
                               discriminator_Y=disc_Y)

    # Optionally pick up the latest checkpoint from the log directory.
    if args.restore:
        print(f"Trying to restore checkpoint from {args.logdir}")
        latest = tf.train.latest_checkpoint(args.logdir)
    else:
        latest = None

    if args.train:
        train(args, cycle_gan_model, train_ds, test_ds, latest, horovod)

    # Testing runs on the master only.
    if args.test and is_master(args.use_horovod, horovod):
        eval(args, cycle_gan_model, test_ds, input_transformation, latest)