def get_datasets():
    # Load dataset
    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"),
        batched=True)
    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
    train_features = {x: train_dataset[x] for x in ["input_ids", "attention_mask"]}
    tf_train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_features, train_dataset["label"]))

    # Preprocess test dataset
    test_dataset = test_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"),
        batched=True)
    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])
    test_features = {x: test_dataset[x] for x in ["input_ids", "attention_mask"]}
    tf_test_dataset = tf.data.Dataset.from_tensor_slices(
        (test_features, test_dataset["label"]))

    if SDP_ENABLED:
        tf_train_dataset = tf_train_dataset.shard(sdp.size(), sdp.rank())
        tf_test_dataset = tf_test_dataset.shard(sdp.size(), sdp.rank())

    tf_train_dataset = tf_train_dataset.batch(args.train_batch_size, drop_remainder=True)
    tf_test_dataset = tf_test_dataset.batch(args.eval_batch_size, drop_remainder=True)

    return tf_train_dataset, tf_test_dataset
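This function relies on module-level names (tokenizer, sdp, SDP_ENABLED, args) defined elsewhere in the script. A minimal sketch of the assumed prologue follows; the tokenizer checkpoint name is illustrative only, not taken from the original script:

# Assumed module-level setup (sketch; names are assumptions, not the original script's):
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer

SDP_ENABLED = True  # e.g., derived from a CLI flag or environment variable
if SDP_ENABLED:
    import smdistributed.dataparallel.tensorflow as sdp
    sdp.init()

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # hypothetical model choice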
def train(args):
    # Load data from S3
    # train_dir = os.environ.get('SM_CHANNEL_TRAIN')
    train_dir = args.train
    batch_size = args.batch_size
    dataset = get_train_data(train_dir, batch_size)

    model = get_resnet50(transfer_learning=True)
    loss_fn = tf.losses.SparseCategoricalCrossentropy()
    acc = tf.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    # SMDataParallel: scale the learning rate by dist.size()
    # LR for 8 node run : 0.000125
    # LR for single node run : 0.001
    opt = tf.optimizers.Adam(args.learning_rate * dist.size())

    checkpoint_dir = os.environ['SM_MODEL_DIR']
    checkpoint = tf.train.Checkpoint(model=model, optimizer=opt)

    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            probs = model(images, training=True)
            loss_value = loss_fn(labels, probs)
            acc_value = acc(labels, probs)

        # SMDataParallel: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape
        tape = dist.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        if first_batch:
            # SMDataParallel: Broadcast model and optimizer variables
            dist.broadcast_variables(model.variables, root_rank=0)
            dist.broadcast_variables(opt.variables(), root_rank=0)

        # SMDataParallel: all_reduce call to average the loss and accuracy across workers
        loss_value = dist.oob_allreduce(loss_value)
        acc_value = dist.oob_allreduce(acc_value)

        return loss_value, acc_value

    for epoch in range(args.epochs):
        for batch, (images, labels) in enumerate(dataset.take(10000 // dist.size())):
            loss_value, acc_value = training_step(images, labels, batch == 0)

            if batch % 100 == 0 and dist.rank() == 0:
                logger.info('*** Epoch %d Step #%d Accuracy: %.6f Loss: %.6f ***' %
                            (epoch, batch, acc_value, loss_value))

    # SMDataParallel: Save checkpoints only from the master node.
    if dist.rank() == 0:
        model.save(os.path.join(checkpoint_dir, '1'))
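The snippet assumes dist, logger, and the data/model helpers are defined at module level. A minimal sketch of that setup, following the SMDataParallel GPU-pinning pattern used by other snippets in this section (the helper functions get_train_data and get_resnet50 are left out):

# Assumed prologue (sketch): SMDataParallel init and GPU pinning
import os
import logging
import tensorflow as tf
import smdistributed.dataparallel.tensorflow as dist

logger = logging.getLogger(__name__)

dist.init()
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    # Pin each process to a single GPU using its local rank
    tf.config.experimental.set_visible_devices(gpus[dist.local_rank()], 'GPU')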
def create_train_and_eval_specs(train_input_fn,
                                eval_input_fns,
                                eval_on_train_input_fn,
                                predict_input_fn,
                                train_steps,
                                eval_on_train_data=False,
                                final_exporter_name='Servo',
                                eval_spec_names=None):
    """Creates a `TrainSpec` and `EvalSpec`s.

    Args:
      train_input_fn: Function that produces features and labels on train data.
      eval_input_fns: A list of functions that produce features and labels on
        eval data.
      eval_on_train_input_fn: Function that produces features and labels for
        evaluation on train data.
      predict_input_fn: Function that produces features for inference.
      train_steps: Number of training steps.
      eval_on_train_data: Whether to evaluate model on training data. Default is
        False.
      final_exporter_name: String name given to `FinalExporter`.
      eval_spec_names: A list of string names for each `EvalSpec`.

    Returns:
      Tuple of `TrainSpec` and list of `EvalSpec`s. If `eval_on_train_data` is
      True, the last `EvalSpec` in the list corresponds to training data; the
      remaining `EvalSpec`s evaluate on eval data.
    """
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn,
        max_steps=train_steps // hvd.size(),  # no `steps` attribute; only max_steps is available
        hooks=[hvd.BroadcastGlobalVariablesHook(0)])

    if eval_spec_names is None:
        eval_spec_names = [str(i) for i in range(len(eval_input_fns))]

    eval_specs = []
    for index, (eval_spec_name, eval_input_fn) in enumerate(
            zip(eval_spec_names, eval_input_fns)):
        # Uses final_exporter_name as exporter_name for the first eval spec for
        # backward compatibility.
        if index == 0:
            exporter_name = final_exporter_name
        else:
            exporter_name = '{}_{}'.format(final_exporter_name, eval_spec_name)
        exporter = tf.estimator.FinalExporter(
            name=exporter_name, serving_input_receiver_fn=predict_input_fn)
        eval_specs.append(
            tf.estimator.EvalSpec(
                name=eval_spec_name,
                input_fn=eval_input_fn,
                steps=None,
                exporters=exporter))

    if eval_on_train_data:
        eval_specs.append(
            tf.estimator.EvalSpec(
                name='eval_on_train', input_fn=eval_on_train_input_fn, steps=None))

    return train_spec, eval_specs
def _get_distribution_strategy(self) -> TFDistributionStrategy:
    try:
        import horovod.tensorflow as hvd

        if hvd.size():
            return TFDistributionStrategy.HOROVOD
    except (ModuleNotFoundError, ValueError, ImportError):
        pass

    # smdistributed.dataparallel should be invoked via `mpirun`.
    # It supports EC2 machines with 8 GPUs per machine.
    if check_smdataparallel_env():
        try:
            import smdistributed.dataparallel.tensorflow as smdataparallel

            # The total number of GPUs across all the nodes in the cluster
            if smdataparallel.size():
                return TFDistributionStrategy.SMDATAPARALLEL
        except (ModuleNotFoundError, ValueError, ImportError):
            pass

    strat = tf.distribute.get_strategy()
    if is_mirrored_strategy(strat):
        return TFDistributionStrategy.MIRRORED

    if isinstance(strat, _DefaultDistributionStrategy):
        # single device
        return TFDistributionStrategy.NONE

    # Disable PS till we verify proper support of PS on SM
    # if self.tf_config_json and is_parameter_server_strategy(self.tf_config):
    #     return TFDistributionStrategy.PARAMETER_SERVER

    return TFDistributionStrategy.UNSUPPORTED
def train(mnist_epochs):
    """
    Train CNN
    :param mnist_epochs: number of training steps to run for
    :return: None
    """
    for batch, (images, labels) in enumerate(
            dataset.take(mnist_epochs // dist.size())):
        loss_value = training_step(images, labels, batch == 0)

        # Print the loss every 50 batches on the master worker
        if batch % 50 == 0 and dist.rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))
def get_dataset(
    tokenizer: PreTrainedTokenizer,
    processor: SquadProcessor,
    data_dir: str,
    filename: str,
    per_gpu_batch_size: int,
    shard: bool,
    drop_remainder: bool,
    shuffle: bool = True,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    evaluate: bool = False,
    return_raw_features: bool = False,
    repeat: bool = False,
) -> tf.data.Dataset:
    # Convert the data from a JSON file into a tf.data.Dataset
    # This function should also work to fetch the val_dataset
    if evaluate:
        examples: List[SquadExample] = processor.get_dev_examples(data_dir, filename=filename)
    else:
        examples: List[SquadExample] = processor.get_train_examples(data_dir, filename=filename)

    # Returns a list of SquadFeatures when return_dataset is None,
    # otherwise a tf.data.Dataset of the converted features.
    dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset=None if return_raw_features else "tf",
        threads=16,
    )
    if return_raw_features:
        return dataset
    else:
        if shard:
            dataset = dataset.shard(smddp.size(), smddp.rank())
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
        if repeat:
            dataset = dataset.repeat()
        dataset = dataset.batch(per_gpu_batch_size, drop_remainder=drop_remainder)
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1000, reshuffle_each_iteration=True)
        return dataset
def get_tfrecords_input_fn(filenames, batch_size, height, width, training,
                           distort_color, num_threads, deterministic):
    shuffle_buffer_size = 4096

    if deterministic:
        if hvd_utils.is_using_hvd():
            seed = 13 * (1 + hvd.rank())
        else:
            seed = 13
    else:
        seed = None

    ds = tf.data.Dataset.from_tensor_slices(filenames)

    if hvd_utils.is_using_hvd() and training:
        ds = ds.shard(hvd.size(), hvd.rank())

    ds = ds.apply(
        tf.data.experimental.parallel_interleave(
            tf.data.TFRecordDataset,
            cycle_length=10,
            block_length=8,
            sloppy=not deterministic,
            prefetch_input_elements=16))

    counter = tf.data.Dataset.range(sys.maxsize)
    ds = tf.data.Dataset.zip((ds, counter))

    def preproc_func(record, counter_):
        return image_processing.preprocess_image_record(record, height, width,
                                                        _NUM_CHANNELS, training)

    if training:
        ds = ds.apply(
            tf.data.experimental.shuffle_and_repeat(
                buffer_size=shuffle_buffer_size, seed=seed))
    else:
        ds = ds.repeat()

    ds = ds.apply(
        tf.data.experimental.map_and_batch(
            map_func=preproc_func,
            num_parallel_calls=num_threads,
            batch_size=batch_size,
            drop_remainder=True,
        ))

    ds = ds.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return ds
def __init__(self, runtime_config, model_fn):
    super(EstimatorExecuter, self).__init__(runtime_config, model_fn)

    if MPI_is_distributed():
        os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
        os.environ['HOROVOD_NUM_NCCL_STREAMS'] = '1'
        # os.environ['HOROVOD_AUTOTUNE'] = '2'
        logging.info("SageMaker Distributed Data Parallel successfully initialized ...")

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = '1' if not MPI_is_distributed() else str(hvd.size())
    os.environ['TF_SYNC_ON_FINISH'] = '0'
def _get_session_config(mode, use_xla, use_dali, gpu_memory_fraction, gpu_id=0):
    if mode not in ['train', 'validation', 'benchmark', 'inference']:
        raise ValueError(
            "Unknown mode received: %s (allowed: 'train', 'validation', 'benchmark', 'inference')"
            % mode)

    # Limit available GPU memory (tune the size)
    if use_dali:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction)
        config = tf.ConfigProto(gpu_options=gpu_options)
        config.gpu_options.allow_growth = False
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

    config.allow_soft_placement = True
    config.log_device_placement = False
    config.gpu_options.visible_device_list = str(gpu_id)

    if hvd_utils.is_using_hvd():
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    if use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    config.gpu_options.force_gpu_compatible = True  # Force pinned memory

    # Bug - disable bn+relu fusion
    from tensorflow.core.protobuf import rewriter_config_pb2
    config.graph_options.rewrite_options.remapping = (
        rewriter_config_pb2.RewriterConfig.OFF)

    if mode == 'train':
        config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads

        if hvd_utils.is_using_hvd():
            config.inter_op_parallelism_threads = max(
                2, (multiprocessing.cpu_count() // hvd.size()) - 2)
        else:
            config.inter_op_parallelism_threads = 4

    return config
def _get_num_workers(self):
    self._assert_distribution_strategy()
    if self.distribution_strategy == TFDistributionStrategy.HOROVOD:
        import horovod.tensorflow as hvd

        return hvd.size()
    elif self.distribution_strategy == TFDistributionStrategy.SMDATAPARALLEL:
        import smdistributed.dataparallel.tensorflow as smdataparallel

        return smdataparallel.size()
    elif self.distribution_strategy == TFDistributionStrategy.MIRRORED:
        strategy = tf.distribute.get_strategy()
        return strategy.num_replicas_in_sync
    elif self.distribution_strategy == TFDistributionStrategy.PARAMETER_SERVER:
        return get_num_workers_from_tf_config(self.tf_config_json)
    elif self.distribution_strategy == TFDistributionStrategy.NONE:
        return 1
    elif self.distribution_strategy == TFDistributionStrategy.UNSUPPORTED:
        return 1
def create_model(mnist_learning_rate):
    """
    Creates a new keras model for learning
    :param mnist_learning_rate: learning rate for the Adam Optimizer
    :return: model, loss function, and optimizer
    """
    # neural net
    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
        tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
        tf.keras.layers.Dropout(0.25),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(10, activation='softmax')
    ])
    mnist_loss = tf.losses.SparseCategoricalCrossentropy()

    # learning rate is proportional to the number of workers
    mnist_optimizer = tf.optimizers.Adam(mnist_learning_rate * dist.size())
    return mnist_model, mnist_loss, mnist_optimizer
def _get_num_workers(self):
    self._assert_distribution_strategy()
    if self.distribution_strategy == TFDistributionStrategy.HOROVOD:
        if _smp_imported and smp.core.initialized:
            # when model parallel is being used, there will be multiple hvd
            # process groups, hence use smp.size()
            return smp.size()
        import horovod.tensorflow as hvd

        return hvd.size()
    elif self.distribution_strategy == TFDistributionStrategy.SMDATAPARALLEL:
        import smdistributed.dataparallel.tensorflow as smdataparallel

        return smdataparallel.size()
    elif self.distribution_strategy == TFDistributionStrategy.MIRRORED:
        strategy = tf.distribute.get_strategy()
        return strategy.num_replicas_in_sync
    elif self.distribution_strategy == TFDistributionStrategy.PARAMETER_SERVER:
        return get_num_workers_from_tf_config(self.tf_config_json)
    elif self.distribution_strategy == TFDistributionStrategy.NONE:
        return 1
    elif self.distribution_strategy == TFDistributionStrategy.UNSUPPORTED:
        return 1
def __init__(self,
             filenames,
             idx_filenames,
             height,
             width,
             batch_size,
             num_threads,
             dtype=tf.uint8,
             dali_cpu=True,
             deterministic=False,
             training=False):
    device_id = hvd.local_rank()
    shard_id = hvd.rank()
    num_gpus = hvd.size()
    pipe = HybridPipe(
        tfrec_filenames=filenames,
        tfrec_idx_filenames=idx_filenames,
        height=height,
        width=width,
        batch_size=batch_size,
        num_threads=num_threads,
        device_id=device_id,
        shard_id=shard_id,
        num_gpus=num_gpus,
        deterministic=deterministic,
        dali_cpu=dali_cpu,
        training=training)

    daliop = dali_tf.DALIIterator()

    with tf.device("/gpu:0"):
        self.images, self.labels = daliop(
            pipeline=pipe,
            shapes=[(batch_size, height, width, 3), (batch_size, 1)],
            dtypes=[tf.float32, tf.int64],
            device_id=device_id)
def _get_global_batch_size(worker_batch_size):
    if hvd_utils.is_using_hvd():
        return worker_batch_size * hvd.size()
    else:
        return worker_batch_size
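For context, a hedged usage sketch: the global batch size returned here is typically paired with a learning rate scaled by the same world size, as the Adam snippets elsewhere in this section do. The variable names and values below are illustrative, not from the original script:

# Hypothetical usage (sketch): pair the global batch size with a linearly
# scaled learning rate; assumes hvd.init() has already been called.
worker_batch_size = 256  # per-GPU batch size (illustrative)
global_batch_size = _get_global_batch_size(worker_batch_size)
num_workers = hvd.size() if hvd_utils.is_using_hvd() else 1
learning_rate = 0.001 * num_workers  # linear LR scaling, as in the Adam examples above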
parser.add_argument('--rank', type=int, default=0)

# SageMaker Container environment
parser.add_argument('--model_dir', type=str, default='../model')
parser.add_argument('--data_dir', type=str, default='../data')

args = parser.parse_args()

try:
    args.model_dir = os.environ['SM_MODEL_DIR']
    args.data_dir = os.environ['SM_CHANNEL_TRAINING']
except KeyError:
    print("The model starts training on the local host without a SageMaker TrainingJob.")
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir)

########################################################
#######  2. SageMaker Distributed Data Parallel  #######
#######  - Get the GPU count and rank numbers    #######
########################################################
args.size = smdp.size()              # total number of GPUs across all hosts
args.rank = smdp.rank()              # global rank across all hosts
args.local_rank = smdp.local_rank()  # local rank within this host
########################################################

train(args)
def MPI_size():
    return hr.size()
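Presumably `hr` here aliases the smdistributed.dataparallel TensorFlow module, matching the "herring" codename that also appears as `FLAGS.herring` later in this section; a sketch of the assumed import, offered as an assumption rather than a confirmed detail:

# Assumed import behind `hr` (an assumption based on the "herring" codename
# used elsewhere in this section; verify against the actual module prologue):
import smdistributed.dataparallel.tensorflow as hr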
def main(unused_argv):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"
    else:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "0"

    # Set seed to reduce randomness
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    hvd.init()

    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')

    session_config = tf.ConfigProto()
    session_config.gpu_options.per_process_gpu_memory_fraction = 0.9
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    if FLAGS.allow_xla:
        session_config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

    model_dir = FLAGS.model_dir if hvd.rank() == 0 else None
    config = tf.estimator.RunConfig(
        tf_random_seed=(FLAGS.seed + hvd.rank()),
        model_dir=model_dir,
        session_config=session_config)

    train_and_eval_dict = model_lib.create_estimator_and_inputs(
        run_config=config,
        eval_count=FLAGS.eval_count,
        hparams=model_hparams.create_hparams(FLAGS.hparams_overrides),
        pipeline_config_path=FLAGS.pipeline_config_path,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples))
    estimator = train_and_eval_dict['estimator']
    train_input_fn = train_and_eval_dict['train_input_fn']
    eval_input_fns = train_and_eval_dict['eval_input_fns']
    eval_on_train_input_fn = train_and_eval_dict['eval_on_train_input_fn']
    predict_input_fn = train_and_eval_dict['predict_input_fn']
    train_steps = train_and_eval_dict['train_steps']

    if FLAGS.checkpoint_dir:
        if FLAGS.eval_training_data:
            name = 'training_data'
            input_fn = eval_on_train_input_fn
        else:
            name = 'validation_data'
            # The first eval input will be evaluated.
            input_fn = eval_input_fns[0]
        if FLAGS.run_once:
            estimator.evaluate(
                input_fn,
                steps=None,
                checkpoint_path=tf.train.latest_checkpoint(FLAGS.checkpoint_dir))
        else:
            model_lib.continuous_eval(estimator, FLAGS.checkpoint_dir, input_fn,
                                      train_steps, name)
    else:
        train_spec, eval_specs = model_lib.create_train_and_eval_specs(
            train_input_fn,
            eval_input_fns,
            eval_on_train_input_fn,
            predict_input_fn,
            train_steps,
            eval_on_train_data=False)

        train_hooks = [
            hvd.BroadcastGlobalVariablesHook(0),
            DLLoggerHook(hvd.size() * train_and_eval_dict['train_batch_size'],
                         hvd.rank())
        ]
        eval_hooks = []

        for x in range(FLAGS.eval_count):
            estimator.train(
                train_input_fn,
                hooks=train_hooks,
                steps=train_steps // FLAGS.eval_count)
            if hvd.rank() == 0 and not FLAGS.train_only:
                eval_input_fn = eval_input_fns[0]
                results = estimator.evaluate(
                    eval_input_fn, steps=None, hooks=eval_hooks)
def main(_):
    # lazy compilation causes memory fragmentation for BERT, leading to OOM
    os.environ["TF_XLA_FLAGS"] = "--tf_xla_enable_lazy_compilation=false"

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    dllogging = utils.dllogger_class.dllogger_class(FLAGS.dllog_path)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # Set seeds to reduce randomness
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    if FLAGS.herring:
        import smdistributed.dataparallel.tensorflow as hvd
        hvd.init()

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tf.io.gfile.makedirs(FLAGS.output_dir)

    input_files = []
    for input_file_dir in FLAGS.input_files_dir.split(","):
        input_files.extend(tf.io.gfile.glob(os.path.join(input_file_dir, "*")))

    if FLAGS.herring and len(input_files) < hvd.size():
        raise ValueError("Input files must be sharded")
    if FLAGS.amp and FLAGS.manual_fp16:
        raise ValueError("AMP and manual mixed-precision training are both activated!")

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2

    config = tf.compat.v1.ConfigProto()
    if FLAGS.herring:
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        if hvd.rank() == 0:
            tf.compat.v1.logging.info("***** Configuration *****")
            for key in FLAGS.__flags.keys():
                tf.compat.v1.logging.info('  {}: {}'.format(key, getattr(FLAGS, key)))
            tf.compat.v1.logging.info("**************************")
        # config.gpu_options.per_process_gpu_memory_fraction = 0.7

    if FLAGS.use_xla:
        config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
        config.graph_options.rewrite_options.memory_optimization = rewriter_config_pb2.RewriterConfig.NO_MEM_OPT

        if FLAGS.amp:
            tf.enable_resource_variables()

    run_config = tf.estimator.RunConfig(
        tf_random_seed=(FLAGS.seed if not FLAGS.herring else (FLAGS.seed + hvd.rank())),
        model_dir=FLAGS.output_dir,
        session_config=config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps
        if not FLAGS.herring or hvd.rank() == 0 else None,
        save_summary_steps=FLAGS.save_checkpoints_steps
        if not FLAGS.herring or hvd.rank() == 0 else None,
        # This variable controls how often estimator reports examples/sec.
        # Default value is every 100 steps.
        # When --report_loss is True, we set to very large value to prevent
        # default info reporting from estimator.
        # Ideally we should set it to None, but that does not work.
        log_step_count_steps=10000 if FLAGS.report_loss else 100)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate
        if not FLAGS.herring else FLAGS.learning_rate * hvd.size(),
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_one_hot_embeddings=False,
        hvd=None if not FLAGS.herring else hvd)

    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    if FLAGS.do_train:
        training_hooks = []
        if FLAGS.herring and hvd.size() > 1:
            training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))
        if not FLAGS.herring or hvd.rank() == 0:
            global_batch_size = (
                FLAGS.train_batch_size * FLAGS.num_accumulation_steps
                if not FLAGS.herring else
                FLAGS.train_batch_size * FLAGS.num_accumulation_steps * hvd.size())
            training_hooks.append(
                _LogSessionRunHook(global_batch_size, FLAGS.num_accumulation_steps,
                                   dllogging, FLAGS.display_loss_steps,
                                   FLAGS.save_checkpoints_steps, FLAGS.report_loss))

        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)

        train_input_fn = input_fn_builder(
            input_files=input_files,
            batch_size=FLAGS.train_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True,
            hvd=None if not FLAGS.herring else hvd)

        train_start_time = time.time()
        estimator.train(
            input_fn=train_input_fn,
            hooks=training_hooks,
            max_steps=FLAGS.num_train_steps)
        train_time_elapsed = time.time() - train_start_time

        if not FLAGS.herring or hvd.rank() == 0:
            train_time_wo_overhead = training_hooks[-1].total_time
            avg_sentences_per_second = (
                FLAGS.num_train_steps * global_batch_size * 1.0 / train_time_elapsed)
            ss_sentences_per_second = (
                (FLAGS.num_train_steps - training_hooks[-1].skipped)
                * global_batch_size * 1.0 / train_time_wo_overhead)

            tf.compat.v1.logging.info("-----------------------------")
            tf.compat.v1.logging.info(
                "Total Training Time = %0.2f for Sentences = %d",
                train_time_elapsed, FLAGS.num_train_steps * global_batch_size)
            tf.compat.v1.logging.info(
                "Total Training Time W/O Overhead = %0.2f for Sentences = %d",
                train_time_wo_overhead,
                (FLAGS.num_train_steps - training_hooks[-1].skipped) * global_batch_size)
            tf.compat.v1.logging.info(
                "Training Throughput Average (sentences/sec) with overhead = %0.2f",
                avg_sentences_per_second)
            tf.compat.v1.logging.info(
                "Training Throughput Average (sentences/sec) = %0.2f",
                ss_sentences_per_second)
            dllogging.logger.log(step=(),
                                 data={"throughput_train": ss_sentences_per_second},
                                 verbosity=Verbosity.DEFAULT)
            tf.compat.v1.logging.info("-----------------------------")

    if FLAGS.do_eval and (not FLAGS.herring or hvd.rank() == 0):
        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_files = []
        for eval_file_dir in FLAGS.eval_files_dir.split(","):
            eval_files.extend(tf.io.gfile.glob(os.path.join(eval_file_dir, "*")))

        eval_input_fn = input_fn_builder(
            input_files=eval_files,
            batch_size=FLAGS.eval_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False,
            hvd=None if not FLAGS.herring else hvd)

        eval_hooks = [LogEvalRunHook(FLAGS.eval_batch_size)]
        eval_start_time = time.time()
        result = estimator.evaluate(
            input_fn=eval_input_fn, steps=FLAGS.max_eval_steps, hooks=eval_hooks)
        eval_time_elapsed = time.time() - eval_start_time

        time_list = eval_hooks[-1].time_list
        time_list.sort()
        # Removing outliers (init/warmup) in throughput computation.
        eval_time_wo_overhead = sum(time_list[:int(len(time_list) * 0.99)])
        num_sentences = (int(len(time_list) * 0.99)) * FLAGS.eval_batch_size

        ss_sentences_per_second = num_sentences * 1.0 / eval_time_wo_overhead

        tf.compat.v1.logging.info("-----------------------------")
        tf.compat.v1.logging.info(
            "Total Inference Time = %0.2f for Sentences = %d",
            eval_time_elapsed, eval_hooks[-1].count * FLAGS.eval_batch_size)
        tf.compat.v1.logging.info(
            "Total Inference Time W/O Overhead = %0.2f for Sentences = %d",
            eval_time_wo_overhead, num_sentences)
        tf.compat.v1.logging.info("Summary Inference Statistics on EVAL set")
        tf.compat.v1.logging.info("Batch size = %d", FLAGS.eval_batch_size)
        tf.compat.v1.logging.info("Sequence Length = %d", FLAGS.max_seq_length)
        tf.compat.v1.logging.info("Precision = %s", "fp16" if FLAGS.amp else "fp32")
        tf.compat.v1.logging.info(
            "Inference Throughput Average (sentences/sec) = %0.2f",
            ss_sentences_per_second)
        dllogging.logger.log(step=(),
                             data={"throughput_val": ss_sentences_per_second},
                             verbosity=Verbosity.DEFAULT)
        tf.compat.v1.logging.info("-----------------------------")

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def get_dataset_from_tfrecords(
    *,
    model_type: str,
    filenames: List[str],
    per_gpu_batch_size: int,
    max_seq_length: int,
    max_predictions_per_seq: int = None,
    buffer_size: int = 1000,
    shard: bool = True,
) -> "tf.data.Dataset":
    """
    Reads the dataset from TFRecords and returns it.
    Returns a dataset that includes batching, but not gradient accumulation.
    """

    def _parse_function(example_proto):
        # Parse the input `tf.Example` proto using the dictionary above.
        return tf.io.parse_single_example(example_proto, name_to_features)

    if model_type in ["albert", "bert"]:
        assert max_predictions_per_seq is not None, "Pass --max_predictions_per_seq"
        name_to_features = {
            "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),  # corresponds to input_ids
            "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),  # corresponds to attention_mask
            "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),  # corresponds to token_type_ids
            "masked_lm_positions": tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.int64
            ),  # The position in the sequence that is masked, in range [0, max_seq_length]. 0 signifies a pad.
            "masked_lm_ids": tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.int64
            ),  # The token id that is masked, in range [0, vocab_size]. 0 signifies a pad.
            "masked_lm_weights": tf.io.FixedLenFeature(
                [max_predictions_per_seq], tf.float32
            ),  # 1 if useful, 0 signifies a pad token
            "next_sentence_labels": tf.io.FixedLenFeature([1], tf.int64),
        }
    elif model_type in ["electra"]:
        name_to_features = {
            "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),  # corresponds to input_ids
            "token_type_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),  # corresponds to token_type_ids
            "attention_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),  # corresponds to attention_mask
        }
    else:
        raise ValueError(
            f"model_type={model_type} must be one of ['albert', 'bert', 'electra']")

    # Example input pipeline here:
    # https://github.com/NVIDIA/DeepLearningExamples/blob/master/TensorFlow/LanguageModeling/BERT/run_pretraining.py#L443
    assert len(filenames) > 0, "Filenames is an empty list"

    # Shard and shuffle the filenames
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    if shard:
        import smdistributed.dataparallel.tensorflow as smddp

        dataset = dataset.shard(smddp.size(), smddp.rank())
    dataset = dataset.shuffle(buffer_size=len(filenames), reshuffle_each_iteration=True)
    dataset = dataset.repeat()

    # `cycle_length` is the number of parallel files that get read
    num_cpu_threads = 2 * 96
    cycle_length = min(num_cpu_threads, len(filenames))

    # file_to_dataset_func = lambda file: tf.data.TFRecordDataset(file).map(_parse_function)
    file_to_dataset_func = lambda file: tf.data.TFRecordDataset(file)
    dataset = dataset.interleave(
        file_to_dataset_func,
        cycle_length=cycle_length,
        block_length=1,
        num_parallel_calls=cycle_length,
    )
    # Map and batch will be automatically fused together, see
    # https://www.tensorflow.org/api_docs/python/tf/data/experimental/map_and_batch
    dataset = dataset.map(_parse_function, num_parallel_calls=num_cpu_threads)
    dataset = dataset.shuffle(buffer_size=buffer_size, reshuffle_each_iteration=True)
    dataset = dataset.batch(per_gpu_batch_size, drop_remainder=True)
    # Shuffle the batches
    dataset = dataset.shuffle(buffer_size=buffer_size, reshuffle_each_iteration=True)
    return dataset
def main(args):
    # Hyper-parameters
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer
    model_type = args.model_type

    # SageMaker options
    training_dir = args.train
    validation_dir = args.validation
    eval_dir = args.eval

    # Change: Initialize SMDataParallel and get the size of the cluster
    smdp.init()
    size = smdp.size()

    # Change: Pin GPU to local process (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        # SMDataParallel: Pin GPUs to a single SMDataParallel process
        # [use SMDataParallel local_rank() API]
        tf.config.experimental.set_visible_devices(gpus[smdp.local_rank()], 'GPU')

    # Get dataset
    train_dataset = get_dataset(training_dir + '/train.tfrecords', batch_size)
    train_dataset = train_dataset.take(NUM_TRAIN_IMAGES // size).shuffle(10000)
    val_dataset = get_dataset(validation_dir + '/validation.tfrecords', batch_size)
    eval_dataset = get_dataset(eval_dir + '/eval.tfrecords', batch_size)

    # Load model
    model = get_model(model_type)

    # Optimizer
    if optimizer.lower() == 'adam':
        opt = Adam(lr=lr * size, decay=weight_decay)
    elif optimizer.lower() == 'rmsprop':
        opt = RMSprop(lr=lr * size, decay=weight_decay)
    else:
        opt = SGD(lr=lr * size, decay=weight_decay, momentum=momentum)

    # Loss function
    loss = tf.keras.losses.CategoricalCrossentropy()

    # Metrics to track
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
    val_loss = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy = tf.keras.metrics.CategoricalAccuracy(name='val_accuracy')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

    # Training step
    @tf.function
    def training_step(images, labels, first_batch):
        with tf.GradientTape() as tape:
            train_pred = model(images, training=True)
            loss_value = loss(labels, train_pred)

        # Change: Wrap tf.GradientTape with SMDataParallel's DistributedGradientTape
        tape = smdp.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        if first_batch:
            # Change: Broadcast model and optimizer variables
            smdp.broadcast_variables(model.variables, root_rank=0)
            smdp.broadcast_variables(opt.variables(), root_rank=0)

        # Change: all_reduce call to average the loss across workers
        train_loss_value = smdp.oob_allreduce(loss_value)
        train_loss(train_loss_value)
        train_accuracy(labels, train_pred)
        return

    # Test step
    @tf.function
    def test_step(images, labels):
        val_pred = model(images, training=False)
        val_loss_value = loss(labels, val_pred)

        val_loss(val_loss_value)
        val_accuracy(labels, val_pred)
        return

    if smdp.rank() == 0:
        tb_log_dir = '/opt/ml/output/tensorboard/'
        train_summary_writer = tf.summary.create_file_writer(tb_log_dir)
        test_summary_writer = tf.summary.create_file_writer(tb_log_dir)

    # Training loop
    for epoch in range(epochs):
        train_loss.reset_states()
        train_accuracy.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()

        start_time = time.time()
        for batch, (images, labels) in enumerate(train_dataset):
            training_step(images, labels, batch == 0)
        epoch_time = time.time() - start_time

        for images, labels in val_dataset:
            test_step(images, labels)

        if smdp.rank() == 0:
            with train_summary_writer.as_default():
                tf.summary.scalar('train_loss', train_loss.result(), step=epoch)
                tf.summary.scalar('train_accuracy', train_accuracy.result(), step=epoch)

            with test_summary_writer.as_default():
                tf.summary.scalar('val_loss', val_loss.result(), step=epoch)
                tf.summary.scalar('val_accuracy', val_accuracy.result(), step=epoch)

            print(f'Epoch: {epoch + 1}, '
                  f'Epoch duration: {epoch_time} sec, '
                  f'Training loss: {train_loss.result()}, '
                  f'Training accuracy: {train_accuracy.result() * 100}, '
                  f'Validation Loss: {val_loss.result()}, '
                  f'Validation Accuracy: {val_accuracy.result() * 100}')

    for images, labels in eval_dataset:
        test_pred = model(images, training=False)
        test_loss_value = loss(labels, test_pred)

        test_loss(test_loss_value)
        test_accuracy(labels, test_pred)

    print('====== Test Results ======')
    print(f'Test loss: {test_loss.result()}, '
          f'Test accuracy: {test_accuracy.result() * 100}')
    print('====== End of training ======')

    # Change: Save checkpoints only from the master node.
    if smdp.rank() == 0:
        model.save(os.path.join(os.environ["SM_MODEL_DIR"], '1'))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
loss = tf.losses.SparseCategoricalCrossentropy()

# LR for 8 node run : 0.000125
# LR for single node run : 0.001
opt = tf.optimizers.Adam(0.000125 * dist.size())

checkpoint_dir = './checkpoints'
checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)


@tf.function
def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    tape = dist.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation="relu"),
    tf.keras.layers.Conv2D(64, [3, 3], activation="relu"),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation="softmax"),
])
loss = tf.losses.SparseCategoricalCrossentropy()
opt = tf.optimizers.Adam(0.001 * smdataparallel.size())

checkpoint_dir = "/tmp/checkpoints"
checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt)


def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = mnist_model(images, training=True)
        loss_value = loss(labels, probs)

    # Create a new DistributedGradientTape, which uses TensorFlow's GradientTape
    # under the hood, using an AllReduce to combine gradient values before
    # applying gradients to model weights.
    tape = smdataparallel.DistributedGradientTape(tape)

    grads = tape.gradient(loss_value, mnist_model.trainable_variables)
    opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
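The snippet stops short of the driver loop. A minimal sketch of the rest, following the first-batch broadcast pattern used by the other SMDataParallel snippets in this section; the `dataset` variable and the step count are assumptions:

# Sketch of the driver loop (assumes smdataparallel.init() already ran and
# `dataset` is a batched tf.data.Dataset, as in the neighboring snippets):
for batch, (images, labels) in enumerate(dataset.take(10000 // smdataparallel.size())):
    training_step(images, labels, batch == 0)
    if batch == 0:
        # After the first optimizer step, broadcast model and optimizer state
        # from rank 0 so every worker starts from identical values.
        smdataparallel.broadcast_variables(mnist_model.variables, root_rank=0)
        smdataparallel.broadcast_variables(opt.variables(), root_rank=0)

# Save checkpoints only from rank 0 to avoid write conflicts.
if smdataparallel.rank() == 0:
    checkpoint.save(checkpoint_dir + "/ckpt")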
def train(self,
          iter_unit,
          num_iter,
          run_iter,
          batch_size,
          warmup_steps=50,
          weight_decay=1e-4,
          lr_init=0.1,
          lr_warmup_epochs=5,
          momentum=0.9,
          log_every_n_steps=1,
          loss_scale=256,
          label_smoothing=0.0,
          mixup=0.0,
          use_cosine_lr=False,
          use_static_loss_scaling=False,
          is_benchmark=False,
          quantize=False,
          symmetric=False,
          quant_delay=0,
          finetune_checkpoint=None,
          use_final_conv=False,
          use_qdq=False):
    if iter_unit not in ["epoch", "batch"]:
        raise ValueError(
            '`iter_unit` value is unknown: %s (allowed: ["epoch", "batch"])' % iter_unit)

    if self.run_hparams.data_dir is None and not is_benchmark:
        raise ValueError('`data_dir` must be specified for training!')

    if self.run_hparams.use_tf_amp or self.run_hparams.dtype == tf.float16:
        if use_static_loss_scaling:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "0"
        else:
            os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_LOSS_SCALING"] = "1"
    else:
        use_static_loss_scaling = False  # Make sure it hasn't been set to True on FP32 training

    num_gpus = 1 if not hvd_utils.is_using_hvd() else hvd.size()
    global_batch_size = batch_size * num_gpus

    if self.run_hparams.data_dir is not None:
        filenames, num_samples, num_steps, num_epochs, num_decay_steps = \
            runner_utils.parse_tfrecords_dataset(
                data_dir=self.run_hparams.data_dir,
                mode="train",
                iter_unit=iter_unit,
                num_iter=num_iter,
                global_batch_size=global_batch_size,
            )
        steps_per_epoch = num_steps / num_epochs
    else:
        num_epochs = 1
        num_steps = num_iter
        steps_per_epoch = num_steps
        num_decay_steps = num_steps
        num_samples = num_steps * batch_size

    if run_iter == -1:
        run_iter = num_steps
    else:
        run_iter = steps_per_epoch * run_iter if iter_unit == "epoch" else run_iter

    if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
        idx_filenames = runner_utils.parse_dali_idx_dataset(
            data_idx_dir=self.run_hparams.data_idx_dir, mode="train")

    training_hooks = []

    if hvd.rank() == 0:
        print('Starting Model Training...')
        print("Training Epochs", num_epochs)
        print("Total Steps", num_steps)
        print("Steps per Epoch", steps_per_epoch)
        print("Decay Steps", num_decay_steps)
        print("Weight Decay Factor", weight_decay)
        print("Init Learning Rate", lr_init)
        print("Momentum", momentum)
        print("Num GPUs", num_gpus)
        print("Per-GPU Batch Size", batch_size)

        if is_benchmark:
            self.training_logging_hook = hooks.BenchmarkLoggingHook(
                global_batch_size=global_batch_size, warmup_steps=warmup_steps)
        else:
            self.training_logging_hook = hooks.TrainingLoggingHook(
                global_batch_size=global_batch_size,
                num_steps=num_steps,
                num_samples=num_samples,
                num_epochs=num_epochs,
                steps_per_epoch=steps_per_epoch)

        training_hooks.append(self.training_logging_hook)

    if hvd_utils.is_using_hvd():
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)
        training_hooks.append(bcast_hook)

    training_hooks.append(hooks.PrefillStagingAreasHook())
    training_hooks.append(hooks.TrainingPartitionHook())

    estimator_params = {
        'batch_size': batch_size,
        'steps_per_epoch': steps_per_epoch,
        'num_gpus': num_gpus,
        'momentum': momentum,
        'lr_init': lr_init,
        'lr_warmup_epochs': lr_warmup_epochs,
        'weight_decay': weight_decay,
        'loss_scale': loss_scale,
        'apply_loss_scaling': use_static_loss_scaling,
        'label_smoothing': label_smoothing,
        'mixup': mixup,
        'num_decay_steps': num_decay_steps,
        'use_cosine_lr': use_cosine_lr,
        'use_final_conv': use_final_conv,
        'quantize': quantize,
        'use_qdq': use_qdq,
        'symmetric': symmetric,
        'quant_delay': quant_delay
    }

    if finetune_checkpoint:
        estimator_params['finetune_checkpoint'] = finetune_checkpoint

    image_classifier = self._get_estimator(
        mode='train',
        run_params=estimator_params,
        use_xla=self.run_hparams.use_xla,
        use_dali=self.run_hparams.use_dali,
        gpu_memory_fraction=self.run_hparams.gpu_memory_fraction,
        gpu_id=self.run_hparams.gpu_id)

    def training_data_fn():
        if self.run_hparams.use_dali and self.run_hparams.data_idx_dir is not None:
            if hvd.rank() == 0:
                print("Using DALI input... ")

            return data_utils.get_dali_input_fn(
                filenames=filenames,
                idx_filenames=idx_filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)

        elif self.run_hparams.data_dir is not None:
            return data_utils.get_tfrecords_input_fn(
                filenames=filenames,
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                training=True,
                distort_color=self.run_hparams.distort_colors,
                num_threads=self.run_hparams.num_preprocessing_threads,
                deterministic=False if self.run_hparams.seed is None else True)

        else:
            if hvd.rank() == 0:
                print("Using Synthetic Data ...")
            return data_utils.get_synth_input_fn(
                batch_size=batch_size,
                height=self.run_hparams.height,
                width=self.run_hparams.width,
                num_channels=self.run_hparams.n_channels,
                data_format=self.run_hparams.input_format,
                num_classes=self.run_hparams.n_classes,
                dtype=self.run_hparams.dtype,
            )

    try:
        current_step = image_classifier.get_variable_value("global_step")
    except ValueError:
        current_step = 0

    run_iter = max(0, min(run_iter, num_steps - current_step))
    print("Current step:", current_step)

    if run_iter > 0:
        try:
            image_classifier.train(
                input_fn=training_data_fn,
                steps=run_iter,
                hooks=training_hooks,
            )
        except KeyboardInterrupt:
            print("Keyboard interrupt")

    if hvd.rank() == 0:
        if run_iter > 0:
            print('Ending Model Training ...')
            train_throughput = self.training_logging_hook.mean_throughput.value()
            train_time = self.training_logging_hook.train_time
            dllogger.log(data={'train_throughput': train_throughput}, step=tuple())
            dllogger.log(data={'Total Training time': train_time}, step=tuple())
        else:
            print('Model has already trained the required number of steps. Skipping.')
def is_using_hvd():
    return hvd.size() > 1
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, LoggingArguments, PathArguments)
    )
    (
        model_args,
        data_args,
        train_args,
        log_args,
        path_args,
        remaining_strings,
    ) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    # SageMaker may have some extra strings. TODO: Test this on SM.
    assert len(remaining_strings) == 0, f"The args {remaining_strings} could not be parsed."

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    # Settings init
    parse_bool = lambda arg: arg == "true"
    do_gradient_accumulation = train_args.gradient_accumulation_steps > 1
    do_xla = not parse_bool(train_args.skip_xla)
    do_eager = parse_bool(train_args.eager)
    skip_sop = parse_bool(train_args.skip_sop)
    skip_mlm = parse_bool(train_args.skip_mlm)
    pre_layer_norm = parse_bool(model_args.pre_layer_norm)
    fast_squad = parse_bool(log_args.fast_squad)
    dummy_eval = parse_bool(log_args.dummy_eval)
    is_sagemaker = path_args.filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker
    global max_grad_norm
    max_grad_norm = train_args.max_grad_norm

    # TODO: Change to obfuscate smddpcommon. This code does not use GradientTape,
    # so the bucket size needs to be passed like this.
    if train_args.bucket_cap_mb:
        bucket_cap_bytes = int(train_args.bucket_cap_mb * 1024 * 1024)
    else:
        bucket_cap_bytes = int(64 * 1024 * 1024)
    hc.setBucketSize(bucket_cap_bytes)

    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[smddp.local_rank()], "GPU")

    # XLA, AutoGraph
    tf.config.optimizer.set_jit(do_xla)
    tf.config.experimental_run_functions_eagerly(do_eager)

    if smddp.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "sm" if is_sagemaker else "eks"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""

        if log_args.run_name is None:
            metadata = (
                f"{model_args.model_type}"
                f"-{model_args.model_size}"
                f"-{model_args.load_from}"
                f"-{smddp.size()}gpus"
                f"-{train_args.per_gpu_batch_size * smddp.size() * train_args.gradient_accumulation_steps}globalbatch"
                f"-{train_args.learning_rate}maxlr"
                f"-{train_args.learning_rate_decay_power}power"
                f"-{train_args.optimizer}opt"
                f"-{train_args.total_steps}steps"
                f"-{'preln' if pre_layer_norm else 'postln'}"
                f"{loss_str}"
                f"-{model_args.hidden_dropout_prob}dropout"
            )
            run_name = f"{current_time}-{platform}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"
        else:
            run_name = log_args.run_name

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        if not os.path.exists(path_args.log_dir):
            os.makedirs(path_args.log_dir)
        handlers = [
            logging.FileHandler(
                os.path.join(path_args.filesystem_prefix, path_args.log_dir, f"{run_name}.log")
            ),
            TqdmLoggingHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

        # Check that arguments were passed in properly, only after registering
        # the alert_func and logging
        assert not (skip_sop and skip_mlm), "Cannot use --skip_sop and --skip_mlm"

    wrap_global_functions(do_gradient_accumulation)

    # Create optimizer and enable AMP loss scaling.
    if train_args.optimizer == "lamb":
        optimizer = get_lamb_optimizer(train_args)
    elif train_args.optimizer == "adamw":
        optimizer = get_adamw_optimizer(train_args)

    if _PRE_TF_2_4_0:
        optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer, loss_scale="dynamic"
        )
    else:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    gradient_accumulator = GradientAccumulator()

    loaded_optimizer_weights = None

    model = create_model(model_class=TFAutoModelForPreTraining, model_args=model_args)
    tokenizer = create_tokenizer(model_args.model_type)
    if model_args.load_from == "checkpoint":
        checkpoint_path = os.path.join(path_args.filesystem_prefix, model_args.checkpoint_path)
        model_ckpt, optimizer_ckpt = get_checkpoint_paths_from_prefix(checkpoint_path)
        if smddp.rank() == 0:
            model.load_weights(model_ckpt)
            if model_args.load_optimizer_state == "true":
                loaded_optimizer_weights = np.load(optimizer_ckpt, allow_pickle=True)
            # We do not set the weights yet, we have to do a first step to
            # initialize the optimizer.

    # Train filenames are [1, 2047], Val filenames are [0].
    # Note the different subdirectories
    # Move to same folder structure and remove if/else
    train_glob = os.path.join(path_args.filesystem_prefix, path_args.train_dir, "*.tfrecord")
    validation_glob = os.path.join(path_args.filesystem_prefix, path_args.val_dir, "*.tfrecord")

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_dataset_from_tfrecords(
        model_type=model_args.model_type,
        filenames=train_filenames,
        max_seq_length=data_args.max_seq_length,
        max_predictions_per_seq=data_args.max_predictions_per_seq,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
    )  # Of shape [per_gpu_batch_size, ...]
    # Batch of batches, helpful for gradient accumulation.
    # Shape [grad_steps, per_gpu_batch_size, ...]
    train_dataset = train_dataset.batch(train_args.gradient_accumulation_steps)
    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow
    # allreduce on a subset of ranks
    if smddp.rank() == 0:
        validation_dataset = get_dataset_from_tfrecords(
            model_type=model_args.model_type,
            filenames=validation_filenames,
            max_seq_length=data_args.max_seq_length,
            max_predictions_per_seq=data_args.max_predictions_per_seq,
            per_gpu_batch_size=train_args.per_gpu_batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

        pbar = tqdm.tqdm(total=train_args.total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        logger.info(f"Starting training, job name {run_name}")

    i = 1
    start_time = time.perf_counter()
    train_start_time = time.perf_counter()
    for batch in train_dataset:
        learning_rate = optimizer.learning_rate(step=tf.constant(i, dtype=tf.float32))
        # weight_decay = wd_schedule(step=tf.constant(i, dtype=tf.float32))
        loss_scale = optimizer.loss_scale() if _PRE_TF_2_4_0 else optimizer.loss_scale
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm = train_step(
            model=model,
            optimizer=optimizer,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=train_args.gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't want to wrap broadcast_variables() in a tf.function, can lead
        # to asynchronous errors
        if i == 1:
            if smddp.rank() == 0 and loaded_optimizer_weights is not None:
                optimizer.set_weights(loaded_optimizer_weights)
            print(" RANK {} is broadcasting".format(smddp.rank()))
            # smddp.broadcast_variables(model.variables + optimizer.variables(), root_rank=0)
            smddp.broadcast_variables(model.variables, root_rank=0)
            smddp.broadcast_variables(optimizer.variables(), root_rank=0)
            print(" RANK {} is done broadcasting".format(smddp.rank()))
            i = optimizer.get_weights()[0]

        is_final_step = i >= train_args.total_steps
        do_squad = (log_args.squad_frequency != 0) and (
            (i % log_args.squad_frequency == 0) or is_final_step
        )
        # SQuAD requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            from albert.run_squad import get_squad_results_while_pretraining

            squad_results = get_squad_results_while_pretraining(
                model=model,
                tokenizer=tokenizer,
                model_size=model_args.model_size,
                filesystem_prefix=path_args.filesystem_prefix,
                step=i,
                dataset=data_args.squad_version,
                fast=log_args.fast_squad,
                dummy_eval=log_args.dummy_eval,
            )
            if smddp.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results["f1"]
                logger.info(f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}")
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)
            gc.collect()

        if smddp.rank() == 0:
            do_log = i % log_args.log_frequency == 0
            do_checkpoint = (log_args.checkpoint_frequency != 0) and (
                (i % log_args.checkpoint_frequency == 0) or is_final_step
            )
            do_validation = (log_args.validation_frequency != 0) and (
                (i % log_args.validation_frequency == 0) or is_final_step
            )

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)

            if do_log:
                elapsed_time = time.perf_counter() - start_time
                if i == 1:
                    logger.info(f"First step: {elapsed_time:.3f} secs")
                elif is_final_step:
                    total_time = time.perf_counter() - train_start_time
                    seq_per_sec = (
                        i * train_args.per_gpu_batch_size * smddp.size()
                        * train_args.gradient_accumulation_steps / total_time
                    )
                    logger.info(
                        f"Final step {i}: {description} -- "
                        f"Average seq_per_sec: {seq_per_sec:.2f} -- Total Time: {total_time}"
                    )
                else:
                    it_per_sec = log_args.log_frequency / elapsed_time
                    logger.info(f"Train step {i} -- {description} -- It/s: {it_per_sec:.2f}")
                    start_time = time.perf_counter()

            if do_checkpoint:
                checkpoint_prefix = os.path.join(
                    path_args.filesystem_prefix, path_args.checkpoint_dir, f"{run_name}-step{i}"
                )
                model_ckpt = f"{checkpoint_prefix}.ckpt"
                optimizer_ckpt = f"{checkpoint_prefix}-optimizer.npy"
                logger.info(f"Saving model at {model_ckpt}, optimizer at {optimizer_ckpt}")
                model.save_weights(model_ckpt)
                # model.load_weights(model_ckpt)

                optimizer_weights = optimizer.get_weights()
                np.save(optimizer_ckpt, optimizer_weights)
                # optimizer.set_weights(optimizer_weights)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(path_args.filesystem_prefix, path_args.log_dir, run_name)
                )
                config = {
                    **asdict(model_args),
                    **asdict(data_args),
                    **asdict(train_args),
                    **asdict(log_args),
                    "global_batch_size": train_args.per_gpu_batch_size * smddp.size(),
                }
                if is_wandb_available():
                    wandb.init(config=config, project=model_args.model_type)
                    wandb.run.save()
                    wandb_run_name = wandb.run.name

            train_metrics = {
                "weight_norm": weight_norm,
                "grad_norm": grad_norm,
                "loss_scale": loss_scale,
                "learning_rate": learning_rate,
                "train/loss": loss,
                "train/mlm_loss": mlm_loss,
                "train/mlm_acc": mlm_acc,
                "train/sop_loss": sop_loss,
                "train/sop_acc": sop_acc,
            }
            all_metrics = {**train_metrics}
            if do_validation:
                val_metrics = {
                    "val/loss": val_loss,
                    "val/mlm_loss": val_mlm_loss,
                    "val/mlm_acc": val_mlm_acc,
                    "val/sop_loss": val_sop_loss,
                    "val/sop_acc": val_sop_acc,
                }
                all_metrics = {**all_metrics, **val_metrics}
            if do_squad:
                squad_metrics = {
                    "squad/f1": squad_f1,
                    "squad/exact": squad_exact,
                }
                all_metrics = {**all_metrics, **squad_metrics}

            # Log to TensorBoard
            with summary_writer.as_default():
                for name, val in all_metrics.items():
                    tf.summary.scalar(name, val, step=i)
            # Log to Weights & Biases
            if is_wandb_available():
                wandb.log({"step": i, **all_metrics})

        i += 1
        if is_final_step:
            break

    if smddp.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
def __init__(
        self,
        # ========= Model HParams ========= #
        n_classes=1001,
        architecture='resnet50',
        input_format='NHWC',    # NCHW or NHWC
        compute_format='NCHW',  # NCHW or NHWC
        dtype=tf.float32,       # tf.float32 or tf.float16
        n_channels=3,
        height=224,
        width=224,
        distort_colors=False,
        model_dir=None,
        log_dir=None,
        data_dir=None,
        data_idx_dir=None,
        weight_init="fan_out",
        # ======= Optimization HParams ======== #
        use_xla=False,
        use_tf_amp=False,
        use_dali=False,
        gpu_memory_fraction=1.0,
        gpu_id=0,
        # ======== Debug Flags ======== #
        debug_verbosity=0,
        seed=None):
    if dtype not in [tf.float32, tf.float16]:
        raise ValueError(
            "Unknown dtype received: %s (allowed: `tf.float32` and `tf.float16`)" % dtype)

    if compute_format not in ['NHWC', 'NCHW']:
        raise ValueError(
            "Unknown `compute_format` received: %s (allowed: ['NHWC', 'NCHW'])" % compute_format)

    if input_format not in ['NHWC', 'NCHW']:
        raise ValueError(
            "Unknown `input_format` received: %s (allowed: ['NHWC', 'NCHW'])" % input_format)

    if n_channels not in [1, 3]:
        raise ValueError(
            "Unsupported number of channels: %d (allowed: 1 (grayscale) and 3 (color))"
            % n_channels)

    tf_seed = 2 * (seed + hvd.rank()) if seed is not None else None

    # ============================================
    # Optimization Flags - Do not remove
    # ============================================

    os.environ['CUDA_CACHE_DISABLE'] = '0'
    os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
    # os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = '1' if not hvd_utils.is_using_hvd() else str(hvd.size())
    os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'
    os.environ['TF_ADJUST_HUE_FUSED'] = '1'
    os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
    os.environ['TF_DISABLE_NVTX_RANGES'] = '1'
    os.environ["TF_XLA_FLAGS"] = (
        os.environ.get("TF_XLA_FLAGS", "") + " --tf_xla_enable_lazy_compilation=false")

    # ============================================
    # TF-AMP Setup - Do not remove
    # ============================================

    if dtype == tf.float16:
        if use_tf_amp:
            raise RuntimeError("TF AMP can not be activated for FP16 precision")
    elif use_tf_amp:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1"
    else:
        os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "0"

    # =================================================

    model_hparams = tf.contrib.training.HParams(
        width=width,
        height=height,
        n_channels=n_channels,
        n_classes=n_classes,
        dtype=dtype,
        input_format=input_format,
        compute_format=compute_format,
        distort_colors=distort_colors,
        seed=tf_seed)

    num_preprocessing_threads = 10 if not use_dali else 4

    run_config_performance = tf.contrib.training.HParams(
        num_preprocessing_threads=num_preprocessing_threads,
        use_tf_amp=use_tf_amp,
        use_xla=use_xla,
        use_dali=use_dali,
        gpu_memory_fraction=gpu_memory_fraction,
        gpu_id=gpu_id)

    run_config_additional = tf.contrib.training.HParams(
        model_dir=model_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
        log_dir=log_dir if not hvd_utils.is_using_hvd() or hvd.rank() == 0 else None,
        data_dir=data_dir,
        data_idx_dir=data_idx_dir,
        num_preprocessing_threads=num_preprocessing_threads)

    self.run_hparams = Runner._build_hparams(
        model_hparams, run_config_additional, run_config_performance)

    model_name = architecture
    architecture = resnet.model_architectures[architecture]

    self._model = resnet.ResnetModel(
        model_name=model_name,
        n_classes=model_hparams.n_classes,
        layers_count=architecture["layers"],
        layers_depth=architecture["widths"],
        expansions=architecture["expansions"],
        input_format=model_hparams.input_format,
        compute_format=model_hparams.compute_format,
        dtype=model_hparams.dtype,
        weight_init=weight_init,
        use_dali=use_dali,
        cardinality=architecture['cardinality'] if 'cardinality' in architecture else 1,
        use_se=architecture['use_se'] if 'use_se' in architecture else False,
        se_ratio=architecture['se_ratio'] if 'se_ratio' in architecture else 1)

    if self.run_hparams.seed is not None:
        np.random.seed(self.run_hparams.seed)
        tf.set_random_seed(self.run_hparams.seed)

    self.training_logging_hook = None
    self.eval_logging_hook = None
def build(input_reader_config, batch_size=None, transform_input_data_fn=None,
          multi_gpu=True):
    """Builds a tf.data.Dataset.

    Builds a tf.data.Dataset by applying the `transform_input_data_fn` on all
    records. Applies a padded batch to the resulting dataset.

    Args:
      input_reader_config: An input_reader_pb2.InputReader object.
      batch_size: Batch size. If batch size is None, no batching is performed.
      transform_input_data_fn: Function to apply transformation to all records,
        or None if no extra decoding is required.
      multi_gpu: Whether to shard the dataset across Horovod workers.

    Returns:
      A tf.data.Dataset based on the input_reader_config.

    Raises:
      ValueError: On invalid input reader proto.
      ValueError: If no input paths are specified.
    """
    if not isinstance(input_reader_config, input_reader_pb2.InputReader):
        raise ValueError('input_reader_config not of type '
                         'input_reader_pb2.InputReader.')

    if input_reader_config.WhichOneof('input_reader') == 'tf_record_input_reader':
        config = input_reader_config.tf_record_input_reader
        if not config.input_path:
            raise ValueError('At least one input path must be specified in '
                             '`input_reader_config`.')

        label_map_proto_file = None
        if input_reader_config.HasField('label_map_path'):
            label_map_proto_file = input_reader_config.label_map_path
        decoder = tf_example_decoder.TfExampleDecoder(
            load_instance_masks=input_reader_config.load_instance_masks,
            instance_mask_type=input_reader_config.mask_type,
            label_map_proto_file=label_map_proto_file,
            use_display_name=input_reader_config.use_display_name,
            num_additional_channels=input_reader_config.num_additional_channels)

        def process_fn(value):
            """Sets up tf graph that decodes, transforms and pads input data."""
            processed_tensors = decoder.decode(value)
            if transform_input_data_fn is not None:
                processed_tensors = transform_input_data_fn(processed_tensors)
            return processed_tensors

        dataset = read_dataset(
            functools.partial(tf.data.TFRecordDataset, buffer_size=8 * 1000 * 1000),
            config.input_path[:], input_reader_config)

        if multi_gpu:
            dataset = dataset.shard(hvd.size(), hvd.rank())

        # TODO(rathodv): make batch size a required argument once the old binaries
        # are deleted.
        if batch_size:
            num_parallel_calls = batch_size * input_reader_config.num_parallel_batches
        else:
            num_parallel_calls = input_reader_config.num_parallel_map_calls
        dataset = dataset.map(process_fn, num_parallel_calls=num_parallel_calls)
        if batch_size:
            dataset = dataset.apply(
                tf.contrib.data.batch_and_drop_remainder(batch_size))
        dataset = dataset.prefetch(input_reader_config.num_prefetch_batches)
        return dataset

    raise ValueError('Unsupported input_reader_config.')