def main(args):
    """Train a Keras model with Horovod and evaluate it on the eval split.

    Hyper-parameters, data channel directories and log locations are read
    from ``args``. Every process initialises Horovod and restricts its
    TensorFlow session to the GPU matching its local rank. Rank 0 also
    registers checkpointing, TensorBoard and ``Sync2S3`` callbacks and saves
    the training history at the end.

    Args:
        args: Parsed command-line namespace providing ``epochs``,
            ``learning_rate``, ``batch_size``, ``momentum``, ``weight_decay``,
            ``optimizer``, ``gpu_count``, ``train``, ``validation``, ``eval``,
            ``tensorboard_logs`` and ``output_data_dir``.
    """
    # Optimisation settings.
    num_epochs = args.epochs
    learning_rate = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer_name = args.optimizer

    # SageMaker options.
    gpu_count = args.gpu_count  # read from args but not used below
    train_dir = args.train
    valid_dir = args.validation
    test_dir = args.eval
    s3_tensorboard_dir = args.tensorboard_logs

    hvd.init()
    world_size = hvd.size()

    # One GPU per process: expose only the device matching the local rank.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=session_config))

    train_data = make_batch(train_dir + '/train.tfrecords', batch_size)
    valid_data = make_batch(valid_dir + '/validation.tfrecords', batch_size)
    test_data = make_batch(test_dir + '/eval.tfrecords', batch_size)

    input_shape = (HEIGHT, WIDTH, DEPTH)  # built but not used below

    # Callbacks registered on every worker.
    training_callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
        tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
    ]
    # Checkpoints and TensorBoard logs are written by rank 0 only.
    if hvd.rank() == 0:
        training_callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))
        run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        logdir = args.output_data_dir + '/' + run_stamp
        training_callbacks.append(TensorBoard(log_dir=logdir, profile_batch=0))
        training_callbacks.append(
            Sync2S3(logdir=logdir, s3logdir=s3_tensorboard_dir))

    model = get_model(learning_rate, weight_decay, optimizer_name, momentum, hvd)

    # Per-worker step counts: the global count is divided by the worker count.
    train_steps = (NUM_TRAIN_IMAGES // batch_size) // world_size
    valid_steps = (NUM_VALID_IMAGES // batch_size) // world_size
    fit_history = model.fit(
        x=train_data[0],
        y=train_data[1],
        steps_per_epoch=train_steps,
        validation_data=valid_data,
        validation_steps=valid_steps,
        epochs=num_epochs,
        callbacks=training_callbacks)

    # Evaluation uses the full (undivided) step count on every worker.
    eval_metrics = model.evaluate(test_data[0],
                                  test_data[1],
                                  steps=NUM_TEST_IMAGES // args.batch_size,
                                  verbose=0)
    print('Test loss :', eval_metrics[0])
    print('Test accuracy:', eval_metrics[1])

    if hvd.rank() == 0:
        save_history(args.output_data_dir + "/hvd_history.p", fit_history)
def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
        dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
        A TensorFlow dataset outputting batched images and labels.
    """
    # This can help resolve OOM issues when using only 1 GPU for training
    options = tf.data.Options()
    options.experimental_optimization.map_parallelization = (
        not self.disable_map_parallelization)
    dataset = dataset.with_options(options)

    if self._num_gpus > 1:
        # For multi-host training, we want each host to always process the
        # same subset of files. Each host only sees a subset of the entire
        # dataset, allowing us to cache larger datasets in memory.
        dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
        # Shuffle the input files. tf.data transformations return a new
        # dataset rather than mutating in place, so the result has to be
        # re-assigned; previously it was dropped and the file order was
        # never shuffled.
        dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
        dataset = dataset.repeat()

    # Read the data from disk in parallel
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self._shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel
    preprocess = self.parse_record
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
        # The batch size of the dataset will be multiplied by the number of
        # replicas automatically when strategy.distribute_datasets_from_function
        # is called, so we use local batch size here.
        dataset = dataset.batch(self.local_batch_size,
                                drop_remainder=self.is_training)
    else:
        dataset = dataset.batch(self.global_batch_size,
                                drop_remainder=self.is_training)

    # apply Mixup/CutMix only during training, if requested in the data pipeline,
    # otherwise they will be applied in the model module on device
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    cutmix_alpha = self.cutmix_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(mixing, self.local_batch_size, mixup_alpha,
                          cutmix_alpha, self.defer_img_mixing),
        num_parallel_calls=64)

    # Assign static batch size dimension
    # dataset = dataset.map(
    #     functools.partial(self.set_shapes, batch_size),
    #     num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Prefetch overlaps in-feed with training
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
# Wrap the existing optimizer in Horovod's distributed optimizer.
opt = hvd.DistributedOptimizer(opt)

model.compile(loss=keras.losses.sparse_categorical_crossentropy,
              optimizer=opt,
              metrics=['accuracy'])

callbacks = [
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    # Project-defined callback configured with a 0.05 threshold; its behaviour
    # is not visible in this part of the file.
    MyThresholdCallback(threshold=0.05)
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
if hvd.rank() == 0:
    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

# Time the whole fit call (wall clock, includes validation).
start = time()
# NOTE(review): Keras documents that `batch_size` should not be passed for
# generator / Sequence / Dataset inputs - confirm the type of
# `train_generator`.
model.fit(train_generator,
          epochs=epochs,
          verbose=1,
          batch_size=batch_size,
          callbacks=callbacks,
          validation_data=val_generator)
print(f"Total training time: {time() - start} seconds")

# Evaluated on every worker (no rank check here).
score = model.evaluate(test_generator, verbose=0)
def test_gradient_aggregation(self):
    """Check local gradient aggregation of `hvd.DistributedOptimizer`.

    Each worker feeds its own rank as the gradient for ten steps, with
    `backward_passes_per_step=4` and `average_aggregated_gradients=True`.
    After every step the variable value and the optimizer's iteration
    counter are compared against values computed in plain Python.
    """
    class TestingOptimizer(optimizer_v2.OptimizerV2):
        """
        Custom optimizer we use for testing gradient aggregation.

        Its dense update simply adds the gradient to the variable.
        """
        def get_config(self):
            config = super(TestingOptimizer, self).get_config()
            return config

        def _create_slots(self, var_list):
            # Only needed for TF < 2.2.
            pass

        def _resource_apply_dense(self, grad, var, apply_state=None):
            return var.assign_add(grad)

    backward_passes_per_step = 4
    hvd_optimizer = hvd.DistributedOptimizer(
        optimizer=TestingOptimizer("test"),
        backward_passes_per_step=backward_passes_per_step,
        average_aggregated_gradients=True,
    )
    # Touch `iterations` before the tf.function below is traced.
    # NOTE(review): presumably forces creation of the iterations variable
    # outside of tf.function - confirm.
    _ = hvd_optimizer.iterations

    def compute_expected_value(batch_id):
        # Expected variable value after `batch_id + 1` calls: the update
        # contributed by one completed aggregation window, multiplied by the
        # number of windows completed so far.
        sum_per_aggregation = 0.0
        for _ in range(backward_passes_per_step):
            grads_for_batch = 0.0
            for rank in range(hvd.size()):
                grads_for_batch += rank

            # Apply `average_aggregated_gradients`.
            grads_for_batch /= float(backward_passes_per_step)

            # Averages across workers.
            sum_per_aggregation += grads_for_batch / float(hvd.size())

        aggregations_completed = math.floor(
            (batch_id + 1) / backward_passes_per_step)
        return aggregations_completed * sum_per_aggregation

    @tf.function
    def apply_gradients_in_tf_function(gradient_updates, model_variables,
                                       **kwargs):
        # Apply gradient updates in tf.function to reproduce how it is
        # done inside `model.fit()`.
        hvd_optimizer.apply_gradients(
            zip(gradient_updates, model_variables), **kwargs)

    # Every worker contributes its own rank as the (constant) gradient.
    gradients = [tf.constant([float(hvd.rank())])]
    variables = [tf.Variable([0.0])]
    for idx in range(10):
        if _PRE_TF_2_2_0:
            updated_gradients = hvd_optimizer._allreduce(
                gradients, variables)
            apply_gradients_in_tf_function(updated_gradients, variables)
        elif _PRE_TF_2_4_0:
            # In 2.2 and 2.3 the horovod optimizer sets `_HAS_AGGREGATE_GRAD = True`.
            # This configures tf.keras to call `_aggregate_gradients()` outside of
            # `apply_gradients()` and to set `experimental_aggregate_gradients` to
            # False when calling `apply_gradients()` to prevent it from calling
            # `_aggregate_gradients()` again.
            updated_gradients = hvd_optimizer._aggregate_gradients(
                zip(gradients, variables))
            apply_gradients_in_tf_function(
                updated_gradients,
                variables,
                experimental_aggregate_gradients=False)
        else:
            raise RuntimeError("This test should be skipped ...")

        # Exact float comparison against the Python-side expectation.
        updated_variable_value = variables[0][0].numpy()
        assert updated_variable_value == compute_expected_value(idx)
        # The iteration counter advances on every call, including the
        # calls that only accumulate gradients.
        assert idx + 1 == hvd_optimizer.iterations.numpy()
from tensorflow.keras.preprocessing.image import (ImageDataGenerator,
                                                  array_to_img,
                                                  img_to_array, load_img)
from tensorflow.keras import applications, optimizers
from tensorflow.keras.callbacks import TensorBoard

import numpy as np

# Horovod: import
import horovod.tensorflow.keras as hvd

# Horovod: initialize Horovod
hvd.init()

# Report library versions and the worker count once, from rank 0 only.
if hvd.rank() == 0:
    print('Using Tensorflow version:', tf.__version__,
          'Keras version:', tf.keras.__version__,
          'backend:', tf.keras.backend.backend())
    print('Using Horovod with', hvd.size(), 'workers')

# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    # Enable memory growth on each listed GPU.
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    # Indexing by local rank assumes at least as many GPUs as local
    # processes; otherwise this raises IndexError.
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

# ## Data
#
# The training dataset consists of 2000 images of dogs and cats, split
def image_set(filenames, batch_size, height, width, training=False,
              distort_color=False, num_threads=10, nsummary=10,
              deterministic=False, use_dali=None, idx_filenames=None):
    """Create the image input pipeline, either DALI-based or tf.data-based.

    Args:
        filenames: TFRecord file names to read.
        batch_size: Number of examples per batch.
        height: Passed to the preprocessing step as the image height.
        width: Passed to the preprocessing step as the image width.
        training: When truthy, the file list is sharded across Horovod
            workers, records are cached, shuffled and repeated, and random
            cropping is enabled in preprocessing.
        distort_color: Forwarded to the per-record preprocessing function.
        num_threads: Parallelism for preprocessing (also forwarded to DALI).
        nsummary: Accepted for interface compatibility; unused here.
        deterministic: Forwarded to the preprocessing step.
        use_dali: Falsy for the tf.data path; otherwise a DALI preprocessor
            is built, on CPU when the value equals ``'CPU'``.
        idx_filenames: DALI index files; required when ``use_dali`` is set.

    Returns:
        A ``DALIPreprocessor`` when ``use_dali`` is truthy, otherwise a
        batched and prefetched ``tf.data.Dataset``.

    Raises:
        ValueError: If ``use_dali`` is set but ``idx_filenames`` is ``None``.
    """
    if use_dali:
        if idx_filenames is None:
            raise ValueError("Must provide idx_filenames if Dali is enabled")
        return DALIPreprocessor(
            filenames,
            idx_filenames,
            height, width,
            batch_size,
            num_threads,
            dali_cpu=use_dali == 'CPU',
            deterministic=deterministic,
            training=training)

    shuffle_buffer_size = 10000
    num_readers = 10

    records = tf.data.Dataset.from_tensor_slices(filenames)

    # Sharding has to happen before any randomizing operation.
    if training:
        records = records.shard(hvd.size(), hvd.rank())

    # Read up to `num_readers` files at once and interleave their records.
    records = records.interleave(tf.data.TFRecordDataset,
                                 cycle_length=num_readers)

    if training:
        # Caching improves training performance when the data lives in
        # remote storage and fits into worker memory.
        records = records.cache()
        # Shuffle before repeating so that epoch boundaries are respected.
        records = records.shuffle(shuffle_buffer_size)
        records = records.repeat()

    # A fixed thread count is used here; AUTOTUNE can give better perf for
    # non-horovod cases.
    images = records.map(
        lambda record: _parse_and_preprocess_image_record(
            record, height, width,
            deterministic=deterministic, random_crop=training,
            distort_color=distort_color),
        num_parallel_calls=num_threads)

    batches = images.batch(batch_size, drop_remainder=True)
    batches = batches.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    slack_options = tf.data.Options()
    slack_options.experimental_slack = True
    return batches.with_options(slack_options)
def run(self):
    """ Training a neural model.

    Step 1: Fetch training data.
    Step 2: Create the training (Keras) model and attach the loss.
    Step 3: Restore checkpoint/pretrain model if exists, build the
        learning-rate schedule and optimizer, compile the model.
    Step 4: Build the training callbacks.
    Step 5: TRAIN!!!
    """
    # `hvd` is only bound when a horovod-like backend is configured; every
    # later use is guarded by a check on `self._hvd_backend`.
    if self._hvd_backend == "horovod":
        import horovod.tensorflow.keras as hvd
    elif self._hvd_backend == "byteps":
        import byteps.tensorflow.keras as hvd

    tfds = training_utils.build_datasets(compat.ModeKeys.TRAIN, self.strategy,
                                         self.custom_dataset, self.task)
    if isinstance(self.custom_dataset, MultipleDataset):
        # `tfds` is a mapping here: chain all its datasets into one.
        _tfds = None
        for _, ds in tfds.items():
            if _tfds is None:
                _tfds = ds
            else:
                _tfds = _tfds.concatenate(ds)
        tfds = _tfds
    tfds = tfds.prefetch(tf.data.experimental.AUTOTUNE)
    # Step 1: create a model
    with training_utils.get_strategy_scope(self.strategy):
        inps = self.task.create_inputs(compat.ModeKeys.TRAIN)
        formatted_inps = self.task.example_to_input(inps, compat.ModeKeys.TRAIN)
        model_out = self.model(formatted_inps, is_training=True)
        for metric_layer in self.task.build_metric_layer():
            model_out = metric_layer([formatted_inps, model_out])
        # Gradient accumulation / clipping models are only used for
        # 2.3 <= TF < 2.5; other versions fall back to a plain Keras model.
        if (LooseVersion(tf.__version__) < LooseVersion("2.3")
                or LooseVersion(tf.__version__) >= LooseVersion("2.5")):
            logging.info(f"Warning: Need further check on AccumgradKerasModel when TF version={tf.__version__}. "
                         f"Here we ignore update_cycle={self._update_cycle}, "
                         f"clip_value={self._clip_value}, clip_norm={self._clip_norm}.")
            keras_model = tf.keras.Model(inps, model_out)
        elif compat.IS_PREV_TF_2_4_0:
            from neurst.training.gradaccum_keras_model import TF23GradAccumKerasModel
            keras_model = TF23GradAccumKerasModel(inps, model_out,
                                                  update_cycle=self._update_cycle,
                                                  clip_value=self._clip_value,
                                                  clip_norm=self._clip_norm,
                                                  freeze_variables=self._freeze_variables)
        else:
            keras_model = GradAccumKerasModel(inps, model_out,
                                              update_cycle=self._update_cycle,
                                              clip_value=self._clip_value,
                                              clip_norm=self._clip_norm,
                                              freeze_variables=self._freeze_variables)

        # The criterion may return a tensor, a list/tuple, or a dict of named
        # losses; dict entries are also registered as "<name>_mean" metrics.
        loss = self._criterion.reduce_loss(formatted_inps, model_out)
        if compat.is_tf_tensor(loss) or isinstance(loss, (list, tuple)):
            keras_model.add_loss(loss)
        elif isinstance(loss, dict):
            for _name, _loss in loss.items():
                keras_model.add_loss(_loss)
                keras_model.add_metric(_loss, name=_name + "_mean", aggregation="mean")
        else:
            raise ValueError("criterion.reduce_loss returns "
                             "unsupported value of type: {}".format(type(loss)))
        self._restore_ckpt_or_pretrain()
        self._lr_schedule = build_lr_schedule(self._lr_schedule_args)
        if self._pruning_schedule is not None:
            self._optimizer = create_pruning_optimizer(self._optimizer, self.model, self._pruning_schedule,
                                                       pruning_variable_pattern=self._pruning_variable_pattern,
                                                       nopruning_variable_pattern=self._nopruning_variable_pattern,
                                                       keep_prune_property=True)
        self._optimizer = training_utils.handle_fp16_and_distributed_optimizer(
            self._optimizer, self._lr_schedule, self._hvd_backend)
        if self._hvd_backend is None:
            keras_model.compile(self._optimizer)
        else:
            # NOTE: we already add Horovod DistributedOptimizer in `_handle_fp16_and_distributed_optimizer`.
            # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
            # uses hvd.DistributedOptimizer() to compute gradients.
            keras_model.compile(self._optimizer, experimental_run_tf_function=False)
        keras_model.summary()
        summary_model_variables(self.model, self._freeze_variables)
    # initialize the checkpoint manager
    _ = compat.get_saver_or_default(self.model, self.model_dir,
                                    max_to_keep=self._checkpoints_max_to_keep)
    # build the training callbacks
    if not self._tb_log_dir:
        self._tb_log_dir = os.path.join(self.model_dir, "train")

    training_callbacks = [MetricReductionCallback(self.strategy, self._summary_steps, self._tb_log_dir,
                                                  device="GPU:0", lr_schedule=self._lr_schedule)]
    # Checkpointing and validation run on a single process: always when no
    # horovod-like backend is used, otherwise on rank 0 only.
    if self._hvd_backend is None or hvd.rank() == 0:
        training_callbacks.append(
            CustomCheckpointCallback(self.task.model_configs(self.model),
                                     save_checkpoint_steps=self._save_checkpoint_steps))
        if self._validator is not None:
            training_callbacks.append(self._validator.build(self.strategy, self.task, self.model))
    if self._hvd_backend is not None:
        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard or other metrics-based training.
        # NOTE!!! HERE we already integrate the metric averaging behaviour into the MetricReductionCallback.
        # training_callbacks.insert(0, hvd.callbacks.MetricAverageCallback(device="GPU:0"))

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        training_callbacks.insert(0, hvd.callbacks.BroadcastGlobalVariablesCallback(0, device="GPU:0"))
    if self._lr_schedule is not None:
        training_callbacks.append(LearningRateScheduler(self._lr_schedule))

    if self._experimental_count_batch_num:
        # Optional full pass over the dataset just to log its batch count.
        logging.info("Scanning the dataset......")
        iterator = iter(training_utils.maybe_distribution_dataset(self.strategy, tfds))
        cnt = 0
        for _ in iterator:
            cnt += 1
        logging.info(f"Total {cnt} batches per EPOCH.")

    # The whole run is one Keras "epoch" of `self._train_steps` steps over
    # the endlessly repeated dataset.
    history = keras_model.fit(
        map_data_for_keras(tfds.repeat()),
        initial_epoch=0,
        epochs=1,
        steps_per_epoch=self._train_steps,  # * args["update_cycle"],
        verbose=2,
        callbacks=training_callbacks)
    logging.info(history.history)
import tensorflow as tf
# Eager execution is switched off right after importing TensorFlow, before
# the project modules below are imported.
tf.compat.v1.disable_eager_execution()

from bert.dataset import create_masked_input_dataset
from bert.layers import (PositionEmbedding, Attention, Transformer,
                         TokenEmbedding, Bias, gelu,
                         masked_sparse_cross_entropy_loss,
                         InverseSquareRootSchedule, initializer, Projection)

import horovod.tensorflow.keras as hvd

# Horovod: initialize Horovod.
hvd.init()

# Print runtime config on head node
if hvd.rank() == 0:
    print(arguments)

# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    # Indexing by local rank assumes at least as many GPUs as local
    # processes; otherwise this raises IndexError.
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

# import tensorflow_addons as tfa
from tensorflow.keras import layers

# Model size constants.
# NOTE(review): 22 looks like an amino-acid style vocabulary - confirm
# against the dataset module.
vocab_size = 22
max_seq_len = 1024
def main(args):
    """Train the ResNet Keras model with Horovod and evaluate it.

    Settings are read from ``args``. Every process initialises Horovod and
    binds its TensorFlow session to the GPU matching its local rank. Training
    and validation step counts are divided by the number of workers. Rank 0
    is the only process that checkpoints, writes TensorBoard logs (synced via
    ``Sync2S3``), shows fit progress, and saves the training history.

    Args:
        args: Parsed command-line namespace providing ``epochs``,
            ``learning_rate``, ``batch_size``, ``momentum``, ``weight_decay``,
            ``optimizer``, ``gpu_count``, ``train``, ``validation``, ``eval``,
            ``tensorboard_logs`` and ``output_data_dir``.
    """
    # Optimisation settings.
    num_epochs = args.epochs
    base_lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer_name = args.optimizer

    # SageMaker options.
    gpu_count = args.gpu_count  # read from args but not used below
    train_dir = args.train
    valid_dir = args.validation
    test_dir = args.eval
    s3_tensorboard_dir = args.tensorboard_logs

    # Initialise Horovod and look up the cluster size.
    hvd.init()
    num_workers = hvd.size()

    # Pin this process to the single GPU matching its local rank.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=session_config))

    train_data = get_dataset(train_dir + '/train.tfrecords', batch_size)
    valid_data = get_dataset(valid_dir + '/validation.tfrecords', batch_size)
    test_data = get_dataset(test_dir + '/eval.tfrecords', batch_size)

    image_shape = (HEIGHT, WIDTH, DEPTH)

    # Callbacks shared by all workers: state broadcast, metric averaging,
    # learning-rate warm-up and plateau-based learning-rate reduction.
    fit_callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
        tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
    ]
    # Only the first worker (rank 0) saves checkpoints and logs.
    if hvd.rank() == 0:
        fit_callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))
        run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
        logdir = args.output_data_dir + '/' + run_stamp
        fit_callbacks.append(TensorBoard(log_dir=logdir))
        fit_callbacks.append(Sync2S3(logdir=logdir, s3logdir=s3_tensorboard_dir))

    model = get_resnet_model(image_shape, base_lr, weight_decay,
                             optimizer_name, momentum, hvd)

    # Each worker runs 1/num_workers of the global number of steps.
    steps_train = (NUM_TRAIN_IMAGES // batch_size) // num_workers
    steps_valid = (NUM_VALID_IMAGES // batch_size) // num_workers
    progress = 1 if hvd.rank() == 0 else 0
    fit_history = model.fit(
        train_data,
        steps_per_epoch=steps_train,
        validation_data=valid_data,
        validation_steps=steps_valid,
        verbose=progress,
        epochs=num_epochs,
        callbacks=fit_callbacks)

    # Evaluation uses the full (undivided) step count on every worker.
    eval_metrics = model.evaluate(test_data,
                                  steps=NUM_TEST_IMAGES // args.batch_size,
                                  verbose=0)
    print('Test loss :', eval_metrics[0])
    print('Test accuracy:', eval_metrics[1])

    if hvd.rank() == 0:
        save_history(args.output_data_dir + "/hvd_history.p", fit_history)
def generate_cae(zero_pad_train, zero_pad_test, preproc, dim_1, dim_2, train_mode, hvd_mode=False):
    """Build the convolutional autoencoder, optionally train it, and save latents.

    The training and test arrays are shuffled along their first axis and the
    last five shuffled training samples are held out for validation. When
    `train_mode` is truthy the best weights are loaded from disk, the
    original sample order is restored and `save_latent_space` is called.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-mangled source - confirm it against the original file.

    Args:
        zero_pad_train: Training array, indexed along axis 0.
        zero_pad_test: Test array, indexed along axis 0.
        preproc: Forwarded unchanged to `save_latent_space`.
        dim_1: Forwarded unchanged to `save_latent_space`.
        dim_2: Forwarded unchanged to `save_latent_space`.
        train_mode: When falsy, the model is only built and compiled.
        hvd_mode: When truthy, the optimizer is wrapped for Horovod. In that
            mode the `model.fit` call is commented out, so nothing is trained.

    Returns:
        The compiled Keras model.
    """
    # Shuffle
    idx_train = np.arange(np.shape(zero_pad_train)[0])
    np.random.shuffle(idx_train)
    zero_pad_train = zero_pad_train[idx_train]

    # Just keeping a few aside for validation - due to memory limitations
    zero_pad_valid = zero_pad_train[-5:]
    zero_pad_train = zero_pad_train[:-5]

    idx_test = np.arange(np.shape(zero_pad_test)[0])
    np.random.shuffle(idx_test)
    zero_pad_test = zero_pad_test[idx_test]

    # CNN training stuff
    weights_filepath = "../CAE_Training/cae_best_weights.h5"
    lrate = 0.001

    # Get CAE model
    model, encoder, _ = cae_model()

    # design network
    my_adam = optimizers.Adam(lr=lrate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    if hvd_mode:
        my_adam = hvd.DistributedOptimizer(my_adam)

    earlystopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=0, mode='auto', baseline=None, restore_best_weights=False)
    callbacks_list = [earlystopping]

    if hvd_mode:
        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),

            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),

            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3, verbose=1),
        ]

        # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
        # NOTE(review): as written, the Horovod callbacks above are also only
        # added on rank 0. This has no effect while the hvd fit call below
        # stays commented out, but should be revisited before re-enabling it.
        if hvd.rank() == 0:
            callbacks_list = callbacks + callbacks_list
            checkpoint = ModelCheckpoint(weights_filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min', save_weights_only=True)
            callbacks_list.append(checkpoint)

        # Horovod: write logs on worker 0.
        # (`verbose` is not read anywhere below.)
        verbose = 1 if hvd.rank() == 0 else 0

        model.compile(optimizer=my_adam, loss='mean_squared_error', metrics=[coeff_determination], experimental_run_tf_function=False)
    else:
        model.compile(optimizer=my_adam, loss='mean_squared_error', metrics=[coeff_determination])
        checkpoint = ModelCheckpoint(weights_filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min', save_weights_only=True)
        callbacks_list.append(checkpoint)

    model.summary()

    num_epochs = num_epochs_space

    # fit network
    if train_mode:
        if hvd_mode:
            # train_history = model.fit(x=zero_pad_train, y=zero_pad_train, epochs=num_epochs, callbacks=callbacks_list, batch_size=batchsize_space,\
            #                           validation_data=(zero_pad_valid,zero_pad_valid))

            # No training happens in this branch: rank 0 only loads weights
            # already present on disk and exports the latent space.
            if hvd.rank() == 0:
                model.load_weights(weights_filepath)

                # argsort of the shuffle permutation == its inverse, used to
                # put the samples back into their original order
                idx_train = sorted(range(len(idx_train)), key=lambda k: idx_train[k])
                idx_test = sorted(range(len(idx_test)), key=lambda k: idx_test[k])

                # Rejoin train and valid
                zero_pad_train = np.concatenate(
                    (zero_pad_train, zero_pad_valid), axis=0)
                zero_pad_train = zero_pad_train[idx_train]
                zero_pad_test = zero_pad_test[idx_test]

                # Call to save latent space representation
                save_latent_space(model, encoder, zero_pad_train, zero_pad_test, preproc, dim_1, dim_2)
        else:
            # Autoencoder training: the input is also the target.
            train_history = model.fit(x=zero_pad_train, y=zero_pad_train,epochs=num_epochs, callbacks=callbacks_list, batch_size=batchsize_space,\
                                      validation_data=(zero_pad_valid,zero_pad_valid))

            # Reload the best (lowest val_loss) weights saved by the checkpoint.
            model.load_weights(weights_filepath)

            # argsort of the shuffle permutation == its inverse, used to put
            # the samples back into their original order
            idx_train = sorted(range(len(idx_train)), key=lambda k: idx_train[k])
            idx_test = sorted(range(len(idx_test)), key=lambda k: idx_test[k])

            # Rejoin train and valid
            zero_pad_train = np.concatenate((zero_pad_train, zero_pad_valid), axis=0)
            zero_pad_train = zero_pad_train[idx_train]
            zero_pad_test = zero_pad_test[idx_test]

            # Call to save latent space representation
            save_latent_space(model, encoder, zero_pad_train, zero_pad_test, preproc, dim_1, dim_2)

    return model
help='Integer. Number of epochs to train the model.',
                        default=3)
    parser.add_argument('--steps_per_epoch', metavar='STEPS', type=int,
                        help='Total number of steps (batches of samples)',
                        default=1000)
    return parser.parse_args()


def main(epochs, steps_per_epoch, hvd_rank=0, hvd_size=1):
    """Train the model, evaluate it on the test set and save it on rank 0.

    Args:
        epochs: Number of epochs, forwarded to `train`.
        steps_per_epoch: Steps per epoch, forwarded to `train`.
        hvd_rank: Rank of this process; only rank 0 saves the model.
        hvd_size: Total number of processes, forwarded to the dataset
            builder and to `train`.
    """
    model = define_model()
    dataset = get_train_dataset(hvd_rank=hvd_rank, hvd_size=hvd_size)
    trained_model = train(model, dataset, epochs, steps_per_epoch, hvd_rank,
                          hvd_size)
    # Evaluation runs on every process, not only on rank 0.
    test_images, test_labels = get_test_dataset()
    trained_model.evaluate(test_images, test_labels, verbose=1)
    # Only rank 0 writes the saved model.
    if hvd_rank == 0:
        trained_model.save('result')


if __name__ == '__main__':
    args = parse_args()
    # Horovod is initialised here so `main` itself stays free of direct
    # Horovod calls and receives rank/size as plain integers.
    hvd.init()
    main(
        epochs=args.epochs,
        steps_per_epoch=args.steps_per_epoch,
        hvd_rank=hvd.rank(),
        hvd_size=hvd.size(),
    )
def train_fn(compute_config: TfDataServiceConfig, reuse_dataset: bool = False, round_robin: bool = False):
    """Train an MNIST classifier with Horovod, reading data via tf.data service.

    Args:
        compute_config: Configuration handed to ``tf_data_service`` to obtain
            the dispatcher address.
        reuse_dataset: When true, the job name ``'job'`` is passed to the
            data service; otherwise no job name is given.
        round_robin: When true, this worker's rank and the worker count are
            passed as ``consumer_index`` / ``num_consumers``.
    """
    hvd.init()

    # One GPU per process: enable memory growth everywhere, then expose only
    # the device matching the local rank.
    local_gpus = tf.config.experimental.list_physical_devices('GPU')
    for device in local_gpus:
        tf.config.experimental.set_memory_growth(device, True)
    if local_gpus:
        tf.config.experimental.set_visible_devices(local_gpus[hvd.local_rank()], 'GPU')

    with tf_data_service(compute_config, hvd.rank()) as dispatcher_address:
        # The file lock ensures only one training task downloads the dataset.
        with FileLock(os.path.expanduser("~/.horovod_lock")):
            (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()

        # Add a trailing channel axis and divide pixel values by 255.
        images = tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32)
        labels = tf.cast(mnist_labels, tf.int64)
        dataset = tf.data.Dataset.from_tensor_slices((images, labels))

        # Everything up to `distribute` is handed to the tf.data service.
        dataset = dataset.repeat()
        dataset = dataset.shuffle(10000)
        dataset = dataset.batch(128)
        to_data_service = tf.data.experimental.service.distribute(
            service=dispatcher_address,
            processing_mode="distributed_epoch",
            job_name='job' if reuse_dataset else None,
            consumer_index=hvd.rank() if round_robin else None,
            num_consumers=hvd.size() if round_robin else None)
        dataset = dataset.apply(to_data_service)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        layers = [
            tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
            tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(10, activation='softmax'),
        ]
        mnist_model = tf.keras.Sequential(layers)

        # The learning rate is scaled with the number of workers.
        scaled_lr = 0.001 * hvd.size()
        base_optimizer = tf.optimizers.Adam(scaled_lr)
        distributed_optimizer = hvd.DistributedOptimizer(
            base_optimizer,
            backward_passes_per_step=1,
            average_aggregated_gradients=True)

        # `experimental_run_tf_function=False` makes TensorFlow use
        # hvd.DistributedOptimizer() to compute gradients.
        mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                            optimizer=distributed_optimizer,
                            metrics=['accuracy'],
                            experimental_run_tf_function=False)

        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.LearningRateWarmupCallback(
                initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
        ]

        # Checkpoints are written by worker 0 only, so that other workers
        # cannot corrupt them.
        if hvd.rank() == 0:
            callbacks.append(
                tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

        # Progress output on worker 0 only.
        verbose = 1 if hvd.rank() == 0 else 0

        # The number of steps per epoch shrinks with the number of workers.
        steps = 32 // hvd.size()
        mnist_model.fit(dataset,
                        steps_per_epoch=steps,
                        callbacks=callbacks,
                        epochs=24,
                        verbose=verbose)
# Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. hvd.callbacks.BroadcastGlobalVariablesCallback(0), # Horovod: average metrics among workers at the end of every epoch. # # Note: This callback must be in the list before the ReduceLROnPlateau, # TensorBoard or other metrics-based callbacks. hvd.callbacks.MetricAverageCallback(), # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during # the first three epochs. See https://arxiv.org/abs/1706.02677 for details. # hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=1, initial_lr=scaled_lr, verbose=1), ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. # if hvd.rank() == 0: # callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) # Horovod: write logs on worker 0. verbose = 1 if hvd.rank() == 0 else 0 validation_data = (test_images, test_labels) if hvd.rank() == 0 else None validation_data = None # Train the model. # Horovod: adjust number of steps based on number of GPUs. # steps_per_epoch=500 // hvd.size(), model.fit(train_images, train_labels, validation_data=validation_data, epochs=3, batch_size=batch_size, \ steps_per_epoch=barches_total // hvd.size(), callbacks=callbacks, verbose=verbose)
def step(self, data_creator, epochs=1, verbose=1, callbacks=None, validation_data_creator=None, class_weight=None, steps_per_epoch=None, validation_steps=None, validation_freq=1, data_config=None):
    """Run `epochs` more training epochs and report the final metrics.

    Args:
        data_creator: Callable taking the config dict and returning the
            training dataset.
        epochs: Number of additional epochs to train.
        verbose: Keras verbosity; set to 0 on non-chief workers for the
            "horovod" and "tf-distributed" backends.
        callbacks: Optional list of extra Keras callbacks.
        validation_data_creator: Optional callable taking the config dict
            and returning the validation dataset.
        class_weight: Forwarded to `model.fit`.
        steps_per_epoch: Forwarded to `model.fit`.
        validation_steps: Forwarded to `model.fit`.
        validation_freq: Forwarded to `model.fit`.
        data_config: Optional overrides merged into a copy of `self.config`.

    Returns:
        A dict mapping "train_<metric>" to the metric's last recorded value,
        or an empty dict when `model.fit` returns None.
    """
    # Work on a copy so per-call overrides never leak into self.config.
    config = self.config.copy()
    if data_config is not None:
        config.update(data_config)

    def _create_datasets():
        # Build the training dataset first, then the optional validation one.
        train = data_creator(config)
        valid = (validation_data_creator(config)
                 if validation_data_creator is not None else None)
        return train, valid

    backend = self.backend
    if backend == "horovod":
        import horovod.tensorflow.keras as hvd

        # The configured batch size is split evenly across workers.
        assert "batch_size" in config, "batch_size must be set in config"
        config["batch_size"] = config["batch_size"] // hvd.size()
        train_dataset, test_dataset = _create_datasets()

        # Each worker gets its own shard of the data.
        from tensorflow.python.distribute.input_ops import auto_shard_dataset
        train_dataset = auto_shard_dataset(train_dataset,
                                           hvd.size(), hvd.rank())
        if test_dataset is not None:
            test_dataset = auto_shard_dataset(test_dataset,
                                              hvd.size(), hvd.rank())

        # Horovod callbacks always come first in the callback list.
        hvd_callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.MetricAverageCallback(),
        ]
        if hvd.rank() != 0:
            verbose = 0
        callbacks = (hvd_callbacks + callbacks
                     if callbacks is not None else hvd_callbacks)
    elif backend == "tf-distributed":
        with self.strategy.scope():
            train_dataset, test_dataset = _create_datasets()
        if self.strategy.cluster_resolver.task_id != 0:
            verbose = 0
    else:
        train_dataset, test_dataset = _create_datasets()

    # Resume from the epochs already completed by earlier calls.
    history = self.model.fit(train_dataset,
                             epochs=self.epoch + epochs,
                             verbose=verbose,
                             callbacks=callbacks,
                             validation_data=test_dataset,
                             class_weight=class_weight,
                             initial_epoch=self.epoch,
                             steps_per_epoch=steps_per_epoch,
                             validation_steps=validation_steps,
                             validation_freq=validation_freq)

    stats = {}
    if history is not None:
        for metric, values in history.history.items():
            stats["train_" + metric] = values[-1]

    self.epoch += epochs
    return stats
def on_epoch_end(self, epoch, logs=None):
    """Print how long the finished epoch took, on the rank-0 worker only.

    The duration is measured against `self.epoch_start`, which is set
    outside this method.
    """
    elapsed = time.time() - self.epoch_start
    if hvd.rank() != 0:
        return
    print("epoch: %d time_taken: %.1f" % (epoch, elapsed))
def run(config):
    """Build, train and score one model described by ``config`` with Horovod.

    Args:
        config: Dict-like run description. Keys read here: ``"seed"``,
            ``"hyperparameters"`` (optional ``"callbacks"`` sub-dict),
            ``"id"`` (only for the ModelCheckpoint file name) and
            ``"objective"``.

    Returns:
        The objective computed from the training history, clamped from
        below at -10, or -1 when the model could not be created.
    """
    seed = config["seed"]
    if seed is not None:
        np.random.seed(seed)
        if tf.__version__ == "1.13.1":
            tf.random.set_random_seed(seed)
        else:
            tf.compat.v2.random.set_seed(seed)

    load_config(config)
    input_shape, output_shape = setup_data(config)
    search_space = setup_search_space(config,
                                      input_shape,
                                      output_shape,
                                      seed=seed)

    # Initialize Horovod
    hvd.init()

    try:
        model = search_space.create_model()
    except Exception:
        # Catch ordinary failures only, so KeyboardInterrupt / SystemExit
        # still propagate instead of being scored as a bad model.
        logger.info("Error: Model creation failed...")
        logger.info(traceback.format_exc())
        # penalising actions if model cannot be created
        return -1

    # Setup callbacks only
    callbacks = []
    cb_requires_valid = False  # Callbacks requires validation data
    callbacks_config = config["hyperparameters"].get("callbacks")
    if callbacks_config is not None:
        for cb_name, cb_conf in callbacks_config.items():
            if cb_name not in default_callbacks_config:
                logger.error(f"'{cb_name}' is not an accepted callback!")
                continue

            # cb_name in hvd_root_cb implies hvd.rank() == 0
            if cb_name in hvd_root_cb and hvd.rank() != 0:
                continue

            cb_kwargs = default_callbacks_config[cb_name]
            cb_kwargs.update(cb_conf)

            # Special dynamic parameters for callbacks
            if cb_name == "ModelCheckpoint":
                cb_kwargs["filepath"] = f'best_model_{config["id"]}.h5'

            # Import and create corresponding callback
            Callback = getattr(keras.callbacks, cb_name)
            callbacks.append(Callback(**cb_kwargs))

            if cb_name in ["EarlyStopping"]:
                # Read the monitor from the merged settings so a user config
                # that does not override "monitor" no longer raises KeyError;
                # "val_loss" is the Keras default for EarlyStopping.
                monitor = cb_kwargs.get("monitor", "val_loss")
                cb_requires_valid = "val" in monitor.split("_")

    trainer = HorovodTrainerTrainValid(config=config, model=model)

    # Horovod: broadcast initial variable states from rank 0 to all other
    # processes. This is necessary to ensure consistent initialization of all
    # workers when training is started with random weights or restored from a
    # checkpoint.
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

    trainer.callbacks.extend(callbacks)

    last_only, with_pred = preproc_trainer(config)
    last_only = last_only and not cb_requires_valid

    history = trainer.train(with_pred=with_pred, last_only=last_only)

    result = compute_objective(config["objective"], history)
    if result < -10:
        result = -10
    return result
def train(model_func, params):
    """Train a Keras image model with Horovod, optionally saving/exporting it.

    Args:
        model_func: Callable invoked as ``model_func(num_classes=...)`` that
            returns the Keras model to train.
        params: Mapping of run options. Keys read here: ``image_width``,
            ``image_height``, ``image_format``, ``distort_color``,
            ``momentum``, ``loss_scale``, ``data_dir``, ``data_idx_dir``,
            ``batch_size``, ``num_iter``, ``iter_unit`` ("epoch" or
            "batch"), ``log_dir``, ``export_dir``, ``tensorboard_dir``,
            ``display_every``, ``precision``, ``dali_mode``, ``use_xla`` and,
            when ``dali_mode`` is set with real data, ``dali_threads``.

    Raises:
        AssertionError: if ``iter_unit`` is neither "epoch" nor "batch", if
            ``log_dir`` is given without ``data_dir``, or if ``log_dir`` /
            ``tensorboard_dir`` / ``export_dir`` is given but does not exist.
    """
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    distort_color = params['distort_color']
    momentum = params['momentum']
    loss_scale = params['loss_scale']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    batch_size = params['batch_size']
    num_iter = params['num_iter']
    iter_unit = params['iter_unit']
    log_dir = params['log_dir']
    export_dir = params['export_dir']
    tensorboard_dir = params['tensorboard_dir']
    display_every = params['display_every']
    precision = params['precision']
    dali_mode = params['dali_mode']
    use_xla = params['use_xla']

    # Compare the TensorFlow version numerically: a plain string comparison
    # is lexicographic and would order "2.10.0" before "2.4.0".
    tf_major_minor = tuple(
        int(part) for part in re.findall(r'\d+', tf.__version__)[:2])
    tf_at_least_2_4 = tf_major_minor >= (2, 4)

    if data_dir is not None:
        file_format = os.path.join(data_dir, '%s-*')
        train_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_files = sorted(tf.io.gfile.glob(file_format % 'validation'))
        num_train_samples = common.get_num_records(train_files)
        num_valid_samples = common.get_num_records(valid_files)
    else:
        # Fixed sample counts used when no real data directory is given.
        num_train_samples = 1281982
        num_valid_samples = 5000

    train_idx_files = None
    valid_idx_files = None
    if data_idx_dir is not None:
        file_format = os.path.join(data_idx_dir, '%s-*')
        train_idx_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_idx_files = sorted(tf.io.gfile.glob(file_format % 'validation'))

    global_batch_size = batch_size * hvd.size()
    if iter_unit.lower() == 'epoch':
        num_epochs = num_iter
        nstep_per_epoch = num_train_samples // global_batch_size
        nstep_per_valid = num_valid_samples // global_batch_size
    else:
        assert iter_unit.lower() == 'batch'
        num_epochs = 1
        nstep_per_epoch = min(num_iter,
                              num_train_samples // global_batch_size)
        nstep_per_valid = min(10, num_valid_samples // global_batch_size)

    initial_epoch = 0
    if log_dir:
        # We save check points only when using the real data.
        assert data_dir, "--data_dir cannot be empty when using --log_dir"
        assert os.path.exists(log_dir)
        ckpt_format = log_dir + "/model-{epoch:02d}-{val_top1:.2f}.hdf5"
        # Looks for the most recent checkpoint and sets the initial epoch
        # from it (first integer in the file name is the epoch).
        for filename in os.listdir(log_dir):
            if filename.startswith('model-'):
                initial_epoch = max(int(re.findall(r'\d+', filename)[0]),
                                    initial_epoch)

    if tensorboard_dir:
        assert os.path.exists(tensorboard_dir)

    if export_dir:
        assert os.path.exists(export_dir)
        save_format = export_dir + "/saved_model_rn50.h5"

    if use_xla:
        tf.config.optimizer.set_jit(True)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    if precision == 'fp16':
        if tf_at_least_2_4:
            policy = keras.mixed_precision.Policy('mixed_float16')
            keras.mixed_precision.set_global_policy(policy)
        else:
            policy = keras.mixed_precision.experimental.Policy(
                'mixed_float16', loss_scale)
            keras.mixed_precision.experimental.set_policy(policy)

    lr_schedule = common.create_piecewise_constant_decay_with_warmup(
        batch_size=global_batch_size,
        epoch_size=num_train_samples,
        warmup_epochs=common.LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in common.LR_SCHEDULE),
        compute_lr_on_cpu=True)
    opt = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=momentum)
    # Horovod: add Horovod DistributedOptimizer. We use a modified version to
    # support the custom learning rate schedule.
    opt = hvd.DistributedOptimizer(opt)
    if tf_at_least_2_4 and precision == 'fp16':
        opt = keras.mixed_precision.LossScaleOptimizer(
            opt, dynamic=False, initial_scale=loss_scale)

    backend.set_image_data_format(image_format)
    dtype = 'float16' if precision == 'fp16' else 'float32'
    backend.set_floatx(dtype)
    model = model_func(num_classes=image_processing.NUM_CLASSES)
    # A plain string: the original trailing comma turned this into a 1-tuple.
    loss_func = 'sparse_categorical_crossentropy'

    top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top5')
    top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1, name='top1')

    # Horovod: Specify `experimental_run_tf_function=False` to ensure
    # TensorFlow uses hvd.DistributedOptimizer() to compute gradients.
    # However, this option will disable the overlapping of the data loading
    # and compute and hurt the performance if the model is not under the
    # scope of distribution strategy scope.
    model.compile(optimizer=opt,
                  loss=loss_func,
                  metrics=[top1, top5],
                  experimental_run_tf_function=False)

    training_hooks = []
    training_hooks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    training_hooks.append(_ProfileKerasFitCallback(batch_size, display_every))

    # Checkpoints and TensorBoard logs are written by the first worker only.
    if log_dir and hvd.rank() == 0:
        # NOTE(review): `save_frequency` is not a documented ModelCheckpoint
        # argument (the documented one is `save_freq`); kept as-is to avoid
        # changing checkpoint cadence - confirm intent.
        ckpt_callback = keras.callbacks.ModelCheckpoint(
            ckpt_format,
            monitor='val_top1',
            verbose=1,
            save_best_only=False,
            save_weights_only=False,
            save_frequency=1)
        training_hooks.append(ckpt_callback)

    if tensorboard_dir and hvd.rank() == 0:
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=tensorboard_dir)
        training_hooks.append(tensorboard_callback)

    if data_dir is not None:
        num_preproc_threads = params['dali_threads'] if dali_mode else 10
        train_input = image_processing.image_set(
            train_files,
            batch_size,
            image_height,
            image_width,
            training=True,
            distort_color=distort_color,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=train_idx_files)

        valid_input = image_processing.image_set(
            valid_files,
            batch_size,
            image_height,
            image_width,
            training=False,
            distort_color=False,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=valid_idx_files)
        if dali_mode:
            train_input = train_input.get_device_dataset()
            valid_input = valid_input.get_device_dataset()
        valid_params = {
            'validation_data': valid_input,
            'validation_steps': nstep_per_valid,
            'validation_freq': 1
        }
    else:
        # No real data: train on generated images and skip validation.
        train_input = image_processing.fake_image_set(batch_size,
                                                      image_height,
                                                      image_width)
        valid_params = {}

    try:
        verbose = 2 if hvd.rank() == 0 else 0
        model.fit(train_input,
                  epochs=num_epochs,
                  callbacks=training_hooks,
                  steps_per_epoch=nstep_per_epoch,
                  verbose=verbose,
                  initial_epoch=initial_epoch,
                  **valid_params)
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if export_dir and hvd.rank() == 0:
        model.save(save_format)
        print(f"The model is saved to {save_format}")
layers.Dense(64, activation='relu'), layers.Dropout(0.5), layers.Dense(10, activation='softmax') ]) opt = hvd.DistributedOptimizer(tf.optimizers.Adam(0.01), backward_passes_per_step=1, average_aggregated_gradients=True) model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy']) callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)] if hvd.rank() == 0: callbacks.append( tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) datagen = ImageDataGenerator(horizontal_flip=True) model.fit(datagen.flow(x_train, y_train, batch_size=8), callbacks=callbacks, epochs=3, verbose=(hvd.rank() == 0)) if hvd.rank() == 0: test_data = np.load('data_test.npz') x_test, y_test = test_data['x_test'], test_data['y_test'] preds = model.predict(x_test) acc_score = accuracy_score(y_test[:, 0], np.argmax(preds, axis=1)) print(f'Model accuracy is {acc_score}')
def main(args):
    """Train and evaluate a small MNIST convnet with Horovod.

    Args:
        args: Parsed options. Attributes read here: ``use_only_cpu``,
            ``num_epochs`` and ``model_dir``.
    """
    # Horovod must be initialised before any rank/size query.
    hvd.init()

    if args.use_only_cpu:
        config = None
    else:
        # Horovod: one GPU per process, selected by the local rank.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    batch_size = 128
    num_classes = 10

    # Horovod: the requested epoch count is divided among the workers.
    epochs = int(math.ceil(args.num_epochs / hvd.size()))

    # Input image dimensions
    img_rows, img_cols = 28, 28

    # The data, shuffled and split between train and test sets
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    # Place the single channel axis where the backend expects it.
    if K.image_data_format() == "channels_first":
        input_shape = (1, img_rows, img_cols)
    else:
        input_shape = (img_rows, img_cols, 1)
    x_train = x_train.reshape((x_train.shape[0],) + input_shape)
    x_test = x_test.reshape((x_test.shape[0],) + input_shape)

    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print("x_train shape:", x_train.shape)
    print(x_train.shape[0], "train samples")
    print(x_test.shape[0], "test samples")

    # One-hot encode the labels.
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model = Sequential([
        Conv2D(32,
               kernel_size=(3, 3),
               activation="relu",
               input_shape=input_shape),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation="relu"),
        Dropout(0.5),
        Dense(num_classes, activation="softmax"),
    ])

    # Horovod: learning rate scaled by the worker count, then the optimizer
    # is wrapped in the Horovod distributed optimizer.
    base_optimizer = keras.optimizers.Adadelta(1.0 * hvd.size())
    opt = hvd.DistributedOptimizer(base_optimizer)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=["accuracy"])

    is_root = hvd.rank() == 0

    # Horovod: broadcast initial variable states from rank 0 so every worker
    # starts from the same weights.
    callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]

    # Horovod: only worker 0 writes checkpoints so workers do not overwrite
    # each other's files.
    if is_root:
        checkpoint_path = os.path.join(args.model_dir, "checkpoint-{epoch}.h5")
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_path))

    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        callbacks=callbacks,
        epochs=epochs,
        verbose=1 if is_root else 0,
        validation_data=(x_test, y_test),
    )

    score = model.evaluate(x_test, y_test, verbose=0)
    print("Test loss:", score[0])
    print("Test accuracy:", score[1])
def get_gradients(self, loss, params):
    """Return a single constant gradient holding this worker's Horovod rank.

    Args:
        loss: Unused.
        params: Sequence that must contain exactly one element.

    Returns:
        A one-element list with a constant tensor ``[float(hvd.rank())]``.
    """
    assert len(params) == 1
    rank_as_float = float(hvd.rank())
    return [tf.constant([rank_as_float])]
def main(args):
    """Train, evaluate and save the Keras model, optionally with Horovod/MPI.

    Args:
        args: Parsed options. Attributes read here: ``tensorboard_dir``,
            ``fw_params`` (its ``'sagemaker_mpi_enabled'`` entry switches
            Horovod on), ``learning_rate``, ``weight_decay``, ``optimizer``,
            ``momentum``, ``batch_size``, ``epochs``, ``output_dir`` and
            ``model_output_dir``.

    Returns:
        The result of ``save_model`` on the worker that saves the model
        (the only worker without MPI, rank 0 with MPI); ``None`` elsewhere.
    """
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'model',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))

    # Bind `hvd` up front so it is defined on every path: it is passed to
    # keras_model_fn below even when MPI is disabled or the flag is absent.
    hvd = None
    mpi = bool('sagemaker_mpi_enabled' in args.fw_params
               and args.fw_params['sagemaker_mpi_enabled'])
    if mpi:
        import horovod.tensorflow.keras as hvd
        # Horovod: initialize Horovod.
        hvd.init()

        # Horovod: pin GPU to be used to process local rank (one GPU per
        # process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))

    logging.info("Running with MPI={}".format(mpi))
    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay,
                           args.optimizer, args.momentum, mpi, hvd)

    # Without MPI this single process does all the writing; with MPI only
    # rank 0 writes checkpoints, TensorBoard logs and the final model.
    is_writer = (not mpi) or hvd.rank() == 0

    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1))
        callbacks.append(
            tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
    if is_writer:
        callbacks.append(
            ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(CustomTensorBoardCallback(log_dir=tensorboard_dir))

    logging.info("Starting training")
    size = hvd.size() if mpi else 1
    model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size)
        // size,
        epochs=args.epochs,
        validation_data=validation_dataset,
        validation_steps=(num_examples_per_epoch('validation') //
                          args.batch_size) // size,
        callbacks=callbacks)

    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=num_examples_per_epoch('eval') //
                           args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # Horovod: Save model only on worker 0 (i.e. master)
    if is_writer:
        return save_model(model, args.model_output_dir)
def test_elastic_state(self):
    """Check sync / restore / commit behaviour of ``hvd.elastic.KerasState``.

    Each worker starts with rank-dependent weights and batch/epoch counters;
    the assertions verify that ``sync`` aligns them with rank 0, that
    ``restore`` rolls back uncommitted changes, and that ``commit`` makes
    changes survive a later ``restore``.
    """
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        # Rank 0 holds all-ones weights; every other rank holds all-twos.
        v = 1.0 if hvd.rank() == 0 else 2.0
        model1 = tf.keras.Sequential(
            [tf.keras.layers.Dense(2, activation='softmax')])
        model1.build((2, 2))
        model1.set_weights([
            np.array([[v, v], [v, v]], dtype=np.float32),
            np.array([v, v], dtype=np.float32)
        ])

        # model2 only serves as a source of different weights to copy from.
        model2 = tf.keras.Sequential(
            [tf.keras.layers.Dense(2, activation='softmax')])
        model2.build((2, 2))
        model2.set_weights([
            np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
            np.array([0.0, 0.0], dtype=np.float32)
        ])

        optimizer = tf.keras.optimizers.Adam(0.001 * hvd.size())

        # batch/epoch are offset by the rank, so only rank 0 starts at 20/10.
        state = hvd.elastic.KerasState(model1,
                                       optimizer,
                                       batch=20 + hvd.rank(),
                                       epoch=10 + hvd.rank())
        state.sync()

        model1_weights = model1.get_weights()
        model2_weights = model2.get_weights()

        # After sync, all values should match the root rank
        for w in state.model.get_weights():
            self.assertAllClose(w, np.ones_like(w))
        assert state.batch == 20
        assert state.epoch == 10

        # Partially modify then restore
        model1.set_weights(model2_weights)
        state.batch = 21
        state.epoch = 11

        state.restore()

        # Uncommitted changes are discarded: weights and counters are back
        # to the synced values.
        for w1, w2 in zip(model1.get_weights(), model1_weights):
            self.assertAllClose(w1, w2)
        assert state.batch == 20
        assert state.epoch == 10

        # Partially modify then commit
        model1.set_weights(model2_weights)
        state.batch = 21
        state.epoch = 11

        state.commit()
        state.restore()

        # Committed changes are what restore() now returns to.
        for w1, w2 in zip(model1.get_weights(), model2_weights):
            self.assertAllClose(w1, w2)
        assert state.batch == 21
        assert state.epoch == 11
default='cpu', help='Wheter this is running on cpu or gpu') parser.add_argument('--num_inter', default=2, help='set number inter', type=int) parser.add_argument('--num_intra', default=0, help='set number intra', type=int) args = parser.parse_args() # Horovod: pin GPU to be used to process local rank (one GPU per process) print("I am rank %s of %s" % (hvd.rank(), hvd.size())) # Horovod: pin GPU to be used to process local rank (one GPU per process) if args.device == 'cpu': tf.config.threading.set_intra_op_parallelism_threads(args.num_intra) tf.config.threading.set_inter_op_parallelism_threads(args.num_inter) else: gpus = tf.config.experimental.list_physical_devices('GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) if gpus: tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') (mnist_images, mnist_labels), _ = \ tf.keras.datasets.mnist.load_data(path='mnist.npz')
def train(model,
          train_images,
          train_annotations,
          input_height=None,
          input_width=None,
          n_classes=None,
          verify_dataset=True,
          checkpoints_path=None,
          epochs=5,
          batch_size=2,
          validate=False,
          val_images=None,
          val_annotations=None,
          auto_resume_checkpoint=False,
          load_weights=None,
          steps_per_epoch=None,
          val_steps_per_epoch=None,
          gen_use_multiprocessing=False,
          ignore_zero_class=False,
          optimizer_name='adam',
          do_augment=False,
          augmentation_name="aug_all",
          data_type='fp32',
          tb_location=None,
          deterministic=False,
          model_dir=None,
          dump_config=None,
          distributed=False,
          use_upsampling=False,
          loss_type=0,
          train_engine='hpu',
          not_cached=False):
    """Train a segmentation model, optionally distributed with Horovod.

    Args:
        model: A model object, or a model name looked up in
            ``model_from_name`` (then ``n_classes`` is required).
        train_images, train_annotations: Training image / annotation
            directories (``os.listdir`` is applied to ``train_images``).
        input_height, input_width, n_classes: Used only when ``model`` is a
            name; afterwards they are re-read from the model object.
        verify_dataset: Run ``verify_segmentation_dataset`` before training.
        checkpoints_path: Prefix used for the ``_config.json`` dump, the
            checkpoint callback and ``auto_resume_checkpoint``.
        epochs, batch_size: Training length and per-worker batch size.
        validate: Train with validation data (single-worker setups only).
        val_images, val_annotations: Validation directories.
        auto_resume_checkpoint: Load the latest checkpoint found under
            ``checkpoints_path`` before training.
        load_weights: Optional weights file loaded before training.
        steps_per_epoch, val_steps_per_epoch: Default to the directory sizes
            divided by the batch size (and shard count for training).
        gen_use_multiprocessing: Forwarded to ``fit`` in the validate path.
        ignore_zero_class: Use ``masked_categorical_crossentropy``.
        optimizer_name: Optimizer passed to ``compile``; ``None`` skips
            compilation. Replaced by a Horovod-wrapped Adam when more than
            one shard is used.
        do_augment, augmentation_name, deterministic, loss_type:
            Forwarded to the data generators / loss selection.
        data_type: ``'bf16'`` on HPU sets ``TF_BF16_CONVERSION`` if unset.
        tb_location: TensorBoard log directory; ``None`` or ``''`` disables
            the TensorBoard callback.
        model_dir: Enables the HPU hparams / throughput callbacks.
        dump_config: Passed to ``dump_callback``.
        distributed: Initialise Horovod and shard the data by rank.
        use_upsampling: Forwarded to the model constructor.
        train_engine: ``'hpu'`` or ``'gpu'``.
        not_cached: Feed batches straight from the generator instead of the
            caching generator.
    """
    if train_engine == 'hpu':
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        print("Loaded HPU modules")
        # For Habana Model runner hooks
        from TensorFlow.common.debug import dump_callback
        from TensorFlow.common.tb_utils import (TensorBoardWithHParamsV2,
                                                ExamplesPerSecondKerasHookV2)
    else:

        class dump_callback(object):
            """No-op stand-in for the Habana dump context manager."""

            def __init__(self, file_name):
                pass

            def __enter__(self):
                pass

            def __exit__(self, type, value, traceback):
                pass

    if data_type == 'bf16' and train_engine == 'hpu':
        bf16_json = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 '../bf16_segnet.json')
        # An already-exported TF_BF16_CONVERSION takes precedence.
        os.environ['TF_BF16_CONVERSION'] = os.environ.get(
            'TF_BF16_CONVERSION', bf16_json)
        print("Setting BF16:", os.getenv('TF_BF16_CONVERSION'))

    shard_id = 0
    num_shards = 1

    if distributed:
        import horovod.tensorflow.keras as hvd
        print("hvd init")
        hvd.init()
        if train_engine == 'gpu':
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')
            print("Set memory growth for GPUS")

        shard_id = hvd.rank()
        num_shards = hvd.size()
        if num_shards == 1:
            print(
                "Distributed training requested but horovod init not success")
            exit()

    print("num_shards: " + str(num_shards) + " shard_id: " + str(shard_id))

    from keras_segmentation.models.all_models import model_from_name
    # check if user gives model name instead of the model object
    if isinstance(model, six.string_types):
        # create the model from the name
        assert (n_classes is not None), "Please provide the n_classes"
        if (input_height is not None) and (input_width is not None):
            model = model_from_name[model](n_classes,
                                           input_height=input_height,
                                           input_width=input_width,
                                           batch_size=batch_size,
                                           use_upsampling=use_upsampling,
                                           loss_type=loss_type)
        else:
            model = model_from_name[model](n_classes,
                                           batch_size=batch_size,
                                           use_upsampling=use_upsampling,
                                           loss_type=loss_type)

    n_classes = model.n_classes
    input_height = model.input_height
    input_width = model.input_width
    output_height = model.output_height
    output_width = model.output_width

    if steps_per_epoch is None:
        steps_per_epoch = len(
            os.listdir(train_images)) // (batch_size * num_shards)
    if val_steps_per_epoch is None:
        val_steps_per_epoch = len(os.listdir(val_images)) // batch_size

    print("Steps per epoch: " + str(steps_per_epoch))

    def optimized_xent_loss_custom_grad(ytrue, ypred):
        """Softmax cross-entropy with a hand-written gradient."""

        @tf.custom_gradient
        def loss_without_mean(ytrue, ypred):
            with tf.name_scope("softmax_cross_entropy"):
                logits_t = tf.transpose(ypred,
                                        perm=(0, 1, 3, 2),
                                        name="logits_t")  # BS H N W
                reduce_max = tf.reduce_max(logits_t, 2,
                                           name="reduce_max")  # BS H W
                max_logits = tf.expand_dims(reduce_max, 3)  # BS H W 1
                shifted_logits = tf.subtract(
                    ypred, max_logits, name="shifted_logits")  # BS H W N
                exp_shifted_logits = tf.math.exp(
                    shifted_logits, name="exp_shifted_logits")  # BS H W N
                # A 1x1 convolution with an all-ones filter sums over the
                # class axis.
                reduce_sum_filter = tf.fill([1, 1, n_classes, 1], 1.0)
                sum_exp = tf.nn.conv2d(exp_shifted_logits,
                                       reduce_sum_filter,
                                       strides=1,
                                       padding="VALID",
                                       name="sum_exp")  # BS H W 1
                log_sum_exp = tf.math.log(sum_exp,
                                          name="log_sum_exp")  # BS H W 1
                shifted_logits2 = tf.nn.conv2d(
                    shifted_logits * ytrue,
                    reduce_sum_filter,
                    strides=1,
                    padding="VALID",
                    name="shifted_logits2")  # BS H W 1
                loss = tf.subtract(log_sum_exp,
                                   shifted_logits2,
                                   name="loss/sub")  # BS H W 1

            def custom_grad(dy):  # dy is BS H W 1
                with tf.name_scope("gradients/softmax_cross_entropy"):
                    div = tf.math.truediv(exp_shifted_logits,
                                          sum_exp,
                                          name="div")  # BS H W N
                    sub = tf.math.subtract(div, ytrue,
                                           name="sub")  # BS H W N
                    ret = tf.math.multiply(sub, dy, name="mul")
                return -dy * shifted_logits, ret

            return loss, custom_grad

        return tf.math.reduce_mean(loss_without_mean(ytrue, ypred))

    if validate:
        assert val_images is not None
        assert val_annotations is not None

    if optimizer_name is not None:
        if ignore_zero_class:
            loss_k = masked_categorical_crossentropy
        elif loss_type == 1:
            loss_k = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)
        elif loss_type == 2:
            loss_k = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        else:
            loss_k = optimized_xent_loss_custom_grad

        print(optimizer_name)
        if num_shards > 1:
            # Multi-worker runs always use a Horovod-wrapped Adam.
            optimizer = Adam(lr=LearningRate)
            optimizer_name = hvd.DistributedOptimizer(optimizer)

        model.compile(loss=loss_k,
                      optimizer=optimizer_name,
                      metrics=['accuracy'])

    if checkpoints_path is not None:
        with open(checkpoints_path + "_config.json", "w") as f:
            json.dump(
                {
                    "model_class": model.model_name,
                    "n_classes": n_classes,
                    "input_height": input_height,
                    "input_width": input_width,
                    "output_height": output_height,
                    "output_width": output_width
                }, f)

    if load_weights is not None and len(load_weights) > 0:
        print("Loading weights from ", load_weights)
        status = model.load_weights(load_weights)
        print(status)

    if auto_resume_checkpoint and (checkpoints_path is not None):
        latest_checkpoint = find_latest_checkpoint(checkpoints_path)
        if latest_checkpoint is not None:
            print("Loading the weights from latest checkpoint ",
                  latest_checkpoint)
            model.load_weights(latest_checkpoint)

    if verify_dataset:
        print("Verifying training dataset")
        verified = verify_segmentation_dataset(train_images,
                                               train_annotations, n_classes,
                                               deterministic)
        assert verified
        if validate:
            print("Verifying validation dataset")
            verified = verify_segmentation_dataset(val_images,
                                                   val_annotations, n_classes,
                                                   deterministic)
            assert verified

    if not_cached:
        train_gen = image_segmentation_generator(
            train_images,
            train_annotations,
            batch_size,
            n_classes,
            input_height,
            input_width,
            output_height,
            output_width,
            deterministic,
            do_augment=do_augment,
            augmentation_name=augmentation_name,
            num_shards=num_shards,
            shard_id=shard_id,
            loss_type=loss_type)
    else:
        # The base generator yields single samples; the caching generator
        # does the batching.
        train_gen = image_segmentation_generator(
            train_images,
            train_annotations,
            1,
            n_classes,
            input_height,
            input_width,
            output_height,
            output_width,
            deterministic,
            do_augment=do_augment,
            augmentation_name=augmentation_name,
            num_shards=num_shards,
            shard_id=shard_id,
            loss_type=loss_type)
        train_gen = cached_image_generator(train_gen, num_shards, shard_id,
                                           batch_size,
                                           len(os.listdir(train_images)),
                                           deterministic)

    callbacks = []

    if num_shards > 1:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())

    callbacks.append(CheckpointsCallback(checkpoints_path))

    if model_dir is not None:
        hparams = {
            "model_name": model,
            "optimizer": optimizer_name,
            "batch_size": batch_size
        }

        if train_engine == 'hpu':
            callbacks += [
                TensorBoardWithHParamsV2(hparams,
                                         log_dir=model_dir,
                                         update_freq=5),
                ExamplesPerSecondKerasHookV2(5,
                                             batch_size=batch_size,
                                             output_dir=model_dir)
            ]

    # Skip TensorBoard for both '' and the default None; the previous
    # `!= ''` check let None through and built TensorBoard(log_dir=None).
    if tb_location:
        tensorboard_callback = TensorBoard(log_dir=tb_location,
                                           histogram_freq=1)
        callbacks.append(tensorboard_callback)
        print("TB:", tb_location)

    if not validate:
        with dump_callback(dump_config):
            # A one-step warm-up fit is timed separately as "compilation".
            start_compilation = time.time()
            model.fit(train_gen, steps_per_epoch=1, epochs=1)
            stop_compilation = time.time()
            history = model.fit(train_gen,
                                steps_per_epoch=steps_per_epoch,
                                epochs=epochs,
                                callbacks=callbacks,
                                verbose=1 if shard_id == 0 else 0)
            stop_training = time.time()
        with open('./trainHistoryDict_' + str(shard_id), 'wb') as file_pi:
            pickle.dump(history.history, file_pi)
        avg_time_per_batch = (stop_training -
                              stop_compilation) / (steps_per_epoch * epochs)
        print('Compile time in seconds:',
              (stop_compilation - start_compilation))
        print('Average time per batch in seconds (leaving out compilation):',
              avg_time_per_batch)
        print('Average time per image in seconds (leaving out compilation)',
              avg_time_per_batch / batch_size)
        print('Average images per sec (leaving out compilation):',
              batch_size / avg_time_per_batch)

        if loss_type == 1:
            print('Eval for LOSS_FUNC_TYPE=1 is WIP')
            exit()

        # Final evaluation runs on the first worker only, unsharded.
        if shard_id == 0:
            if not_cached:
                val_gen = image_segmentation_generator(val_images,
                                                       val_annotations,
                                                       batch_size,
                                                       n_classes,
                                                       input_height,
                                                       input_width,
                                                       output_height,
                                                       output_width,
                                                       deterministic,
                                                       num_shards=1,
                                                       shard_id=shard_id,
                                                       loss_type=loss_type)
            else:
                val_gen = image_segmentation_generator(val_images,
                                                       val_annotations,
                                                       1,
                                                       n_classes,
                                                       input_height,
                                                       input_width,
                                                       output_height,
                                                       output_width,
                                                       deterministic,
                                                       num_shards=1,
                                                       shard_id=shard_id,
                                                       loss_type=loss_type)
                val_gen = cached_image_generator(val_gen, 1, 0, batch_size,
                                                 len(os.listdir(val_images)))
            f1_metric = FBetaScore(num_classes=n_classes)
            # Recompile with evaluation metrics, reusing the training loss.
            model.compile(loss=model.loss,
                          metrics=[
                              tf.keras.metrics.CategoricalAccuracy(
                                  name="categorical_accuracy", dtype=None),
                              f1_metric
                          ])
            test_loss, test_acc, test_f1 = model.evaluate(
                val_gen, steps=(len(os.listdir(val_images)) // batch_size))
            train_loss, train_acc, train_f1 = model.evaluate(
                train_gen,
                steps=(len(os.listdir(train_images)) // batch_size))
            print(
                f'test loss : {test_loss}, test accuracy : {test_acc}, test f1 : {test_f1}'
            )
            print(
                f'train loss : {train_loss}, train accuracy : {train_acc}, train f1 : {train_f1}'
            )
    else:
        # Value comparison: `is 1` tested object identity with an int literal.
        assert (
            num_shards == 1
        ), "Only support training with validation with single HPU setup"
        if not_cached:
            val_gen = image_segmentation_generator(val_images,
                                                   val_annotations,
                                                   batch_size,
                                                   n_classes,
                                                   input_height,
                                                   input_width,
                                                   output_height,
                                                   output_width,
                                                   deterministic,
                                                   num_shards=num_shards,
                                                   shard_id=shard_id,
                                                   loss_type=loss_type)
        else:
            val_gen = image_segmentation_generator(val_images,
                                                   val_annotations,
                                                   1,
                                                   n_classes,
                                                   input_height,
                                                   input_width,
                                                   output_height,
                                                   output_width,
                                                   deterministic,
                                                   num_shards=num_shards,
                                                   shard_id=shard_id,
                                                   loss_type=loss_type)
            val_gen = cached_image_generator(val_gen, num_shards, shard_id,
                                             batch_size,
                                             len(os.listdir(val_images)),
                                             deterministic)
        start_compilation = time.time()
        model.fit(train_gen, steps_per_epoch=1, epochs=1)
        stop_compilation = time.time()
        model.fit(train_gen,
                  steps_per_epoch=steps_per_epoch,
                  validation_data=val_gen,
                  validation_steps=val_steps_per_epoch,
                  epochs=epochs,
                  callbacks=callbacks,
                  use_multiprocessing=gen_use_multiprocessing,
                  verbose=1 if shard_id == 0 else 0)
        stop_training = time.time()
        avg_time_per_batch = (stop_training -
                              stop_compilation) / (steps_per_epoch * epochs)
        print('Compile time in seconds:',
              (stop_compilation - start_compilation))
        print('Average time per batch in seconds (leaving out compilation):',
              avg_time_per_batch)
        print('Average time per image in seconds (leaving out compilation)',
              avg_time_per_batch / batch_size)
import tensorflow as tf import horovod.tensorflow.keras as hvd # Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used to process local rank (one GPU per process) gpus = tf.config.experimental.list_physical_devices('GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) if gpus: tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') (mnist_images, mnist_labels), _ = \ tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank()) dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32), tf.cast(mnist_labels, tf.int64))) dataset = dataset.repeat().shuffle(10000).batch(128) mnist_model = tf.keras.Sequential([ tf.keras.layers.Conv2D(32, [3, 3], activation='relu'), tf.keras.layers.Conv2D(64, [3, 3], activation='relu'), tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), tf.keras.layers.Dropout(0.25), tf.keras.layers.Flatten(), tf.keras.layers.Dense(128, activation='relu'), tf.keras.layers.Dropout(0.5), tf.keras.layers.Dense(10, activation='softmax')
import horovod.tensorflow.keras as hvd import tensorflow as tf from tensorflow.keras.models import load_model from tensorflow.keras.optimizers import Adam from argparse import ArgumentParser from datetime import datetime from time import perf_counter from tqdm import tqdm import json import os # Get the rank information hvd.init() rank = hvd.rank() size = hvd.size() # Hard-coded paths for data and model _data_path = os.path.join('..', '..', 'data', 'output', 'water_clusters.proto') _model_path = os.path.join('..', 'model.h5') if __name__ == "__main__": # Parse the arguments arg_parser = ArgumentParser() arg_parser.add_argument('--batch-sizes', '-b', nargs='*', default=[32], help='Batch size for each rank', type=int)
ROOT = '/mnt/bb/$USERID' #load Keras model model = tf.keras.applications.DenseNet169(weights=None, include_top=True, input_shape=(IMG_SIZE, IMG_SIZE, 3), classes=2) # compile the model model.compile(loss="categorical_crossentropy", optimizer=hvd.DistributedOptimizer( tf.keras.optimizers.Adam(lr=LRATE * hvd.size())), metrics=["accuracy"], experimental_run_tf_function=False) if hvd.rank() == 0: print(model.summary()) verbose = 1 if hvd.rank() == 0 else 0 cbs = [ hvd.callbacks.BroadcastGlobalVariablesCallback(0), hvd.callbacks.MetricAverageCallback(), hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=WUP, verbose=verbose), # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs. hvd.callbacks.LearningRateScheduleCallback(start_epoch=WUP, end_epoch=WUP + 10, multiplier=1.), hvd.callbacks.LearningRateScheduleCallback(start_epoch=WUP + 10, end_epoch=WUP + 20,
def train_fn(model_bytes):
    """Train the serialized Keras model on this Horovod worker.

    Args:
        model_bytes: Serialized model; it is restored with
            ``deserialize_model(model_bytes, hvd.load_model)``.

    Returns:
        On rank 0, a tuple ``(history.history, checkpoint_bytes)`` where
        ``checkpoint_bytes`` is the raw content of the checkpoint file
        written during training.  Every other rank falls off the end of
        the function and therefore returns ``None``.
    """
    # Make sure pyarrow is referenced before anything else to avoid segfault due to conflict
    # with TensorFlow libraries.  Use `pa` package reference to ensure it's loaded before
    # functions like `deserialize_model` which are implemented at the top level.
    # See https://jira.apache.org/jira/browse/ARROW-3346
    pa

    import atexit
    import horovod.tensorflow.keras as hvd
    import os
    from petastorm import make_batch_reader
    from petastorm.tf_utils import make_petastorm_dataset
    import tempfile
    import tensorflow as tf
    import tensorflow.keras.backend as K
    import shutil

    # Horovod: initialize Horovod inside the trainer.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process), if GPUs are available.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    # Horovod: restore from checkpoint, use hvd.load_model under the hood.
    model = deserialize_model(model_bytes, hvd.load_model)

    # Horovod: adjust learning rate based on number of processes.
    scaled_lr = K.get_value(model.optimizer.lr) * hvd.size()
    K.set_value(model.optimizer.lr, scaled_lr)

    # Horovod: print summary logs on the first worker.
    verbose = 2 if hvd.rank() == 0 else 0

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, initial_lr=scaled_lr,
                                                 verbose=verbose),

        # Reduce LR if the metric is not improved for 10 epochs, and stop training
        # if it has not improved for 20 epochs.
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_exp_rmspe', patience=10,
                                             verbose=verbose),
        tf.keras.callbacks.EarlyStopping(monitor='val_exp_rmspe', mode='min', patience=20,
                                         verbose=verbose),
        tf.keras.callbacks.TerminateOnNaN()
    ]

    # Model checkpoint location.
    ckpt_dir = tempfile.mkdtemp()
    ckpt_file = os.path.join(ckpt_dir, 'checkpoint.h5')
    # Remove the temporary checkpoint directory when the interpreter exits.
    atexit.register(lambda: shutil.rmtree(ckpt_dir))

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            tf.keras.callbacks.ModelCheckpoint(ckpt_file, monitor='val_exp_rmspe', mode='min',
                                               save_best_only=True))

    def to_features_and_label(row):
        # Shared by the training and validation pipelines: the inputs are the
        # `all_cols` attributes of the row (in that order) and the target is
        # tf.log of the row's `Sales` attribute.
        return tuple(getattr(row, col) for col in all_cols), tf.log(row.Sales)

    # Make Petastorm readers.  Each worker reads its own shard
    # (cur_shard=rank, shard_count=size); num_epochs=None is passed to both.
    with make_batch_reader('%s/train_df.parquet' % args.data_dir, num_epochs=None,
                           cur_shard=hvd.rank(), shard_count=hvd.size(),
                           hdfs_driver=PETASTORM_HDFS_DRIVER) as train_reader:
        with make_batch_reader('%s/val_df.parquet' % args.data_dir, num_epochs=None,
                               cur_shard=hvd.rank(), shard_count=hvd.size(),
                               hdfs_driver=PETASTORM_HDFS_DRIVER) as val_reader:
            # Convert readers to tf.data.Dataset.
            train_ds = make_petastorm_dataset(train_reader) \
                .apply(tf.data.experimental.unbatch()) \
                .shuffle(int(train_rows / hvd.size())) \
                .batch(args.batch_size) \
                .map(to_features_and_label)

            val_ds = make_petastorm_dataset(val_reader) \
                .apply(tf.data.experimental.unbatch()) \
                .batch(args.batch_size) \
                .map(to_features_and_label)

            # Step counts are the row counts divided by the batch size and by
            # the number of workers, truncated to an int.
            history = model.fit(train_ds,
                                validation_data=val_ds,
                                steps_per_epoch=int(train_rows / args.batch_size / hvd.size()),
                                validation_steps=int(val_rows / args.batch_size / hvd.size()),
                                callbacks=callbacks,
                                verbose=verbose,
                                epochs=args.epochs)

    # Dataset API usage currently displays a wall of errors upon termination.
    # This global model registration ensures clean termination.
    # Tracked in https://github.com/tensorflow/tensorflow/issues/24570
    globals()['_DATASET_FINALIZATION_HACK'] = model

    # Only rank 0 registered the ModelCheckpoint callback, so only rank 0
    # reads the checkpoint back; other ranks return None implicitly.
    if hvd.rank() == 0:
        with open(ckpt_file, 'rb') as f:
            return history.history, f.read()
# Horovod: add Horovod Distributed Optimizer. opt = hvd.DistributedOptimizer(opt) model.compile(loss=tf.keras.losses.categorical_crossentropy, optimizer=opt, metrics=['accuracy']) callbacks = [ # Horovod: broadcast initial variable states from rank 0 to all other processes. # This is necessary to ensure consistent initialization of all workers when # training is started with random weights or restored from a checkpoint. hvd.callbacks.BroadcastGlobalVariablesCallback(0), ] # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. if hvd.rank() == 0: callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) model.fit(x_train, y_train, batch_size=batch_size, callbacks=callbacks, epochs=epochs, verbose=1, validation_data=(x_test, y_test)) score = model.evaluate(x_test, y_test, verbose=0) print('Test loss:', score[0]) print('Test accuracy:', score[1]) # Horovod: Save model only on worker 0 (i.e. master) if hvd.rank() == 0:
image = tf.image.resize(image, (224, 224)) label = tf.cast(features['image/class/label'], tf.int64) label = tf.one_hot(label, 1001) return image, label data_dir = '/scratch/snx3000/stud50/imagenet/' list_of_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir)] dataset = tf.data.Dataset.list_files(list_of_files) dataset = dataset.interleave(tf.data.TFRecordDataset, cycle_length=120, block_length=1) dataset = dataset.map(decode) dataset = dataset.batch(128) dataset = dataset.shard(hvd.size(), hvd.rank()) model = tf.keras.applications.InceptionV3(weights=None, input_shape=(224, 224, 3), classes=1001) optimizer = tf.keras.optimizers.SGD(lr=0.01, momentum=0.9) optimizer = hvd.DistributedOptimizer(optimizer) model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy']) hvd_callback = hvd.callbacks.BroadcastGlobalVariablesCallback(0) fit = model.fit(dataset, epochs=1, callbacks=[hvd_callback])