def main(args):
    # Hyper-parameters
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer

    # SageMaker options
    gpu_count = args.gpu_count
    training_dir = args.train
    validation_dir = args.validation
    eval_dir = args.eval
    tensorboard_logs = args.tensorboard_logs

    hvd.init()
    size = hvd.size()

    # Change 3 - pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    train_dataset = make_batch(training_dir + '/train.tfrecords', batch_size)
    val_dataset = make_batch(validation_dir + '/validation.tfrecords',
                             batch_size)
    eval_dataset = make_batch(eval_dir + '/eval.tfrecords', batch_size)

    input_shape = (HEIGHT, WIDTH, DEPTH)

    callbacks = []
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    callbacks.append(hvd.callbacks.MetricAverageCallback())
    callbacks.append(
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
    callbacks.append(
        tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
    if hvd.rank() == 0:
        callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))
        logdir = args.output_data_dir + '/' + datetime.now().strftime(
            "%Y%m%d-%H%M%S")
        callbacks.append(TensorBoard(log_dir=logdir, profile_batch=0))
        callbacks.append(Sync2S3(logdir=logdir, s3logdir=tensorboard_logs))

    model = get_model(lr, weight_decay, optimizer, momentum, hvd)

    # Train model
    history = model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        steps_per_epoch=(NUM_TRAIN_IMAGES // batch_size) // size,
        validation_data=val_dataset,
        validation_steps=(NUM_VALID_IMAGES // batch_size) // size,
        epochs=epochs,
        callbacks=callbacks)

    # Evaluate model performance
    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=NUM_TEST_IMAGES // args.batch_size,
                           verbose=0)
    print('Test loss    :', score[0])
    print('Test accuracy:', score[1])

    if hvd.rank() == 0:
        save_history(args.output_data_dir + "/hvd_history.p", history)
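A note on Sync2S3: it is not defined in this snippet. A minimal sketch of what such a callback might look like, assuming the AWS CLI is available on the training host (the behaviour is inferred from the call site above, not the original implementation):

import subprocess
from tensorflow.keras.callbacks import Callback

class Sync2S3(Callback):
    """Sketch: sync the local TensorBoard log directory to S3 after each epoch."""

    def __init__(self, logdir, s3logdir):
        super(Sync2S3, self).__init__()
        self.logdir = logdir
        self.s3logdir = s3logdir

    def on_epoch_end(self, epoch, logs=None):
        # 'aws s3 sync' only uploads new or changed files, so this stays cheap.
        subprocess.call(['aws', 's3', 'sync', self.logdir, self.s3logdir])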
Example #2
    def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

        Args:
          dataset: A `tf.data.Dataset` that loads raw files.

        Returns:
          A TensorFlow dataset outputting batched images and labels.
        """
        # This can help resolve OOM issues when using only 1 GPU for training
        options = tf.data.Options()
        options.experimental_optimization.map_parallelization = (
            not self.disable_map_parallelization)
        dataset = dataset.with_options(options)

        if self._num_gpus > 1:
            # For multi-host training, we want each host to always process the
            # same subset of files. Each host only sees a subset of the entire
            # dataset, allowing us to cache larger datasets in memory.
            dataset = dataset.shard(self._num_gpus, hvd.rank())

        if self.is_training:
            # Shuffle the input files.
            dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

        if self.is_training and not self._cache:
            dataset = dataset.repeat()

        # Read the data from disk in parallel
        dataset = dataset.interleave(
            tf.data.TFRecordDataset,
            cycle_length=10,
            block_length=1,
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if self._cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self._shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        preprocess = self.parse_record
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        if self._num_gpus > 1:
            # The batch size of the dataset will be multiplied by the number of
            # replicas automatically when strategy.distribute_datasets_from_function
            # is called, so we use local batch size here.
            dataset = dataset.batch(self.local_batch_size,
                                    drop_remainder=self.is_training)
        else:
            dataset = dataset.batch(self.global_batch_size,
                                    drop_remainder=self.is_training)

        # Apply Mixup/CutMix only during training, if requested in the data
        # pipeline; otherwise they are applied in the model module on device.
        mixup_alpha = self.mixup_alpha if self.is_training else 0.0
        cutmix_alpha = self.cutmix_alpha if self.is_training else 0.0
        dataset = dataset.map(functools.partial(mixing, self.local_batch_size,
                                                mixup_alpha, cutmix_alpha,
                                                self.defer_img_mixing),
                              num_parallel_calls=64)

        # Assign static batch size dimension
        # dataset = dataset.map(
        #     functools.partial(self.set_shapes, batch_size),
        #     num_parallel_calls=tf.data.experimental.AUTOTUNE)

        # Prefetch overlaps in-feed with training
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

        return dataset
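The mixing function mapped over batches above is defined elsewhere. A minimal Mixup-only sketch under stated assumptions (one-hot float labels, NHWC float images; the cutmix_alpha and defer_img_mixing paths are ignored here):

import tensorflow as tf

def mixing(batch_size, mixup_alpha, cutmix_alpha, defer_img_mixing, images, labels):
    # Sketch: Mixup only. Assumes `labels` are one-hot floats.
    if mixup_alpha <= 0.0:
        return images, labels
    # Beta(alpha, alpha) samples via a ratio of Gamma draws, one per example.
    g1 = tf.random.gamma([batch_size], mixup_alpha)
    g2 = tf.random.gamma([batch_size], mixup_alpha)
    lam = g1 / (g1 + g2)
    # Pair each example with the batch read in reverse order.
    images = (tf.reshape(lam, [batch_size, 1, 1, 1]) * images
              + tf.reshape(1.0 - lam, [batch_size, 1, 1, 1]) * tf.reverse(images, axis=[0]))
    labels = (tf.reshape(lam, [batch_size, 1]) * labels
              + tf.reshape(1.0 - lam, [batch_size, 1]) * tf.reverse(labels, axis=[0]))
    return images, labels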
Example #3
    opt = hvd.DistributedOptimizer(opt)

    model.compile(loss=keras.losses.sparse_categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy'])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        MyThresholdCallback(threshold=0.05)
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

    start = time()
    # Note: batch_size must not be passed to fit() together with a generator;
    # the generator already yields whole batches.
    model.fit(train_generator,
              epochs=epochs,
              verbose=1,
              callbacks=callbacks,
              validation_data=val_generator)

    print(f"Total training time: {time() - start} seconds")


    score = model.evaluate(test_generator, verbose=0)
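MyThresholdCallback used above is not shown. A minimal sketch that stops training once the monitored quantity falls below the threshold (monitoring val_loss here is an assumption):

from tensorflow import keras

class MyThresholdCallback(keras.callbacks.Callback):
    """Sketch: stop training when `val_loss` drops below `threshold`."""

    def __init__(self, threshold):
        super(MyThresholdCallback, self).__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        val_loss = (logs or {}).get('val_loss')
        if val_loss is not None and val_loss < self.threshold:
            self.model.stop_training = True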
Example #4
    def test_gradient_aggregation(self):
        class TestingOptimizer(optimizer_v2.OptimizerV2):
            """
            Custom optimizer we use for testing gradient aggregation.
            """
            def get_config(self):
                config = super(TestingOptimizer, self).get_config()
                return config

            def _create_slots(self, var_list):
                # Only needed for TF < 2.2.
                pass

            def _resource_apply_dense(self, grad, var, apply_state=None):
                return var.assign_add(grad)

        backward_passes_per_step = 4
        hvd_optimizer = hvd.DistributedOptimizer(
            optimizer=TestingOptimizer("test"),
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=True,
        )
        _ = hvd_optimizer.iterations

        def compute_expected_value(batch_id):
            sum_per_aggregation = 0.0
            for _ in range(backward_passes_per_step):
                grads_for_batch = 0.0
                for rank in range(hvd.size()):
                    grads_for_batch += rank

                # Apply `average_aggregated_gradients`.
                grads_for_batch /= float(backward_passes_per_step)

                # Averages across workers.
                sum_per_aggregation += grads_for_batch / float(hvd.size())

            aggregations_completed = math.floor(
                (batch_id + 1) / backward_passes_per_step)
            return aggregations_completed * sum_per_aggregation

        @tf.function
        def apply_gradients_in_tf_function(gradient_updates, model_variables,
                                           **kwargs):
            # Apply gradient updates in tf.function to reproduce how it is
            # done inside `model.fit()`.
            hvd_optimizer.apply_gradients(
                zip(gradient_updates, model_variables), **kwargs)

        gradients = [tf.constant([float(hvd.rank())])]
        variables = [tf.Variable([0.0])]
        for idx in range(10):
            if _PRE_TF_2_2_0:
                updated_gradients = hvd_optimizer._allreduce(
                    gradients, variables)
                apply_gradients_in_tf_function(updated_gradients, variables)
            elif _PRE_TF_2_4_0:
                # In 2.2 and 2.3 the horovod optimizer sets `_HAS_AGGREGATE_GRAD = True`.
                # This configures tf.keras to call `_aggregate_gradients()` outside of
                # `apply_gradients()` and to set `experimental_aggregate_gradients` to
                # False when calling `apply_gradients()` to prevent it from calling
                # `_aggregate_gradients()` again.
                updated_gradients = hvd_optimizer._aggregate_gradients(
                    zip(gradients, variables))
                apply_gradients_in_tf_function(
                    updated_gradients,
                    variables,
                    experimental_aggregate_gradients=False)
            else:
                raise RuntimeError("This test should be skipped ...")

            updated_variable_value = variables[0][0].numpy()
            assert updated_variable_value == compute_expected_value(idx)
            assert idx + 1 == hvd_optimizer.iterations.numpy()
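To make compute_expected_value concrete: with 2 workers and backward_passes_per_step=4, each aggregation accumulates the rank-sum (0 + 1 = 1) four times; every contribution is divided by 4 (average_aggregated_gradients) and by 2 (the allreduce average), so each completed aggregation adds 4 * (1 / 4) / 2 = 0.5 to the variable. A standalone check of that arithmetic:

import math

def expected(batch_id, size=2, passes=4):
    # Mirrors compute_expected_value above for hvd.size() == size.
    rank_sum = sum(range(size))  # 0 + 1 = 1 for two workers
    per_aggregation = passes * (rank_sum / passes) / size
    return math.floor((batch_id + 1) / passes) * per_aggregation

assert [expected(i) for i in range(8)] == [0.0, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 1.0]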
Example #5
from tensorflow.keras.preprocessing.image import (ImageDataGenerator,
                                                  array_to_img,
                                                  img_to_array, load_img)
from tensorflow.keras import applications, optimizers

from tensorflow.keras.callbacks import TensorBoard

import numpy as np

# Horovod: import
import horovod.tensorflow.keras as hvd

# Horovod: initialize Horovod
hvd.init()

if hvd.rank() == 0:
    print('Using Tensorflow version:', tf.__version__,
          'Keras version:', tf.keras.__version__,
          'backend:', tf.keras.backend.backend())
    print('Using Horovod with', hvd.size(), 'workers')

# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

# ## Data
# 
# The training dataset consists of 2000 images of dogs and cats, split
Example #6
def image_set(filenames,
              batch_size,
              height,
              width,
              training=False,
              distort_color=False,
              num_threads=10,
              nsummary=10,
              deterministic=False,
              use_dali=None,
              idx_filenames=None):
    if use_dali:
        if idx_filenames is None:
            raise ValueError("Must provide idx_filenames if Dali is enabled")

        preprocessor = DALIPreprocessor(
            filenames,
            idx_filenames,
            height,
            width,
            batch_size,
            num_threads,
            dali_cpu=(use_dali == 'CPU'),
            deterministic=deterministic,
            training=training)
        return preprocessor
    else:
        shuffle_buffer_size = 10000
        num_readers = 10
        ds = tf.data.Dataset.from_tensor_slices(filenames)

        # AUTOTUNE can give better perf for non-horovod cases
        thread_config = num_threads

        # shard should be before any randomizing operations
        if training:
            ds = ds.shard(hvd.size(), hvd.rank())

        # read up to num_readers files and interleave their records
        ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=num_readers)

        if training:
            # Improve training performance when training data is in remote storage and
            # can fit into worker memory.
            ds = ds.cache()

        if training:
            # shuffle data before repeating to respect epoch boundaries
            ds = ds.shuffle(shuffle_buffer_size)
            ds = ds.repeat()

        preproc_func = (lambda record: _parse_and_preprocess_image_record(
            record,
            height,
            width,
            deterministic=deterministic,
            random_crop=training,
            distort_color=distort_color))
        ds = ds.map(preproc_func, num_parallel_calls=thread_config)

        ds = ds.batch(batch_size, drop_remainder=True)

        # prefetching
        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        options = tf.data.Options()
        options.experimental_slack = True
        ds = ds.with_options(options)

        return ds
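_parse_and_preprocess_image_record is defined elsewhere in this module. A simplified sketch, assuming the standard ImageNet TFRecord layout ('image/encoded', 'image/class/label') and omitting the random_crop/distort_color paths:

import tensorflow as tf

def _parse_and_preprocess_image_record(record, height, width, deterministic=False,
                                       random_crop=False, distort_color=False):
    # Sketch: decode, resize, and return (image, label); augmentation omitted.
    features = tf.io.parse_single_example(record, {
        'image/encoded': tf.io.FixedLenFeature([], tf.string, ''),
        'image/class/label': tf.io.FixedLenFeature([], tf.int64, -1),
    })
    image = tf.io.decode_jpeg(features['image/encoded'], channels=3)
    image = tf.image.resize(image, [height, width])
    label = tf.cast(features['image/class/label'], tf.int32)
    return image, label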
Example #7
    def run(self):
        """ Training a neural model.

        Step 1: Create training model
        Step 2: Restore checkpoint/pretrain model/global_step if exists.
        Step 3: Fetch training data.
        Step 4: Build training callbacks.
        Step 5: TRAIN!!!
        """
        if self._hvd_backend == "horovod":
            import horovod.tensorflow.keras as hvd
        elif self._hvd_backend == "byteps":
            import byteps.tensorflow.keras as hvd

        tfds = training_utils.build_datasets(compat.ModeKeys.TRAIN, self.strategy,
                                             self.custom_dataset, self.task)
        if isinstance(self.custom_dataset, MultipleDataset):
            _tfds = None
            for _, ds in tfds.items():
                if _tfds is None:
                    _tfds = ds
                else:
                    _tfds = _tfds.concatenate(ds)
            tfds = _tfds
        tfds = tfds.prefetch(tf.data.experimental.AUTOTUNE)
        # Step 1: create a model
        with training_utils.get_strategy_scope(self.strategy):
            inps = self.task.create_inputs(compat.ModeKeys.TRAIN)
            formatted_inps = self.task.example_to_input(inps, compat.ModeKeys.TRAIN)
            model_out = self.model(formatted_inps, is_training=True)
            for metric_layer in self.task.build_metric_layer():
                model_out = metric_layer([formatted_inps, model_out])
            if (LooseVersion(tf.__version__) < LooseVersion("2.3")
                or LooseVersion(tf.__version__) >= LooseVersion("2.5")):
                logging.info(f"Warning: Need further check on AccumgradKerasModel when TF version={tf.__version__}. "
                             f"Here we ignore update_cycle={self._update_cycle}, "
                             f"clip_value={self._clip_value}, clip_norm={self._clip_norm}.")
                keras_model = tf.keras.Model(inps, model_out)
            elif compat.IS_PREV_TF_2_4_0:
                from neurst.training.gradaccum_keras_model import TF23GradAccumKerasModel
                keras_model = TF23GradAccumKerasModel(inps, model_out,
                                                      update_cycle=self._update_cycle,
                                                      clip_value=self._clip_value,
                                                      clip_norm=self._clip_norm,
                                                      freeze_variables=self._freeze_variables)
            else:
                keras_model = GradAccumKerasModel(inps, model_out,
                                                  update_cycle=self._update_cycle,
                                                  clip_value=self._clip_value,
                                                  clip_norm=self._clip_norm,
                                                  freeze_variables=self._freeze_variables)

            loss = self._criterion.reduce_loss(formatted_inps, model_out)
            if compat.is_tf_tensor(loss) or isinstance(loss, (list, tuple)):
                keras_model.add_loss(loss)
            elif isinstance(loss, dict):
                for _name, _loss in loss.items():
                    keras_model.add_loss(_loss)
                    keras_model.add_metric(_loss, name=_name + "_mean", aggregation="mean")
            else:
                raise ValueError("criterion.reduce_loss returns "
                                 "unsupported value of type: {}".format(type(loss)))
            self._restore_ckpt_or_pretrain()
            self._lr_schedule = build_lr_schedule(self._lr_schedule_args)
            if self._pruning_schedule is not None:
                self._optimizer = create_pruning_optimizer(self._optimizer, self.model, self._pruning_schedule,
                                                           pruning_variable_pattern=self._pruning_variable_pattern,
                                                           nopruning_variable_pattern=self._nopruning_variable_pattern,
                                                           keep_prune_property=True)
            self._optimizer = training_utils.handle_fp16_and_distributed_optimizer(
                self._optimizer, self._lr_schedule, self._hvd_backend)
            if self._hvd_backend is None:
                keras_model.compile(self._optimizer)
            else:
                # NOTE: we already add Horovod DistributedOptimizer in `_handle_fp16_and_distributed_optimizer`.
                # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
                # uses hvd.DistributedOptimizer() to compute gradients.
                keras_model.compile(self._optimizer, experimental_run_tf_function=False)
            keras_model.summary()
            summary_model_variables(self.model, self._freeze_variables)
        # initialize the checkpoint manager
        _ = compat.get_saver_or_default(self.model, self.model_dir, max_to_keep=self._checkpoints_max_to_keep)
        # build training callbacks
        if not self._tb_log_dir:
            self._tb_log_dir = os.path.join(self.model_dir, "train")

        training_callbacks = [MetricReductionCallback(self.strategy, self._summary_steps, self._tb_log_dir,
                                                      device="GPU:0", lr_schedule=self._lr_schedule)]
        if self._hvd_backend is None or hvd.rank() == 0:
            training_callbacks.append(
                CustomCheckpointCallback(self.task.model_configs(self.model),
                                         save_checkpoint_steps=self._save_checkpoint_steps))
            if self._validator is not None:
                training_callbacks.append(self._validator.build(self.strategy, self.task, self.model))
        if self._hvd_backend is not None:
            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard or other metrics-based callbacks.
            # NOTE!!! HERE we already integrate the metric averaging behaviour into the MetricReductionCallback.
            # training_callbacks.insert(0, hvd.callbacks.MetricAverageCallback(device="GPU:0"))

            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            training_callbacks.insert(0, hvd.callbacks.BroadcastGlobalVariablesCallback(0, device="GPU:0"))
            if self._lr_schedule is not None:
                training_callbacks.append(LearningRateScheduler(self._lr_schedule))

        if self._experimental_count_batch_num:
            logging.info("Scanning the dataset......")
            iterator = iter(training_utils.maybe_distribution_dataset(self.strategy, tfds))
            cnt = 0
            for _ in iterator:
                cnt += 1
            logging.info(f"Total {cnt} batches per EPOCH.")

        history = keras_model.fit(
            map_data_for_keras(tfds.repeat()),
            initial_epoch=0,
            epochs=1,
            steps_per_epoch=self._train_steps,  # * args["update_cycle"],
            verbose=2,
            callbacks=training_callbacks)
        logging.info(history.history)
Example #8
import tensorflow as tf
tf.compat.v1.disable_eager_execution()

from bert.dataset import create_masked_input_dataset
from bert.layers import (PositionEmbedding, Attention, Transformer,
                         TokenEmbedding, Bias, gelu,
                         masked_sparse_cross_entropy_loss,
                         InverseSquareRootSchedule, initializer, Projection)

import horovod.tensorflow.keras as hvd

# Horovod: initialize Horovod.
hvd.init()

# Print runtime config on head node
if hvd.rank() == 0:
    print(arguments)

# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

# import tensorflow_addons as tfa
from tensorflow.keras import layers

vocab_size = 22
max_seq_len = 1024
Example #9
def main(args):
    # Hyper-parameters
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer

    # SageMaker options
    gpu_count = args.gpu_count
    training_dir = args.train
    validation_dir = args.validation
    eval_dir = args.eval
    tensorboard_logs = args.tensorboard_logs

    # Change 2: Initialize horovod and get the size of the cluster
    hvd.init()
    size = hvd.size()

    # Change 3 - Pin GPU to local process (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    train_dataset = get_dataset(training_dir + '/train.tfrecords', batch_size)
    val_dataset = get_dataset(validation_dir + '/validation.tfrecords',
                              batch_size)
    eval_dataset = get_dataset(eval_dir + '/eval.tfrecords', batch_size)

    input_shape = (HEIGHT, WIDTH, DEPTH)

    # Change 6: Add callbacks for syncing initial state, and saving checkpoints only on 1st worker (rank 0)
    callbacks = []
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    callbacks.append(hvd.callbacks.MetricAverageCallback())
    callbacks.append(
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
    callbacks.append(
        tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
    if hvd.rank() == 0:
        callbacks.append(
            ModelCheckpoint(args.output_data_dir + '/checkpoint-{epoch}.h5'))
        logdir = args.output_data_dir + '/' + datetime.now().strftime(
            "%Y%m%d-%H%M%S")
        callbacks.append(TensorBoard(log_dir=logdir))
        callbacks.append(Sync2S3(logdir=logdir, s3logdir=tensorboard_logs))

    # Use the ResNet model (rather than a custom model):
    model = get_resnet_model(input_shape, lr, weight_decay, optimizer,
                             momentum, hvd)

    # Train model
    # Change 7: Update the number of steps/epoch
    history = model.fit(
        train_dataset,
        steps_per_epoch=(NUM_TRAIN_IMAGES // batch_size) // size,
        validation_data=val_dataset,
        validation_steps=(NUM_VALID_IMAGES // batch_size) // size,
        verbose=1 if hvd.rank() == 0 else 0,
        epochs=epochs,
        callbacks=callbacks)

    # Evaluate model performance
    score = model.evaluate(eval_dataset,
                           steps=NUM_TEST_IMAGES // args.batch_size,
                           verbose=0)

    print('Test loss    :', score[0])
    print('Test accuracy:', score[1])

    if hvd.rank() == 0:
        save_history(args.output_data_dir + "/hvd_history.p", history)
Example #10
def generate_cae(zero_pad_train,
                 zero_pad_test,
                 preproc,
                 dim_1,
                 dim_2,
                 train_mode,
                 hvd_mode=False):
    # Shuffle
    idx_train = np.arange(np.shape(zero_pad_train)[0])
    np.random.shuffle(idx_train)
    zero_pad_train = zero_pad_train[idx_train]

    # Just keeping a few aside for validation - due to memory limitations
    zero_pad_valid = zero_pad_train[-5:]
    zero_pad_train = zero_pad_train[:-5]

    idx_test = np.arange(np.shape(zero_pad_test)[0])
    np.random.shuffle(idx_test)
    zero_pad_test = zero_pad_test[idx_test]

    # CNN training stuff
    weights_filepath = "../CAE_Training/cae_best_weights.h5"
    lrate = 0.001

    # Get CAE model
    model, encoder, _ = cae_model()

    # design network
    my_adam = optimizers.Adam(lr=lrate,
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=False)

    if hvd_mode:
        my_adam = hvd.DistributedOptimizer(my_adam)

    earlystopping = EarlyStopping(monitor='val_loss',
                                  min_delta=0,
                                  patience=20,
                                  verbose=0,
                                  mode='auto',
                                  baseline=None,
                                  restore_best_weights=False)
    callbacks_list = [earlystopping]

    if hvd_mode:
        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),

            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),

            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=3,
                                                     verbose=1),
        ]

        # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
        if hvd.rank() == 0:
            callbacks_list = callbacks + callbacks_list
            checkpoint = ModelCheckpoint(weights_filepath,
                                         monitor='val_loss',
                                         verbose=1,
                                         save_best_only=True,
                                         mode='min',
                                         save_weights_only=True)
            callbacks_list.append(checkpoint)

        # Horovod: write logs on worker 0.
        verbose = 1 if hvd.rank() == 0 else 0
        model.compile(optimizer=my_adam,
                      loss='mean_squared_error',
                      metrics=[coeff_determination],
                      experimental_run_tf_function=False)
    else:
        model.compile(optimizer=my_adam,
                      loss='mean_squared_error',
                      metrics=[coeff_determination])
        checkpoint = ModelCheckpoint(weights_filepath,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min',
                                     save_weights_only=True)
        callbacks_list.append(checkpoint)

    model.summary()
    num_epochs = num_epochs_space

    # fit network
    if train_mode:

        if hvd_mode:
            train_history = model.fit(x=zero_pad_train, y=zero_pad_train,
                                      epochs=num_epochs,
                                      callbacks=callbacks_list,
                                      batch_size=batchsize_space,
                                      verbose=verbose,
                                      validation_data=(zero_pad_valid, zero_pad_valid))

            if hvd.rank() == 0:
                model.load_weights(weights_filepath)

                idx_train = sorted(range(len(idx_train)),
                                   key=lambda k: idx_train[k])
                idx_test = sorted(range(len(idx_test)),
                                  key=lambda k: idx_test[k])

                # Rejoin train and valid
                zero_pad_train = np.concatenate(
                    (zero_pad_train, zero_pad_valid), axis=0)
                zero_pad_train = zero_pad_train[idx_train]
                zero_pad_test = zero_pad_test[idx_test]

                # Call to save latent space representation
                save_latent_space(model, encoder, zero_pad_train,
                                  zero_pad_test, preproc, dim_1, dim_2)

        else:
            train_history = model.fit(x=zero_pad_train, y=zero_pad_train,
                                      epochs=num_epochs,
                                      callbacks=callbacks_list,
                                      batch_size=batchsize_space,
                                      validation_data=(zero_pad_valid, zero_pad_valid))

            model.load_weights(weights_filepath)

            idx_train = sorted(range(len(idx_train)),
                               key=lambda k: idx_train[k])
            idx_test = sorted(range(len(idx_test)), key=lambda k: idx_test[k])

            # Rejoin train and valid
            zero_pad_train = np.concatenate((zero_pad_train, zero_pad_valid),
                                            axis=0)
            zero_pad_train = zero_pad_train[idx_train]
            zero_pad_test = zero_pad_test[idx_test]

            # Call to save latent space representation
            save_latent_space(model, encoder, zero_pad_train, zero_pad_test,
                              preproc, dim_1, dim_2)

    return model
Example #11
                        help='Integer. Number of epochs to train the model.',
                        default=3)
    parser.add_argument('--steps_per_epoch',
                        metavar='STEPS',
                        type=int,
                        help='Total number of steps (batches of samples)',
                        default=1000)
    return parser.parse_args()


def main(epochs, steps_per_epoch, hvd_rank=0, hvd_size=1):
    model = define_model()
    dataset = get_train_dataset(hvd_rank=hvd_rank, hvd_size=hvd_size)
    trained_model = train(model, dataset, epochs, steps_per_epoch, hvd_rank,
                          hvd_size)
    test_images, test_labels = get_test_dataset()
    trained_model.evaluate(test_images, test_labels, verbose=1)
    if hvd_rank == 0:
        trained_model.save('result')


if __name__ == '__main__':
    args = parse_args()
    hvd.init()
    main(
        epochs=args.epochs,
        steps_per_epoch=args.steps_per_epoch,
        hvd_rank=hvd.rank(),
        hvd_size=hvd.size(),
    )
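define_model, get_train_dataset, and train are defined elsewhere in this script. A minimal sketch of get_train_dataset that shards MNIST across Horovod workers so each rank sees a distinct subset (batch size and buffer sizes are assumptions):

import tensorflow as tf

def get_train_dataset(hvd_rank=0, hvd_size=1, batch_size=128):
    # Sketch: each worker takes every hvd_size-th example, starting at its rank.
    (images, labels), _ = tf.keras.datasets.mnist.load_data()
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(images[..., tf.newaxis] / 255.0, tf.float32),
         tf.cast(labels, tf.int64)))
    dataset = dataset.shard(hvd_size, hvd_rank)
    return dataset.shuffle(10000).repeat().batch(batch_size)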
Example #12
def train_fn(compute_config: TfDataServiceConfig, reuse_dataset: bool = False, round_robin: bool = False):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

    with tf_data_service(compute_config, hvd.rank()) as dispatcher_address:
        # this lock guarantees only one training task downloads the dataset
        with FileLock(os.path.expanduser("~/.horovod_lock")):
            (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()

        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
             tf.cast(mnist_labels, tf.int64))
        )

        # Allow tf.data service to pre-process the pipeline
        dataset = dataset.repeat() \
            .shuffle(10000) \
            .batch(128) \
            .apply(tf.data.experimental.service.distribute(
                service=dispatcher_address,
                processing_mode="distributed_epoch",
                job_name='job' if reuse_dataset else None,
                consumer_index=hvd.rank() if round_robin else None,
                num_consumers=hvd.size() if round_robin else None)) \
            .prefetch(tf.data.experimental.AUTOTUNE)

        mnist_model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
            tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(10, activation='softmax')
        ])

        # Horovod: adjust learning rate based on number of GPUs.
        scaled_lr = 0.001 * hvd.size()
        opt = tf.optimizers.Adam(scaled_lr)

        # Horovod: add Horovod DistributedOptimizer.
        opt = hvd.DistributedOptimizer(
            opt, backward_passes_per_step=1, average_aggregated_gradients=True)

        # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
        # uses hvd.DistributedOptimizer() to compute gradients.
        mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                            optimizer=opt,
                            metrics=['accuracy'],
                            experimental_run_tf_function=False)

        callbacks = [
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
        ]

        # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
        if hvd.rank() == 0:
            callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

        # Horovod: write logs on worker 0.
        verbose = 1 if hvd.rank() == 0 else 0

        # Train the model.
        # Horovod: adjust number of steps based on number of GPUs.
        mnist_model.fit(dataset, steps_per_epoch=32 // hvd.size(), callbacks=callbacks, epochs=24, verbose=verbose)
Example #13
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),

    # Horovod: average metrics among workers at the end of every epoch.
    #
    # Note: This callback must be in the list before the ReduceLROnPlateau,
    # TensorBoard or other metrics-based callbacks.
    hvd.callbacks.MetricAverageCallback(),

    # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
    # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
    # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
    # hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=1, initial_lr=scaled_lr, verbose=1),
]

# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
# if hvd.rank() == 0:
#     callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

# Horovod: write logs on worker 0.
verbose = 1 if hvd.rank() == 0 else 0
# Horovod: validation on worker 0 only could be enabled with
# `validation_data = (test_images, test_labels) if hvd.rank() == 0 else None`;
# it is disabled here.
validation_data = None

# Train the model.
# Horovod: adjust number of steps based on number of GPUs.
# steps_per_epoch=500 // hvd.size(),
model.fit(train_images, train_labels, validation_data=validation_data,
          epochs=3, batch_size=batch_size,
          steps_per_epoch=batches_total // hvd.size(),
          callbacks=callbacks, verbose=verbose)
Example #14
    def step(self,
             data_creator,
             epochs=1,
             verbose=1,
             callbacks=None,
             validation_data_creator=None,
             class_weight=None,
             steps_per_epoch=None,
             validation_steps=None,
             validation_freq=1,
             data_config=None):
        """Runs a training epoch and updates the model parameters."""
        config = self.config.copy()
        if data_config is not None:
            config.update(data_config)
        # process datasets
        if self.backend == "horovod":
            import horovod.tensorflow.keras as hvd
            assert "batch_size" in config, "batch_size must be set in config"
            config["batch_size"] = config["batch_size"] // hvd.size()
            train_dataset = data_creator(config)
            if validation_data_creator is not None:
                test_dataset = validation_data_creator(config)
            else:
                test_dataset = None
            from tensorflow.python.distribute.input_ops import auto_shard_dataset
            train_dataset = auto_shard_dataset(train_dataset, hvd.size(),
                                               hvd.rank())
            if test_dataset is not None:
                test_dataset = auto_shard_dataset(test_dataset, hvd.size(),
                                                  hvd.rank())
        elif self.backend == "tf-distributed":
            with self.strategy.scope():
                train_dataset = data_creator(config)
                if validation_data_creator is not None:
                    test_dataset = validation_data_creator(config)
                else:
                    test_dataset = None
        else:
            train_dataset = data_creator(config)
            if validation_data_creator is not None:
                test_dataset = validation_data_creator(config)
            else:
                test_dataset = None

        # process other arguments
        if self.backend == "horovod":
            import horovod.tensorflow.keras as hvd
            hvd_callbacks = [
                hvd.callbacks.BroadcastGlobalVariablesCallback(0),
                hvd.callbacks.MetricAverageCallback()
            ]
            if hvd.rank() != 0:
                verbose = 0

            if callbacks is not None:
                callbacks = hvd_callbacks + callbacks
            else:
                callbacks = hvd_callbacks
        elif self.backend == "tf-distributed":
            if self.strategy.cluster_resolver.task_id != 0:
                verbose = 0

        history = self.model.fit(train_dataset,
                                 epochs=self.epoch + epochs,
                                 verbose=verbose,
                                 callbacks=callbacks,
                                 validation_data=test_dataset,
                                 class_weight=class_weight,
                                 initial_epoch=self.epoch,
                                 steps_per_epoch=steps_per_epoch,
                                 validation_steps=validation_steps,
                                 validation_freq=validation_freq)
        if history is None:
            stats = {}
        else:
            stats = {"train_" + k: v[-1] for k, v in history.history.items()}

        self.epoch += epochs
        return stats
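The data_creator passed into step() is user-supplied. A minimal sketch of one that honours config["batch_size"], with shapes and data source chosen only for illustration:

import numpy as np
import tensorflow as tf

def data_creator(config):
    # Sketch: build a batched tf.data.Dataset from in-memory arrays.
    x = np.random.rand(1024, 32).astype('float32')
    y = np.random.randint(0, 10, size=(1024,)).astype('int64')
    return tf.data.Dataset.from_tensor_slices((x, y)).batch(config["batch_size"])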
Example #15
    def on_epoch_end(self, epoch, logs=None):
        epoch_run_time = time.time() - self.epoch_start
        if hvd.rank() == 0:
            print("epoch: %d time_taken: %.1f" % (epoch, epoch_run_time))
Example #16
def run(config):
    seed = config["seed"]
    if seed is not None:
        np.random.seed(seed)
        if tf.__version__ == "1.13.1":
            tf.random.set_random_seed(seed)
        else:
            tf.compat.v2.random.set_seed(seed)

    load_config(config)

    input_shape, output_shape = setup_data(config)

    search_space = setup_search_space(config,
                                      input_shape,
                                      output_shape,
                                      seed=seed)

    # Initialize Horovod
    hvd.init()

    model_created = False
    try:
        model = search_space.create_model()
        model_created = True
    except Exception:
        logger.info("Error: Model creation failed...")
        logger.info(traceback.format_exc())

    if model_created:

        # Setup callbacks only
        callbacks = []
        cb_requires_valid = False  # Callbacks requires validation data
        callbacks_config = config["hyperparameters"].get("callbacks")
        if callbacks_config is not None:
            for cb_name, cb_conf in callbacks_config.items():
                if cb_name in default_callbacks_config:
                    # cb_name in hvd_root_cb implies hvd.rank() == 0
                    if not (cb_name in hvd_root_cb) or hvd.rank() == 0:
                        default_callbacks_config[cb_name].update(cb_conf)

                        # Special dynamic parameters for callbacks
                        if cb_name == "ModelCheckpoint":
                            default_callbacks_config[cb_name][
                                "filepath"] = f'best_model_{config["id"]}.h5'

                        # Import and create corresponding callback
                        Callback = getattr(keras.callbacks, cb_name)
                        callbacks.append(
                            Callback(**default_callbacks_config[cb_name]))

                        if cb_name in ["EarlyStopping"]:
                            cb_requires_valid = "val" in cb_conf[
                                "monitor"].split("_")
                else:
                    logger.error(f"'{cb_name}' is not an accepted callback!")

        trainer = HorovodTrainerTrainValid(config=config, model=model)
        callbacks.append(
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        trainer.callbacks.extend(callbacks)

        last_only, with_pred = preproc_trainer(config)
        last_only = last_only and not cb_requires_valid

        history = trainer.train(with_pred=with_pred, last_only=last_only)

        result = compute_objective(config["objective"], history)
    else:
        # penalising actions if model cannot be created
        result = -1
    if result < -10:
        result = -10
    return result
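default_callbacks_config and hvd_root_cb come from the surrounding module. A plausible sketch of their shape (the exact entries are assumptions); hvd_root_cb names the callbacks that must only run on the Horovod root rank:

# Sketch of the structures assumed by run() above.
default_callbacks_config = {
    "EarlyStopping": dict(monitor="val_loss", min_delta=0, patience=10,
                          verbose=0, mode="min"),
    "ModelCheckpoint": dict(monitor="val_loss", mode="min", save_best_only=True,
                            verbose=1, filepath=None),  # filepath filled in dynamically
    "TensorBoard": dict(log_dir="logs"),
}
# Checkpointing and logging should happen on rank 0 only.
hvd_root_cb = ["ModelCheckpoint", "TensorBoard"]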
Example #17
def train(model_func, params):
    image_width = params['image_width']
    image_height = params['image_height']
    image_format = params['image_format']
    distort_color = params['distort_color']
    momentum = params['momentum']
    loss_scale = params['loss_scale']
    data_dir = params['data_dir']
    data_idx_dir = params['data_idx_dir']
    batch_size = params['batch_size']
    num_iter = params['num_iter']
    iter_unit = params['iter_unit']
    log_dir = params['log_dir']
    export_dir = params['export_dir']
    tensorboard_dir = params['tensorboard_dir']
    display_every = params['display_every']
    precision = params['precision']
    dali_mode = params['dali_mode']
    use_xla = params['use_xla']

    if data_dir is not None:
        file_format = os.path.join(data_dir, '%s-*')
        train_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_files = sorted(tf.io.gfile.glob(file_format % 'validation'))
        num_train_samples = common.get_num_records(train_files)
        num_valid_samples = common.get_num_records(valid_files)
    else:
        num_train_samples = 1281982
        num_valid_samples = 5000

    train_idx_files = None
    valid_idx_files = None
    if data_idx_dir is not None:
        file_format = os.path.join(data_idx_dir, '%s-*')
        train_idx_files = sorted(tf.io.gfile.glob(file_format % 'train'))
        valid_idx_files = sorted(tf.io.gfile.glob(file_format % 'validation'))

    if iter_unit.lower() == 'epoch':
        num_epochs = num_iter
        nstep_per_epoch = num_train_samples // (batch_size * hvd.size())
        nstep_per_valid = num_valid_samples // (batch_size * hvd.size())
    else:
        assert iter_unit.lower() == 'batch'
        num_epochs = 1
        nstep_per_epoch = min(num_iter,
                              num_train_samples // (batch_size * hvd.size()))
        nstep_per_valid = min(10,
                              num_valid_samples // (batch_size * hvd.size()))

    initial_epoch = 0
    if log_dir:
        # We save checkpoints only when using the real data.
        assert data_dir, "--data_dir cannot be empty when using --log_dir"
        assert os.path.exists(log_dir)
        ckpt_format = log_dir + "/model-{epoch:02d}-{val_top1:.2f}.hdf5"
        # Looks for the most recent checkpoint and sets the initial epoch from it.
        for filename in os.listdir(log_dir):
            if filename.startswith('model-'):
                initial_epoch = max(int(re.findall(r'\d+', filename)[0]),
                                    initial_epoch)

    if tensorboard_dir:
        assert os.path.exists(tensorboard_dir)

    if export_dir:
        assert os.path.exists(export_dir)
        save_format = export_dir + "/saved_model_rn50.h5"

    if use_xla:
        tf.config.optimizer.set_jit(True)

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

    if precision == 'fp16':
        if tf.__version__ >= "2.4.0":
            policy = keras.mixed_precision.Policy('mixed_float16')
            keras.mixed_precision.set_global_policy(policy)
        else:
            policy = keras.mixed_precision.experimental.Policy(
                'mixed_float16', loss_scale)
            keras.mixed_precision.experimental.set_policy(policy)

    lr_schedule = common.create_piecewise_constant_decay_with_warmup(
        batch_size=batch_size * hvd.size(),
        epoch_size=num_train_samples,
        warmup_epochs=common.LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in common.LR_SCHEDULE),
        compute_lr_on_cpu=True)
    opt = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=momentum)
    # Horovod: add Horovod DistributedOptimizer. We use a modified version to
    # support the custom learning rate schedule.
    opt = hvd.DistributedOptimizer(opt)
    if tf.__version__ >= "2.4.0" and precision == 'fp16':
        opt = keras.mixed_precision.LossScaleOptimizer(
            opt, dynamic=False, initial_scale=loss_scale)

    backend.set_image_data_format(image_format)
    dtype = 'float16' if precision == 'fp16' else 'float32'
    backend.set_floatx(dtype)
    model = model_func(num_classes=image_processing.NUM_CLASSES)
    loss_func = 'sparse_categorical_crossentropy'

    top5 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top5')
    top1 = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1, name='top1')

    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients. However, this option
    # disables the overlap of data loading and compute, and can hurt performance
    # if the model is not under a distribution strategy scope.
    model.compile(optimizer=opt,
                  loss=loss_func,
                  metrics=[top1, top5],
                  experimental_run_tf_function=False)

    training_hooks = []
    training_hooks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    training_hooks.append(_ProfileKerasFitCallback(batch_size, display_every))

    if log_dir and hvd.rank() == 0:
        ckpt_callback = keras.callbacks.ModelCheckpoint(
            ckpt_format,
            monitor='val_top1',
            verbose=1,
            save_best_only=False,
            save_weights_only=False,
            save_freq='epoch')
        training_hooks.append(ckpt_callback)

    if tensorboard_dir and hvd.rank() == 0:
        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=tensorboard_dir)
        training_hooks.append(tensorboard_callback)

    if data_dir is not None:
        num_preproc_threads = params['dali_threads'] if dali_mode else 10
        train_input = image_processing.image_set(
            train_files,
            batch_size,
            image_height,
            image_width,
            training=True,
            distort_color=distort_color,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=train_idx_files)

        valid_input = image_processing.image_set(
            valid_files,
            batch_size,
            image_height,
            image_width,
            training=False,
            distort_color=False,
            deterministic=False,
            num_threads=num_preproc_threads,
            use_dali=dali_mode,
            idx_filenames=valid_idx_files)
        if dali_mode:
            train_input = train_input.get_device_dataset()
            valid_input = valid_input.get_device_dataset()
        valid_params = {
            'validation_data': valid_input,
            'validation_steps': nstep_per_valid,
            'validation_freq': 1
        }
    else:
        train_input = image_processing.fake_image_set(batch_size, image_height,
                                                      image_width)
        valid_params = {}

    try:
        verbose = 2 if hvd.rank() == 0 else 0
        model.fit(train_input,
                  epochs=num_epochs,
                  callbacks=training_hooks,
                  steps_per_epoch=nstep_per_epoch,
                  verbose=verbose,
                  initial_epoch=initial_epoch,
                  **valid_params)
    except KeyboardInterrupt:
        print("Keyboard interrupt")

    if export_dir and hvd.rank() == 0:
        model.save(save_format)
        print(f"The model is saved to {save_format}")
Example #18
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(10, activation='softmax')
])

opt = hvd.DistributedOptimizer(tf.optimizers.Adam(0.01),
                               backward_passes_per_step=1,
                               average_aggregated_gradients=True)

model.compile(optimizer=opt,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]

if hvd.rank() == 0:
    callbacks.append(
        tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

datagen = ImageDataGenerator(horizontal_flip=True)
model.fit(datagen.flow(x_train, y_train, batch_size=8),
          callbacks=callbacks,
          epochs=3,
          verbose=(hvd.rank() == 0))

if hvd.rank() == 0:
    test_data = np.load('data_test.npz')
    x_test, y_test = test_data['x_test'], test_data['y_test']
    preds = model.predict(x_test)
    acc_score = accuracy_score(y_test[:, 0], np.argmax(preds, axis=1))
    print(f'Model accuracy is {acc_score}')
Example #19
def main(args):
    # Horovod: initialize Horovod.
    hvd.init()

    if not args.use_only_cpu:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    else:
        config = None

    K.set_session(tf.Session(config=config))

    batch_size = 128
    num_classes = 10

    # Horovod: adjust number of epochs based on number of GPUs.
    epochs = int(math.ceil(args.num_epochs / hvd.size()))

    # Input image dimensions
    img_rows, img_cols = 28, 28

    # The data, shuffled and split between train and test sets
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    if K.image_data_format() == "channels_first":
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")
    x_train /= 255
    x_test /= 255
    print("x_train shape:", x_train.shape)
    print(x_train.shape[0], "train samples")
    print(x_test.shape[0], "test samples")

    # Convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model = Sequential()
    model.add(
        Conv2D(32,
               kernel_size=(3, 3),
               activation="relu",
               input_shape=input_shape))
    model.add(Conv2D(64, (3, 3), activation="relu"))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation="softmax"))

    # Horovod: adjust learning rate based on number of GPUs.
    opt = keras.optimizers.Adadelta(1.0 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=["accuracy"])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0)
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(
                os.path.join(args.model_dir, "checkpoint-{epoch}.h5")))

    model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        callbacks=callbacks,
        epochs=epochs,
        verbose=1 if hvd.rank() == 0 else 0,
        validation_data=(x_test, y_test),
    )
    score = model.evaluate(x_test, y_test, verbose=0)
    print("Test loss:", score[0])
    print("Test accuracy:", score[1])
Example #20
    def get_gradients(self, loss, params):
        assert len(params) == 1
        return [tf.constant([float(hvd.rank())])]
Example #21
def main(args):

    mpi = False
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'model',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))
    if 'sagemaker_mpi_enabled' in args.fw_params:
        if args.fw_params['sagemaker_mpi_enabled']:
            import horovod.tensorflow.keras as hvd
            mpi = True
            # Horovod: initialize Horovod.
            hvd.init()

            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            config.gpu_options.visible_device_list = str(hvd.local_rank())
            K.set_session(tf.Session(config=config))
    else:
        hvd = None

    logging.info("Running with MPI={}".format(mpi))
    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay,
                           args.optimizer, args.momentum, mpi, hvd)
    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1))
        callbacks.append(
            tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
    # Checkpoints and TensorBoard logs are written by a single process only:
    # rank 0 under MPI, or the sole process otherwise.
    if not mpi or hvd.rank() == 0:
        callbacks.append(
            ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(CustomTensorBoardCallback(log_dir=tensorboard_dir))
    logging.info("Starting training")
    size = 1
    if mpi:
        size = hvd.size()
    model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size) //
        size,
        epochs=args.epochs,
        validation_data=validation_dataset,
        validation_steps=(num_examples_per_epoch('validation') //
                          args.batch_size) // size,
        callbacks=callbacks)

    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=num_examples_per_epoch('eval') //
                           args.batch_size,
                           verbose=0)

    logging.info('Test loss: {}'.format(score[0]))
    logging.info('Test accuracy: {}'.format(score[1]))

    # Horovod: save the model only on worker 0 (i.e. the master).
    if not mpi or hvd.rank() == 0:
        return save_model(model, args.model_output_dir)
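
save_model is not defined in this fragment. In the SageMaker-style scripts this snippet resembles, it usually exports a TensorFlow SavedModel under a numbered version directory for TF Serving; a minimal sketch under that assumption (TF 2.x's model.save, with os and logging assumed imported):

def save_model(model, output_dir):
    # Sketch (assumption): export a SavedModel under a numeric version
    # directory, the layout TensorFlow Serving expects.
    export_path = os.path.join(output_dir, '1')
    model.save(export_path, save_format='tf')
    logging.info('Model exported to {}'.format(export_path))
    return export_path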
Example #22
    def test_elastic_state(self):
        with self.test_session(config=self.config) as sess:
            K.set_session(sess)

            v = 1.0 if hvd.rank() == 0 else 2.0
            model1 = tf.keras.Sequential(
                [tf.keras.layers.Dense(2, activation='softmax')])
            model1.build((2, 2))
            model1.set_weights([
                np.array([[v, v], [v, v]], dtype=np.float32),
                np.array([v, v], dtype=np.float32)
            ])

            model2 = tf.keras.Sequential(
                [tf.keras.layers.Dense(2, activation='softmax')])
            model2.build((2, 2))
            model2.set_weights([
                np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
                np.array([0.0, 0.0], dtype=np.float32)
            ])

            optimizer = tf.keras.optimizers.Adam(0.001 * hvd.size())

            state = hvd.elastic.KerasState(model1,
                                           optimizer,
                                           batch=20 + hvd.rank(),
                                           epoch=10 + hvd.rank())
            state.sync()

            model1_weights = model1.get_weights()
            model2_weights = model2.get_weights()

            # After sync, all values should match the root rank
            for w in state.model.get_weights():
                self.assertAllClose(w, np.ones_like(w))
            assert state.batch == 20
            assert state.epoch == 10

            # Partially modify then restore
            model1.set_weights(model2_weights)
            state.batch = 21
            state.epoch = 11

            state.restore()

            for w1, w2 in zip(model1.get_weights(), model1_weights):
                self.assertAllClose(w1, w2)
            assert state.batch == 20
            assert state.epoch == 10

            # Partially modify then commit
            model1.set_weights(model2_weights)
            state.batch = 21
            state.epoch = 11

            state.commit()
            state.restore()

            for w1, w2 in zip(model1.get_weights(), model2_weights):
                self.assertAllClose(w1, w2)
            assert state.batch == 21
            assert state.epoch == 11
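
The commit/restore semantics tested above are what elastic training relies on: hvd.elastic.run restores the last committed state whenever the worker set changes. A minimal usage sketch (model, optimizer, dataset, and epochs are assumed to exist):

state = hvd.elastic.KerasState(model, optimizer, batch=0, epoch=0)

@hvd.elastic.run
def train(state):
    # Re-entered after a membership change, resuming from the last commit.
    model.fit(dataset,
              initial_epoch=state.epoch,
              epochs=epochs,
              callbacks=[hvd.elastic.CommitStateCallback(state),
                         hvd.elastic.UpdateEpochStateCallback(state)])

train(state)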
Example #23
# The opening of this snippet was truncated; the imports and parser setup are
# reconstructed from the arguments used below (an assumption, not the original).
import argparse

import tensorflow as tf
import horovod.tensorflow.keras as hvd

hvd.init()

parser = argparse.ArgumentParser()
parser.add_argument('--device',
                    default='cpu',
                    help='Whether this is running on cpu or gpu')
parser.add_argument('--num_inter',
                    default=2,
                    help='set number inter',
                    type=int)
parser.add_argument('--num_intra',
                    default=0,
                    help='set number intra',
                    type=int)

args = parser.parse_args()

print("I am rank %s of %s" % (hvd.rank(), hvd.size()))
# Horovod: pin GPU to be used to process local rank (one GPU per process)
if args.device == 'cpu':
    tf.config.threading.set_intra_op_parallelism_threads(args.num_intra)
    tf.config.threading.set_inter_op_parallelism_threads(args.num_inter)
else:
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')

(mnist_images, mnist_labels), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist.npz')
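
A script with these flags is launched with one Horovod process per slot; a hypothetical two-host CPU run (script name and host slots assumed):

horovodrun -np 8 -H host1:4,host2:4 python mnist_cpu.py --device cpu --num_inter 2 --num_intra 4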
Example #24
def train(model,
          train_images,
          train_annotations,
          input_height=None,
          input_width=None,
          n_classes=None,
          verify_dataset=True,
          checkpoints_path=None,
          epochs=5,
          batch_size=2,
          validate=False,
          val_images=None,
          val_annotations=None,
          auto_resume_checkpoint=False,
          load_weights=None,
          steps_per_epoch=None,
          val_steps_per_epoch=None,
          gen_use_multiprocessing=False,
          ignore_zero_class=False,
          optimizer_name='adam',
          do_augment=False,
          augmentation_name="aug_all",
          data_type='fp32',
          tb_location=None,
          deterministic=False,
          model_dir=None,
          dump_config=None,
          distributed=False,
          use_upsampling=False,
          loss_type=0,
          train_engine='hpu',
          not_cached=False):

    if train_engine == 'hpu':
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()
        print("Loaded HPU modules")
        from TensorFlow.common.debug import dump_callback
        # For Habana Model runner hooks
        from TensorFlow.common.tb_utils import (TensorBoardWithHParamsV2,
                                                ExamplesPerSecondKerasHookV2)
    else:

        class dump_callback(object):
            def __init__(self, file_name):
                pass

            def __enter__(self):
                pass

            def __exit__(self, type, value, traceback):
                pass

    if data_type == 'bf16' and train_engine == 'hpu':
        bf16_json = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 '../bf16_segnet.json')
        os.environ['TF_BF16_CONVERSION'] = os.environ.get(
            'TF_BF16_CONVERSION', bf16_json)
        print("Setting BF16:", os.getenv('TF_BF16_CONVERSION'))

    shard_id = 0
    num_shards = 1

    if distributed:
        import horovod.tensorflow.keras as hvd
        print("hvd init")
        hvd.init()
        if train_engine == 'gpu':
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')
            print("Set memory growth for GPUS")

        shard_id = hvd.rank()
        num_shards = hvd.size()
        if num_shards == 1:
            print("Distributed training was requested, but Horovod "
                  "initialization reported a world size of 1")
            exit()

    print("num_shards: " + str(num_shards) + " shard_id: " + str(shard_id))

    from keras_segmentation.models.all_models import model_from_name
    # check if user gives model name instead of the model object
    if isinstance(model, six.string_types):
        # create the model from the name
        assert (n_classes is not None), "Please provide the n_classes"
        if (input_height is not None) and (input_width is not None):
            model = model_from_name[model](n_classes,
                                           input_height=input_height,
                                           input_width=input_width,
                                           batch_size=batch_size,
                                           use_upsampling=use_upsampling,
                                           loss_type=loss_type)
        else:
            model = model_from_name[model](n_classes,
                                           batch_size=batch_size,
                                           use_upsampling=use_upsampling,
                                           loss_type=loss_type)

    #model.save('my_segnet_model.h5')
    n_classes = model.n_classes
    input_height = model.input_height
    input_width = model.input_width
    output_height = model.output_height
    output_width = model.output_width

    if steps_per_epoch is None:
        steps_per_epoch = len(
            os.listdir(train_images)) // (batch_size * num_shards)
    # Guard on val_images: with the defaults (validate=False, val_images=None)
    # the original os.listdir(None) call would raise immediately.
    if val_steps_per_epoch is None and val_images is not None:
        val_steps_per_epoch = len(os.listdir(val_images)) // batch_size

    print("Steps per epoch: " + str(steps_per_epoch))

    def optimized_xent_loss_custom_grad(ytrue, ypred):
        @tf.custom_gradient
        def loss_without_mean(ytrue, ypred):
            with tf.name_scope("softmax_cross_entropy"):
                logits_t = tf.transpose(ypred,
                                        perm=(0, 1, 3, 2),
                                        name="logits_t")  # BS H N W
                reduce_max = tf.reduce_max(logits_t, 2,
                                           name="reduce_max")  # BS H W
                max_logits = tf.expand_dims(reduce_max, 3)  # BS H W 1
                shifted_logits = tf.subtract(ypred,
                                             max_logits,
                                             name="shifted_logits")  # BS H W N
                exp_shifted_logits = tf.math.exp(
                    shifted_logits, name="exp_shifted_logits")  # BS H W N
                reduce_sum_filter = tf.fill([1, 1, n_classes, 1], 1.0)
                sum_exp = tf.nn.conv2d(exp_shifted_logits,
                                       reduce_sum_filter,
                                       strides=1,
                                       padding="VALID",
                                       name="sum_exp")  # BS H W 1
                log_sum_exp = tf.math.log(sum_exp,
                                          name="log_sum_exp")  # BS H W 1
                shifted_logits2 = tf.nn.conv2d(
                    shifted_logits * ytrue,
                    reduce_sum_filter,
                    strides=1,
                    padding="VALID",
                    name="shifted_logits2")  # BS H W 1
                loss = tf.subtract(log_sum_exp,
                                   shifted_logits2,
                                   name="loss/sub")  # BS H W 1

                def custom_grad(dy):  # dy is BS H W 1
                    with tf.name_scope("gradients/softmax_cross_entropy"):
                        div = tf.math.truediv(exp_shifted_logits,
                                              sum_exp,
                                              name="div")  # BS H W N
                        sub = tf.math.subtract(div, ytrue,
                                               name="sub")  # BS H W N
                        ret = tf.math.multiply(sub, dy, name="mul")
                    return -dy * shifted_logits, ret

                return loss, custom_grad

        return tf.math.reduce_mean(loss_without_mean(ytrue, ypred))

    if validate:
        assert val_images is not None
        assert val_annotations is not None

    if optimizer_name is not None:

        if ignore_zero_class:
            loss_k = masked_categorical_crossentropy
        elif loss_type == 1:
            loss_k = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)
        elif loss_type == 2:
            loss_k = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        else:
            loss_k = optimized_xent_loss_custom_grad

        print(optimizer_name)
        if num_shards > 1:
            # LearningRate is assumed to be a module-level constant. Wrap the
            # optimizer so gradients are allreduced across shards; reuse
            # optimizer_name so model.compile() below picks up the wrapper.
            optimizer_name = hvd.DistributedOptimizer(Adam(lr=LearningRate))

        model.compile(loss=loss_k,
                      optimizer=optimizer_name,
                      metrics=['accuracy'])

    if checkpoints_path is not None:
        with open(checkpoints_path + "_config.json", "w") as f:
            json.dump(
                {
                    "model_class": model.model_name,
                    "n_classes": n_classes,
                    "input_height": input_height,
                    "input_width": input_width,
                    "output_height": output_height,
                    "output_width": output_width
                }, f)

    if load_weights is not None and len(load_weights) > 0:
        print("Loading weights from ", load_weights)
        status = model.load_weights(load_weights)
        print(status)

    if auto_resume_checkpoint and (checkpoints_path is not None):
        latest_checkpoint = find_latest_checkpoint(checkpoints_path)
        if latest_checkpoint is not None:
            print("Loading the weights from latest checkpoint ",
                  latest_checkpoint)
            model.load_weights(latest_checkpoint)

    if verify_dataset:
        print("Verifying training dataset")
        verified = verify_segmentation_dataset(train_images, train_annotations,
                                               n_classes, deterministic)
        assert verified
        if validate:
            print("Verifying validation dataset")
            verified = verify_segmentation_dataset(val_images, val_annotations,
                                                   n_classes, deterministic)
            assert verified

    if not_cached:
        train_gen = image_segmentation_generator(
            train_images,
            train_annotations,
            batch_size,
            n_classes,
            input_height,
            input_width,
            output_height,
            output_width,
            deterministic,
            do_augment=do_augment,
            augmentation_name=augmentation_name,
            num_shards=num_shards,
            shard_id=shard_id,
            loss_type=loss_type)
    else:
        train_gen = image_segmentation_generator(
            train_images,
            train_annotations,
            1,
            n_classes,
            input_height,
            input_width,
            output_height,
            output_width,
            deterministic,
            do_augment=do_augment,
            augmentation_name=augmentation_name,
            num_shards=num_shards,
            shard_id=shard_id,
            loss_type=loss_type)

        train_gen = cached_image_generator(train_gen, num_shards, shard_id,
                                           batch_size,
                                           len(os.listdir(train_images)),
                                           deterministic)

    callbacks = []

    if num_shards > 1:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())

    callbacks.append(CheckpointsCallback(checkpoints_path))
    #if shard_id == 0:
    #    callbacks.append(ModelCheckpoint( self.checkpoints_path, monitor='loss', verbose=2, mode='min', save_best_only=True, save_weights_only=True))

    if model_dir is not None:
        hparams = {
            "model_name": model,
            "optimizer": optimizer_name,
            "batch_size": batch_size
        }

        if train_engine == 'hpu':
            callbacks += [
                TensorBoardWithHParamsV2(hparams,
                                         log_dir=model_dir,
                                         update_freq=5),
                ExamplesPerSecondKerasHookV2(5,
                                             batch_size=batch_size,
                                             output_dir=model_dir)
            ]

    if tb_location != '':
        tensorboard_callback = TensorBoard(log_dir=tb_location,
                                           histogram_freq=1)
        callbacks.append(tensorboard_callback)
        print("TB:", tb_location)

    if not validate:
        with dump_callback(dump_config):
            start_compilation = time.time()
            # One throwaway step so graph compilation is excluded from timing.
            model.fit(train_gen, steps_per_epoch=1, epochs=1)
            stop_compilation = time.time()
            history = model.fit(train_gen,
                                steps_per_epoch=steps_per_epoch,
                                epochs=epochs,
                                callbacks=callbacks,
                                verbose=1 if shard_id == 0 else 0)
            stop_training = time.time()
        with open('./trainHistoryDict_' + str(shard_id), 'wb') as file_pi:
            pickle.dump(history.history, file_pi)
        avg_time_per_batch = (stop_training -
                              stop_compilation) / (steps_per_epoch * epochs)
        print('Compile time in seconds:',
              (stop_compilation - start_compilation))
        print('Average time per batch in seconds (leaving out compilation):',
              avg_time_per_batch)
        print('Average time per image in seconds (leaving out compilation)',
              avg_time_per_batch / batch_size)
        print('Average images per sec (leaving out compilation):',
              batch_size / avg_time_per_batch)

        if loss_type == 1:
            print('Eval for LOSS_FUNC_TYPE=1 is WIP')
            exit()

        if shard_id == 0:
            if not_cached:
                val_gen = image_segmentation_generator(val_images,
                                                       val_annotations,
                                                       batch_size,
                                                       n_classes,
                                                       input_height,
                                                       input_width,
                                                       output_height,
                                                       output_width,
                                                       deterministic,
                                                       num_shards=1,
                                                       shard_id=shard_id,
                                                       loss_type=loss_type)
            else:
                val_gen = image_segmentation_generator(val_images,
                                                       val_annotations,
                                                       1,
                                                       n_classes,
                                                       input_height,
                                                       input_width,
                                                       output_height,
                                                       output_width,
                                                       deterministic,
                                                       num_shards=1,
                                                       shard_id=shard_id,
                                                       loss_type=loss_type)
                val_gen = cached_image_generator(val_gen, 1, 0, batch_size,
                                                 len(os.listdir(val_images)),
                                                 deterministic)
            f1_metric = FBetaScore(num_classes=n_classes)
            model.compile(loss=model.loss,
                          metrics=[
                              tf.keras.metrics.CategoricalAccuracy(
                                  name="categorical_accuracy", dtype=None),
                              f1_metric
                          ])
            test_loss, test_acc, test_f1 = model.evaluate(
                val_gen, steps=(len(os.listdir(val_images)) // batch_size))
            train_loss, train_acc, train_f1 = model.evaluate(
                train_gen, steps=(len(os.listdir(train_images)) // batch_size))
            print(
                f'test loss : {test_loss}, test accuracy : {test_acc}, test f1 : {test_f1}'
            )
            print(
                f'train loss : {train_loss}, train accuracy : {train_acc}, train f1 : {train_f1}'
            )

    else:
        assert num_shards == 1, \
            "Training with validation is only supported in a single-HPU setup"
        if not_cached:
            val_gen = image_segmentation_generator(val_images,
                                                   val_annotations,
                                                   batch_size,
                                                   n_classes,
                                                   input_height,
                                                   input_width,
                                                   output_height,
                                                   output_width,
                                                   deterministic,
                                                   num_shards=num_shards,
                                                   shard_id=shard_id,
                                                   loss_type=loss_type)
        else:
            val_gen = image_segmentation_generator(val_images,
                                                   val_annotations,
                                                   1,
                                                   n_classes,
                                                   input_height,
                                                   input_width,
                                                   output_height,
                                                   output_width,
                                                   deterministic,
                                                   num_shards=num_shards,
                                                   shard_id=shard_id,
                                                   loss_type=loss_type)
            val_gen = cached_image_generator(val_gen, num_shards, shard_id,
                                             batch_size,
                                             len(os.listdir(val_images)),
                                             deterministic)

        start_compilation = time.time()
        # One throwaway step so graph compilation is excluded from timing.
        model.fit(train_gen, steps_per_epoch=1, epochs=1)
        stop_compilation = time.time()
        model.fit(train_gen,
                  steps_per_epoch=steps_per_epoch,
                  validation_data=val_gen,
                  validation_steps=val_steps_per_epoch,
                  epochs=epochs,
                  callbacks=callbacks,
                  use_multiprocessing=gen_use_multiprocessing,
                  verbose=1 if shard_id == 0 else 0)
        stop_training = time.time()
        avg_time_per_batch = (stop_training -
                              stop_compilation) / (steps_per_epoch * epochs)
        print('Compile time in seconds:',
              (stop_compilation - start_compilation))
        print('Average time per batch in seconds (leaving out compilation):',
              avg_time_per_batch)
        print('Average time per image in seconds (leaving out compilation)',
              avg_time_per_batch / batch_size)
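
Because optimized_xent_loss_custom_grad re-derives softmax cross-entropy with conv-based reductions, its forward value should match the stock Keras loss, which makes a cheap parity test. A sketch, assuming the loss is hoisted out of train() (still closing over n_classes) and that targets are one-hot:

import numpy as np

bs, h, w = 2, 8, 8
n = n_classes  # the hoisted loss closes over n_classes
logits = tf.random.normal((bs, h, w, n))
labels = tf.one_hot(np.random.randint(0, n, size=(bs, h, w)), n)

ref = tf.keras.losses.CategoricalCrossentropy(from_logits=True)(labels, logits)
custom = optimized_xent_loss_custom_grad(labels, logits)
# Forward values should agree; only the hand-written backward pass differs.
np.testing.assert_allclose(custom.numpy(), ref.numpy(), rtol=1e-5)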
Example #25
import tensorflow as tf
import horovod.tensorflow.keras as hvd

# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

(mnist_images, mnist_labels), _ = \
    tf.keras.datasets.mnist.load_data(path='mnist-%d.npz' % hvd.rank())

dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
             tf.float32), tf.cast(mnist_labels, tf.int64)))
dataset = dataset.repeat().shuffle(10000).batch(128)

mnist_model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
    tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(10, activation='softmax')
])
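
The snippet breaks off here; in the canonical Horovod Keras MNIST example, the continuation looks roughly like the sketch below (an assumption reconstructed from that example, not the missing source):

# Scale the learning rate by world size, wrap the optimizer, broadcast the
# initial state from rank 0, and let only rank 0 write checkpoints.
opt = tf.keras.optimizers.Adam(0.001 * hvd.size())
opt = hvd.DistributedOptimizer(opt)

mnist_model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                    optimizer=opt,
                    metrics=['accuracy'],
                    experimental_run_tf_function=False)

callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
if hvd.rank() == 0:
    callbacks.append(
        tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

mnist_model.fit(dataset,
                steps_per_epoch=500 // hvd.size(),
                callbacks=callbacks,
                epochs=24,
                verbose=1 if hvd.rank() == 0 else 0)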
Example #26
import horovod.tensorflow.keras as hvd
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

from argparse import ArgumentParser
from datetime import datetime
from time import perf_counter
from tqdm import tqdm
import json
import os

# Get the rank information
hvd.init()
rank = hvd.rank()
size = hvd.size()

# Hard-coded paths for data and model
_data_path = os.path.join('..', '..', 'data', 'output', 'water_clusters.proto')
_model_path = os.path.join('..', 'model.h5')

if __name__ == "__main__":
    # Parse the arguments
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--batch-sizes',
                            '-b',
                            nargs='*',
                            default=[32],
                            help='Batch size for each rank',
                            type=int)
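
The body of this benchmarking script is not included. Given the imports and the --batch-sizes argument above, a plausible continuation loads the model and times inference per batch size on each rank; a sketch (a fixed model input shape and random inputs are assumptions):

    args = arg_parser.parse_args()
    model = load_model(_model_path)

    # Hypothetical timing loop: one measurement per requested batch size,
    # with progress and results reported from rank 0 only.
    timings = {}
    for batch_size in tqdm(args.batch_sizes, disable=rank != 0):
        x = tf.random.normal((batch_size,) + tuple(model.input_shape[1:]))
        start = perf_counter()
        model.predict(x, batch_size=batch_size)
        timings[batch_size] = perf_counter() - start

    if rank == 0:
        print(json.dumps(timings))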
Example #27
ROOT = '/mnt/bb/$USERID'

# Build the Keras model (weights=None, i.e. trained from scratch)
model = tf.keras.applications.DenseNet169(weights=None,
                                          include_top=True,
                                          input_shape=(IMG_SIZE, IMG_SIZE, 3),
                                          classes=2)

# compile the model
model.compile(loss="categorical_crossentropy",
              optimizer=hvd.DistributedOptimizer(
                  tf.keras.optimizers.Adam(lr=LRATE * hvd.size())),
              metrics=["accuracy"],
              experimental_run_tf_function=False)

if hvd.rank() == 0:
    print(model.summary())

verbose = 1 if hvd.rank() == 0 else 0

cbs = [
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    hvd.callbacks.MetricAverageCallback(),
    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=WUP,
                                             verbose=verbose),
    # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=WUP,
                                               end_epoch=WUP + 10,
                                               multiplier=1.),
    hvd.callbacks.LearningRateScheduleCallback(start_epoch=WUP + 10,
                                               end_epoch=WUP + 20,
                                               multiplier=1e-1),
    # The source is truncated here; the standard Horovod schedule continues
    # with multipliers 1e-2 and 1e-3 for the later epoch ranges.
]
Example #28
    def train_fn(model_bytes):
        # Make sure pyarrow is referenced before anything else to avoid segfault due to conflict
        # with TensorFlow libraries.  Use `pa` package reference to ensure it's loaded before
        # functions like `deserialize_model` which are implemented at the top level.
        # See https://jira.apache.org/jira/browse/ARROW-3346
        pa

        import atexit
        import horovod.tensorflow.keras as hvd
        import os
        from petastorm import make_batch_reader
        from petastorm.tf_utils import make_petastorm_dataset
        import tempfile
        import tensorflow as tf
        import tensorflow.keras.backend as K
        import shutil

        # Horovod: initialize Horovod inside the trainer.
        hvd.init()

        # Horovod: pin GPU to be used to process local rank (one GPU per process), if GPUs are available.
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))

        # Horovod: restore from checkpoint, use hvd.load_model under the hood.
        model = deserialize_model(model_bytes, hvd.load_model)

        # Horovod: adjust learning rate based on number of processes.
        scaled_lr = K.get_value(model.optimizer.lr) * hvd.size()
        K.set_value(model.optimizer.lr, scaled_lr)

        # Horovod: print summary logs on the first worker.
        verbose = 2 if hvd.rank() == 0 else 0

        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0),

            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard, or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),

            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     initial_lr=scaled_lr,
                                                     verbose=verbose),

            # Reduce LR if the metric is not improved for 10 epochs, and stop training
            # if it has not improved for 20 epochs.
            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_exp_rmspe',
                                                 patience=10,
                                                 verbose=verbose),
            tf.keras.callbacks.EarlyStopping(monitor='val_exp_rmspe',
                                             mode='min',
                                             patience=20,
                                             verbose=verbose),
            tf.keras.callbacks.TerminateOnNaN()
        ]

        # Model checkpoint location.
        ckpt_dir = tempfile.mkdtemp()
        ckpt_file = os.path.join(ckpt_dir, 'checkpoint.h5')
        atexit.register(lambda: shutil.rmtree(ckpt_dir))

        # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
        if hvd.rank() == 0:
            callbacks.append(
                tf.keras.callbacks.ModelCheckpoint(ckpt_file,
                                                   monitor='val_exp_rmspe',
                                                   mode='min',
                                                   save_best_only=True))

        # Make Petastorm readers.
        with make_batch_reader(
                '%s/train_df.parquet' % args.data_dir,
                num_epochs=None,
                cur_shard=hvd.rank(),
                shard_count=hvd.size(),
                hdfs_driver=PETASTORM_HDFS_DRIVER) as train_reader:
            with make_batch_reader(
                    '%s/val_df.parquet' % args.data_dir,
                    num_epochs=None,
                    cur_shard=hvd.rank(),
                    shard_count=hvd.size(),
                    hdfs_driver=PETASTORM_HDFS_DRIVER) as val_reader:
                # Convert readers to tf.data.Dataset.
                train_ds = make_petastorm_dataset(train_reader) \
                    .apply(tf.data.experimental.unbatch()) \
                    .shuffle(int(train_rows / hvd.size())) \
                    .batch(args.batch_size) \
                    .map(lambda x: (tuple(getattr(x, col) for col in all_cols), tf.log(x.Sales)))

                val_ds = make_petastorm_dataset(val_reader) \
                    .apply(tf.data.experimental.unbatch()) \
                    .batch(args.batch_size) \
                    .map(lambda x: (tuple(getattr(x, col) for col in all_cols), tf.log(x.Sales)))

                history = model.fit(
                    train_ds,
                    validation_data=val_ds,
                    steps_per_epoch=int(train_rows / args.batch_size /
                                        hvd.size()),
                    validation_steps=int(val_rows / args.batch_size /
                                         hvd.size()),
                    callbacks=callbacks,
                    verbose=verbose,
                    epochs=args.epochs)

        # Dataset API usage currently displays a wall of errors upon termination.
        # This global model registration ensures clean termination.
        # Tracked in https://github.com/tensorflow/tensorflow/issues/24570
        globals()['_DATASET_FINALIZATION_HACK'] = model

        if hvd.rank() == 0:
            with open(ckpt_file, 'rb') as f:
                return history.history, f.read()
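
In the Horovod-on-Spark example this function matches, train_fn is shipped to the Spark executors with horovod.spark.run, and only rank 0 returns a value; a sketch of the call site (num_proc assumed):

    import horovod.spark

    # Index [0] picks up rank 0's (history, serialized best model) tuple;
    # the other ranks return None.
    history, best_model_bytes = horovod.spark.run(
        train_fn, args=(model_bytes,), num_proc=num_proc, verbose=2)[0]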
Example #29
    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    model.compile(loss=tf.keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy'])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

    model.fit(x_train, y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=1,
              validation_data=(x_test, y_test))
    score = model.evaluate(x_test, y_test, verbose=0)

    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Horovod: save the model only on worker 0 (i.e. the master).
    if hvd.rank() == 0:
        model.save('model.h5')  # save path assumed; the original body was lost


# The fragment below lost the opening of its TFRecord-decoding helper; the
# parse step is reconstructed as an assumption (standard ImageNet feature keys).
def decode(serialized_example):
    features = tf.io.parse_single_example(
        serialized_example,
        {'image/encoded': tf.io.FixedLenFeature([], tf.string),
         'image/class/label': tf.io.FixedLenFeature([], tf.int64)})
    image = tf.image.decode_jpeg(features['image/encoded'], channels=3)
    image = tf.image.resize(image, (224, 224))
    label = tf.cast(features['image/class/label'], tf.int64)
    label = tf.one_hot(label, 1001)
    return image, label


data_dir = '/scratch/snx3000/stud50/imagenet/'
list_of_files = [os.path.join(data_dir, f) for f in os.listdir(data_dir)]

dataset = tf.data.Dataset.list_files(list_of_files)
dataset = dataset.interleave(tf.data.TFRecordDataset,
                             cycle_length=120,
                             block_length=1)
dataset = dataset.map(decode)
dataset = dataset.batch(128)
dataset = dataset.shard(hvd.size(), hvd.rank())

model = tf.keras.applications.InceptionV3(weights=None,
                                          input_shape=(224, 224, 3),
                                          classes=1001)

optimizer = tf.keras.optimizers.SGD(lr=0.01, momentum=0.9)
optimizer = hvd.DistributedOptimizer(optimizer)

model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

hvd_callback = hvd.callbacks.BroadcastGlobalVariablesCallback(0)

fit = model.fit(dataset, epochs=1, callbacks=[hvd_callback])
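
One caveat in the pipeline above: dataset.shard is applied after interleave, map, and batch, so every rank still reads and decodes the full dataset and then keeps only every size()-th batch. Sharding at the file level up front avoids that redundant I/O; a sketch:

# Shard by file before any decoding so each worker reads only its subset.
dataset = tf.data.Dataset.list_files(list_of_files, shuffle=False)
dataset = dataset.shard(hvd.size(), hvd.rank())
dataset = dataset.interleave(tf.data.TFRecordDataset,
                             cycle_length=8, block_length=1)
dataset = dataset.map(decode, num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = dataset.batch(128).prefetch(tf.data.experimental.AUTOTUNE)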