Example #1
    def initialize(self):
        # init_op = tf.initialize_all_variables()
        # init_op = tf.global_variables_initializer()
        # sess = tf.Session()
        # sess.run(init_op)

        # Check if GPUs are available
        # if tf.test.is_gpu_available():  # commented out since this test will cause a new session to be created
        # allow growth
        # config = tf.compat.v1.ConfigProto()
        # config.gpu_options.per_process_gpu_memory_fraction = 1
        # config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
        # # config.log_device_placement = True  # to log device placement (on which device the operation ran)
        # sess = tf.compat.v1.Session(config=config)
        # tf.compat.v1.keras.backend.set_session(sess)  # set this TensorFlow session as the default session for Keras

        # Create logger
        self.logger = logging.getLogger('DeepGalaxyTrain')
        self.logger.setLevel(self.log_level)
        self.logger.addHandler(logging.FileHandler('train_log.txt'))
        if self.distributed_training is True:
            try:
                import horovod.tensorflow.keras as hvd
                # initialize horovod
                hvd.init()
                self.callbacks.append(
                    hvd.callbacks.BroadcastGlobalVariablesCallback(0))
                self.callbacks.append(hvd.callbacks.MetricAverageCallback())
                # self.callbacks = [hvd.BroadcastGlobalVariablesHook(0)]
                if hvd.rank() == 0:
                    self.logger.info('Parallel training enabled.')
                    self.logger.info(
                        'batch_size = %d, global_batch_size = %d, num_workers = %d\n'
                        % (self.batch_size, self.batch_size * hvd.size(),
                           hvd.size()))

                # Map an MPI process to a GPU (Important!)
                print('hvd_rank = %d, hvd_local_rank = %d' %
                      (hvd.rank(), hvd.local_rank()))
                self.logger.info('hvd_rank = %d, hvd_local_rank = %d' %
                                 (hvd.rank(), hvd.local_rank()))

                # Bind a CUDA device to one MPI process (has no effect if GPUs are not used)
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

                # Horovod: pin GPU to be used to process local rank (one GPU per process)
                gpus = tf.config.experimental.list_physical_devices('GPU')
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
                # if gpus:
                # tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
            except ImportError:
                print(
                    'Error importing horovod. Disabling distributed training.')
                self.distributed_training = False
        else:
            self.logger.info('Parallel training disabled.')
            self.logger.info('Batch_size = %d' % (self.batch_size))
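The commented-out block at the top shows the TF1 ConfigProto route; the active code pins each MPI process to a GPU through CUDA_VISIBLE_DEVICES plus memory growth. A minimal standalone sketch of the TF2-style pinning this example gestures at (the same pattern appears in later examples on this page):

import horovod.tensorflow.keras as hvd
import tensorflow as tf

hvd.init()
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    # one GPU per process, selected by local rank
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')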
Example #2
def construct_dataset(filenames,
                      batch_size,
                      n_epochs,
                      sample_shape,
                      rank=0,
                      n_ranks=1,
                      shard=True,
                      shuffle=False,
                      local_fs=False,
                      shuffle_buffer_size=128):
    # Define the dataset from the list of files
    data = tf.data.Dataset.from_tensor_slices(filenames)
    if (shard and local_fs):
        local_rank = int(hvd.local_rank())
        local_size = int(hvd.local_size())
        data = data.shard(num_shards=local_size, index=local_rank)
    elif (shard):
        data = data.shard(num_shards=n_ranks, index=rank)
    if shuffle:
        data = data.shuffle(len(filenames), reshuffle_each_iteration=True)
    # Parse TFRecords
    parse_data = partial(_parse_data, shape=sample_shape)
    data = data.apply(tf.data.TFRecordDataset).map(parse_data,
                                                   num_parallel_calls=4)
    # Localized sample shuffling (note: imperfect global shuffling)
    if shuffle:
        data = data.shuffle(shuffle_buffer_size)
    data = data.repeat(n_epochs)
    data = data.batch(batch_size, drop_remainder=True)
    return data.prefetch(4)
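A hypothetical call under Horovod, assuming hvd.init() has run, _parse_data is defined, and filenames lists TFRecord files (the shapes and sizes here are illustrative):

dataset = construct_dataset(filenames,
                            batch_size=32,
                            n_epochs=10,
                            sample_shape=(64, 64, 1),
                            rank=hvd.rank(),
                            n_ranks=hvd.size(),
                            shard=True,
                            shuffle=True)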
Example #3
def _get_hooks(is_distributed=_DISTRIBUTED, verbose=1):
    logger = _get_logger()
    if is_distributed:
        logger.info("Rank: {} Cluster Size {}".format(hvd.local_rank(),
                                                      hvd.size()))
        return [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard, or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),
            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(
                warmup_epochs=_WARMUP_EPOCHS, verbose=verbose),
            # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
            hvd.callbacks.LearningRateScheduleCallback(
                start_epoch=_WARMUP_EPOCHS, end_epoch=30, multiplier=1.0),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=30,
                                                       end_epoch=60,
                                                       multiplier=1e-1),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=60,
                                                       end_epoch=80,
                                                       multiplier=1e-2),
            hvd.callbacks.LearningRateScheduleCallback(start_epoch=80,
                                                       multiplier=1e-3),
        ]
    else:
        return []
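A sketch of how these hooks feed into training; model, x_train, and y_train are placeholders, and _DISTRIBUTED, _WARMUP_EPOCHS, and hvd.init() are assumed to be set up as the snippet expects:

callbacks = _get_hooks(is_distributed=True)
model.fit(x_train, y_train,
          epochs=90,
          callbacks=callbacks,
          verbose=1 if hvd.rank() == 0 else 0)  # only rank 0 prints progress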
Example #4
    def __init__(self, *args, **kwargs):
        super(KerasTests, self).__init__(*args, **kwargs)
        warnings.simplefilter('module')
        hvd.init()

        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.config.gpu_options.visible_device_list = str(hvd.local_rank())
Example #5
    def init_horovod(self):
        # Horovod: initialize Horovod.
        hvd.init()

        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
Example #6
def _get_runconfig(is_distributed=_DISTRIBUTED):
    if is_distributed:
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())
    else:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
    return config
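The returned ConfigProto still has to be attached to a session; a minimal sketch of that step in the TF1 style used throughout this page (K is the Keras backend, as elsewhere on this page):

config = _get_runconfig(is_distributed=True)
K.set_session(tf.Session(config=config))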
Example #7
def train_evaluate():

    # Generate training and validation data generators
    def get_image_list(data_dir):
        dataset = []
        for folder in os.listdir(data_dir):
            for image in os.listdir(os.path.join(data_dir, folder)):
                dataset.append((os.path.join(data_dir, folder, image), folder))
        return dataset

    training_data = ImageSequence(get_image_list(os.path.join(FLAGS.data_dir, 'train')), FLAGS.batch_size, True)
    validation_data = ImageSequence(get_image_list(os.path.join(FLAGS.data_dir, 'test')), FLAGS.batch_size, False)

    # Horovod: Initialize Horovod
    hvd.init()

    # Horovod: Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.keras.backend.set_session(tf.Session(config=config))

    # Create a model
    model = network_model(FLAGS.hidden_units)
    loss = 'categorical_crossentropy'

    # Horovod: Adjust learning rate based on number of GPUs
    optimizer = Adadelta(lr=1.0 * hvd.size())
    # Horovod: add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(optimizer)

    metrics = ['acc']
    model.compile(optimizer, loss, metrics)
  
    # Set up callbacks
    callbacks = [
        # Broadcast initial variable states from rank 0 to all other processes
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    
    # Horovod: save logs only on worker 0
    if hvd.rank() == 0:
        callbacks.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.log_dir))

    # Start training
    model.fit_generator(generator=training_data,
                        validation_data=validation_data,
                        epochs=FLAGS.epochs,
                        use_multiprocessing=True,
                        workers=4,
                        callbacks=callbacks,
                        verbose=1)

    # Save the model
    model.save(FLAGS.save_model_path)
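One caveat, hedged: model.save above runs on every worker, so all ranks write to the same path concurrently. Several other examples on this page guard persistence behind rank 0, e.g.:

# save only on worker 0 to avoid concurrent writes to the same file
if hvd.rank() == 0:
    model.save(FLAGS.save_model_path)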
Example #8
def _main():
    hvd.init()
    better_exceptions.MAX_LENGTH = 128
    _MODELS_DIR.mkdir(parents=True, exist_ok=True)
    logger = tk.log.get()
    logger.addHandler(tk.log.stream_handler())
    if hvd.rank() == 0:
        logger.addHandler(
            tk.log.file_handler(_MODELS_DIR / 'train.log', append=True))
    with tk.dl.session(
            gpu_options={'visible_device_list': str(hvd.local_rank())}):
        _run()
Example #9
def init(global_batch_size, max_gpu_batch_size, gpus=runai.utils.gpus.count()):
    if gpus < 1:
        raise ValueError('GPU count (%d) must be at least 1' % gpus)

    module = sys.modules[__name__]

    setattr(module, 'global_batch_size', global_batch_size)
    setattr(module, 'gpus', gpus)
    setattr(module, 'master', True)

    # TODO(levosos): support uneven dividing
    steps = max(1, global_batch_size //
                (max_gpu_batch_size * gpus))  # must be at least 1
    batch_size = global_batch_size // (steps * gpus)

    setattr(module, 'steps', steps)
    setattr(module, 'batch_size', batch_size)

    runai.utils.log.info(
        'Spreading global batch size %d across %d GPU(s) each with %d step(s) of batch size %d',
        global_batch_size, gpus, steps, batch_size)

    if gpus > 1:
        runai.utils.log.debug('Initializing Horovod')
        import horovod.keras as hvd
        hvd.init()
        setattr(module, 'master', hvd.local_rank() == 0)
        setattr(module, 'hvd',
                hvd)  # so that Horovod is easily accessible to everyone

        runai.utils.log.debug('Attaching Keras session to GPU #%d',
                              hvd.local_rank())
        import tensorflow
        config = tensorflow.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        import keras.backend
        keras.backend.set_session(
            tensorflow.Session(config=config)
        )  # TODO(levosos): support cases where configuration will be set afterwards
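A worked instance of the splitting arithmetic above, assuming the division is even (the TODO notes that uneven dividing is unsupported):

global_batch_size, max_gpu_batch_size, gpus = 256, 32, 4
steps = max(1, global_batch_size // (max_gpu_batch_size * gpus))  # -> 2 accumulation steps
batch_size = global_batch_size // (steps * gpus)                  # -> 32 per GPU per step
assert steps * gpus * batch_size == global_batch_size             # 2 * 4 * 32 == 256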
Example #10
def main(args):
    # =========== changed ============= #
    import horovod.keras as hvd
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    
    logging.info("getting data")
    
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()
    
    
    # =============== changed ======================= #
    logging.info("configuring model")
    model = keras_model_fn(args.learning_rate, args.weight_decay, args.optimizer, args.momentum, hvd)
    callbacks = []

    # =============== changed ======================= #
    # callbacks.append(ModelCheckpoint(args.model_dir + '/checkpoint-{epoch}.h5'))
    callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
    
    # =============== changed ======================= #
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    callbacks.append(hvd.callbacks.MetricAverageCallback())
    callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))
    
    # =============== changed ======================= #
    if hvd.rank() == 0:
        callbacks.append(ModelCheckpoint(args.model_output_dir + '/checkpoint-{epoch}.h5'))
        callbacks.append(TensorBoard(log_dir=args.model_dir, update_freq='epoch'))

    logging.info("Starting training")
    model.fit(x=train_dataset[0], y=train_dataset[1],
              steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size),
              epochs=args.epochs, validation_data=validation_dataset,
              validation_steps=(num_examples_per_epoch('validation') // args.batch_size), callbacks=callbacks)

    score = model.evaluate(eval_dataset[0], eval_dataset[1], steps=num_examples_per_epoch('eval') // args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # =============== changed ======================= #
    # return save_model(model, args.model_dir)
    return save_model(model, args.model_output_dir)
Example #11
def perform_setup(options):

    import numpy as np
    import sys
    import keras
    import keras.backend as K
    import tensorflow as tf

    sys.setrecursionlimit(5000)

    if options.with_hvd:
        import horovod.keras as hvd
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        if options.gpu > 1:
            devlist = '0'
            for i in range(1,options.gpu):
                devlist += ','+str(i)
            config.gpu_options.visible_device_list = devlist
        else:
            config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))
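    # Note: the block below runs unconditionally and creates a fresh session,
    # replacing the Horovod-pinned session configured in the branch above.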
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.25
    K.set_session(tf.Session(config=config))


    global _globalnpfile
    global _globalexpectedpixel
    global IMG_DTYPE
    global SEG_DTYPE
    global FLOAT_DTYPE
    global _nx
    global _ny


    # raw dicom data is usually short int (2bytes) datatype
    # labels are usually uchar (1byte)
    IMG_DTYPE = np.int16
    SEG_DTYPE = np.uint8
    FLOAT_DTYPE = np.float32

    # _globalnpfile = options.dbfile.replace('.csv','%d.npy' % options.trainingresample )
    # _globalexpectedpixel=512
    _nx = options.trainingresample
    _ny = options.trainingresample

    return IMG_DTYPE, SEG_DTYPE, _nx, _ny
Example #12
def init(global_batch_size, max_gpu_batch_size, gpus=None):
    # first of all calculate the number of GA steps and the batch size
    runai.elastic._init(global_batch_size, max_gpu_batch_size, gpus)

    # now use Horovod if needed
    if runai.elastic.gpus > 1:
        runai.utils.log.debug('Initializing Horovod')
        import horovod.keras as hvd
        hvd.init()

        setattr(runai.elastic, 'master', hvd.local_rank() == 0)
        setattr(runai.elastic, 'hvd',
                hvd)  # so that Horovod is easily accessible to everyone

        runai.utils.log.debug('Attaching Keras session to GPU #%d',
                              hvd.local_rank())
        import tensorflow
        config = tensorflow.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        import keras.backend
        keras.backend.set_session(
            tensorflow.Session(config=config)
        )  # TODO(levosos): support cases where configuration will be set afterwards
Example #13
def init_keras(hvd=None):
    """
    Set config for Horovod. Config params copied from official example:
    https://github.com/uber/horovod/blob/master/examples/keras_mnist_advanced.py#L15

    :param hvd: instance of horovod.keras
    """

    init_cuda_env()
    config = tf.ConfigProto()

    if hvd:
        hvd.init()
        config.gpu_options.allow_growth = True
        config.gpu_options.visible_device_list = str(hvd.local_rank())

    set_session(tf.Session(config=config))
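A hypothetical invocation, assuming init_cuda_env and the tf/set_session imports this helper relies on are in scope:

import horovod.keras as hvd
init_keras(hvd)  # initializes Horovod and pins this process to GPU hvd.local_rank()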
Example #14
def main(use_horovod: bool, gpus: int, checkpoint: int,
         config_path: str) -> None:
    config = process_config(config_path, use_horovod, gpus, checkpoint)

    # create tensorflow session and set as keras backed
    tf_config = tf.ConfigProto()

    if config.trainer.use_horovod:
        import horovod.keras as hvd

        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd

        is_master = hvd.rank() == 0

    if is_master and not os.path.exists(config.exp.source_dir):
        # copy source files
        shutil.copytree(
            os.path.abspath(os.path.curdir),
            config.exp.source_dir,
            ignore=lambda src, names:
            {"datasets", "__pycache__", ".git", "experiments", "venv"})

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)
    data_loader = get_data_loader(config=config)

    model, trainer = build_model_and_trainer(config, data_loader)

    print(f"Start Training Experiment {config.exp.name}")
    try:
        trainer.train()
    except Exception as e:
        send_noti_to_telegram(
            f"an exception raised on training {config.exp.name}")
        raise e
Example #15
def setup_tf_config(config: DotMap):

    tf_config = tf.ConfigProto()

    if config.trainer.use_horovod:
        import horovod.keras as hvd

        hvd.init()
        tf_config.gpu_options.allow_growth = True
        tf_config.gpu_options.visible_device_list = str(hvd.local_rank())

    is_master = not config.trainer.use_horovod
    if not is_master:
        import horovod.keras as hvd

        is_master = hvd.rank() == 0

    tf_sess = tf.Session(config=tf_config)
    K.set_session(tf_sess)

    return is_master
Example #16
def get_session(log_device_placement=False, allow_soft_placement=True, debug=False, device_count=None):
  """
  TODO FIXME: get_session will cause, at exit:
  # Exception UnboundLocalError: "local variable 'status' referenced before assignment" in <bound method Session.__del__ of <tensorflow.python.client.session.Session object at 0x858af10>> ignored
  # TRACE: 03-17 08:22:26:   * 0 [clear]: tag init stat error

  A global sess (module-level or declared global inside a function) causes this,
  but it is not a big problem; for convenience just accept it for now.
  """
  if not hasattr(get_session, 'sess') or get_session.sess is None:
    if device_count is None:
      config = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                              log_device_placement=log_device_placement)
    else:
      config = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                              log_device_placement=log_device_placement,
                              device_count=device_count)
    use_horovod = 'OMPI_COMM_WORLD_RANK' in os.environ
    if use_horovod:
      config.gpu_options.allow_growth = True
      import horovod.keras as hvd
      config.gpu_options.visible_device_list = str(hvd.local_rank())  
      # sess = tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
      #                                          config=config)
    #config.operation_timeout_in_ms=600000
    #NOTICE https://github.com/tensorflow/tensorflow/issues/2130 but 5000 will cause init problem!
    #config.operation_timeout_in_ms=50000   # terminate on long hangs
    #https://github.com/tensorflow/tensorflow/issues/2292 allow_soft_placement=True
    if FLAGS.use_tpu:
      tpu_cluster_resolver = None
      if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
          FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
      get_session.sess = tf.Session(tpu_cluster_resolver.master(), config=config)
    else:
      get_session.sess = tf.Session(config=config)
    
    if debug:
      from tensorflow.python import debug as tf_debug
      get_session.sess = tf_debug.LocalCLIDebugWrapperSession(get_session.sess)
  return get_session.sess
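Because the session is memoized on the function object itself (get_session.sess), repeated calls return the same session. A quick demonstration, assuming the surrounding FLAGS are defined:

sess_a = get_session()
sess_b = get_session()
assert sess_a is sess_b  # cached on get_session.sess, not recreated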
Example #17
    def create_inception_model(self, number_categories, dense_layer_sizes, dropout_fraction, unfrozen_layers, focal_loss=False):
        hvd.init()
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        opt = hvd.DistributedOptimizer(tf.keras.optimizers.Adam(learning_rate=0.001*hvd.size()))
        model = InceptionV3(include_top=False, pooling='avg')
        output = model.outputs[0]

        for layer_size in dense_layer_sizes:
            dense = Dense(layer_size, activation='relu')(output)
            dropout = Dropout(dropout_fraction)(dense)
            output = BatchNormalization()(dropout)
        
        if number_categories == 1:
            output = Dense(1, activation='sigmoid')(output)
        else:
            output = Dense(number_categories, activation='softmax')(output)
        model = Model(inputs=model.inputs, outputs=output)

        for index in range(len(model.layers) - unfrozen_layers):
            model.layers[index].trainable = False

        if number_categories == 1:
            the_metrics = [metrics.binary_accuracy]
            if focal_loss:
                loss = customlosses.focal_binary_crossentropy
            else:
                loss = 'binary_crossentropy'
        else:
            the_metrics = [metrics.categorical_accuracy]
            if focal_loss:
                loss = customlosses.focal_categorical_crossentropy
            else:
                loss = 'categorical_crossentropy'

        model.compile(optimizer=opt, loss=loss, metrics=the_metrics)
        model.save(self.model_filename)
        self.model = model
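Note that config above receives the pinned visible_device_list but is never attached to a session, so the pinning has no effect as written; presumably the caller installs it elsewhere. A minimal sketch of that missing step, under that assumption:

tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))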
Example #18
def perform_setup(options):

    sys.setrecursionlimit(5000)

    if options.with_hvd:
        import horovod.keras as hvd
        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        if options.gpu > 1:
            devlist = '0'
            for i in range(1, options.gpu):
                devlist += ',' + str(i)
            config.gpu_options.visible_device_list = devlist
        else:
            config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))

    global _globalnpfile
    global _globalexpectedpixel
    global IMG_DTYPE
    global SEG_DTYPE
    global _nx
    global _ny

    # raw dicom data is usually short int (2bytes) datatype
    # labels are usually uchar (1byte)
    IMG_DTYPE = np.int16
    SEG_DTYPE = np.uint8

    _globalnpfile = options.dbfile.replace('.csv',
                                           '%d.npy' % options.trainingresample)
    _globalexpectedpixel = 512
    _nx = options.trainingresample
    _ny = options.trainingresample

    return IMG_DTYPE, SEG_DTYPE, _globalnpfile, _globalexpectedpixel, _nx, _ny
Example #19
def main(args):
    mpi = False
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'output',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))
    if 'sagemaker_mpi_enabled' in args.fw_params:
        if args.fw_params['sagemaker_mpi_enabled']:
            import horovod.keras as hvd
            mpi = True
            # Horovod: initialize Horovod.
            hvd.init()

            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            gpus = tf.config.experimental.list_physical_devices('GPU')
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            if gpus:
                tf.config.experimental.set_visible_devices(
                    gpus[hvd.local_rank()], 'GPU')

    else:
        hvd = None

    train_dataset = train_input_fn(hvd, mpi)

    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn(hvd, mpi)

    model = keras_model_fn(args.learning_rate, args.weight_decay,
                           args.optimizer, args.momentum, mpi, hvd)

    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1))
        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        if hvd.rank() == 0:
            callbacks.append(
                ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.ckpt',
                                save_weights_only=True,
                                verbose=2))
            callbacks.append(
                TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))
    else:
        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        callbacks.append(
            ModelCheckpoint(args.output_dir + '/checkpoint-{epoch}.ckpt',
                            save_weights_only=True,
                            verbose=2))
        callbacks.append(
            TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))
    logging.info("Starting training")
    size = 1
    if mpi:
        size = hvd.size()

    if mpi and hvd.rank() > 0:
        # for horovod training, no validation for non-master nodes (rank > 0)
        model.fit(
            train_dataset,
            steps_per_epoch=(
                (num_examples_per_epoch('train') // args.batch_size) // size),
            epochs=args.epochs,
            callbacks=callbacks,
            verbose=2)
    else:
        model.fit(
            train_dataset,
            steps_per_epoch=(
                (num_examples_per_epoch('train') // args.batch_size) // size),
            epochs=args.epochs,
            validation_data=validation_dataset,
            validation_steps=(
                (num_examples_per_epoch('validation') // args.batch_size) //
                size),
            callbacks=callbacks,
            verbose=2)

    if not mpi or (mpi and hvd.rank() == 0):
        score = model.evaluate(eval_dataset,
                               steps=num_examples_per_epoch('eval') //
                               args.batch_size,
                               verbose=2)

        logging.info('Test loss:{}'.format(score[0]))
        logging.info('Test accuracy:{}'.format(score[1]))

    # Horovod: Save model only on worker 0 (i.e. master)
    if mpi:
        if hvd.rank() == 0:
            model.save(args.model_output_dir)
    else:
        model.save(args.model_output_dir)
Example #20
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import math
import tensorflow as tf
import horovod.keras as hvd

# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

batch_size = 128
num_classes = 10

# Horovod: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(12.0 / hvd.size()))

# Input image dimensions
img_rows, img_cols = 28, 28

# The data, shuffled and split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
Example #21
from __future__ import absolute_import

import os
import socket

import keras
import horovod.keras as hvd

from rpv import load_file, build_model, train_model

print('Distributed RPV classifier training')

# Initialize horovod
hvd.init()
print('MPI rank %i, local rank %i, host %s' %
      (hvd.rank(), hvd.local_rank(), socket.gethostname()))

# Data config
n_train = 32000  #412416
n_valid = 16000  #137471
n_test = 16000  #137471
input_dir = '/data0/users/sfarrell/atlas-rpv-images'
#input_dir = '/global/cscratch1/sd/sfarrell/atlas-rpv-images'

# Load the data files
train_file = os.path.join(input_dir, 'train.h5')
valid_file = os.path.join(input_dir, 'val.h5')
test_file = os.path.join(input_dir, 'test.h5')
train_input, train_labels, train_weights = load_file(train_file, n_train)
valid_input, valid_labels, valid_weights = load_file(valid_file, n_valid)
test_input, test_labels, test_weights = load_file(test_file, n_test)
Example #22
def train_hvd(modelCode,model, trainMap,val_df,mode,tf,learning_rate,min_max_scaler,isBinary,old_weights,startSet,startEpoch):
  tensor_board = TensorBoard(log_dir=tfb_log_dir, histogram_freq=1, write_graph=True, write_images=True)
  #isBinary = True
  
  if isBinary:
    classType = "BINARY"
    targetColName = "LABEL1"
  else:
    classType = "MULTI"
    targetColName = "LABEL2"
    
  if mode=="HRV":
    # Horovod: initialize Horovod.
    hvd.init()
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

  #   with tf.Graph().as_default():
  #     config = tf.ConfigProto(allow_soft_placement=True)
  #     config.gpu_options.visible_device_list = '0'

    K.set_session(tf.Session(config=config))

    # Horovod: adjust learning rate based on number of GPUs.
    v_optimizer = keras.optimizers.Adadelta(learning_rate * hvd.size())
    # Horovod: wrap optimizer with Horovod DistributedOptimizer
    # (named v_optimizer so the later model.compile calls can find it).
    v_optimizer = hvd.DistributedOptimizer(v_optimizer)
  
    # Horovod: Broadcast initial variable states from rank 0
    # to all other processes. This is necessary to ensure 
    # consistent initialization of all workers when training is
    # started with random weights or restored from a checkpoint.
    tensor_board = TensorBoard(log_dir=tfb_log_dir, histogram_freq=1, write_graph=True, write_images=True)
    callbacks = [tensor_board, hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
  
  
    #modelNameList = ["buildBinaryClassModel","buildMultipleClassModel","buildMultiAttentionModel"]
    #modelCodeMap = {"LSTM":"buildBinaryClassModel", "BDLSTM":"buildMultipleClassModel","BDLSTM_ATTEN":"buildMultiAttentionModel"}
  else:
    v_optimizer = keras.optimizers.Adam(lr=learning_rate)
    #v_optimizer = keras.optimizers.RMSprop(lr=learning_rate)
    #v_optimizer = keras.optimizers.SGD(lr=learning_rate, clipvalue=1)
    #v_optimizer =  keras.optimizers.SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
    callbacks = [tensor_board]
  
  print("Start Train Model ",mode)
  
  cLoop = 1
  #trainedSets = {}
  test_df={}
  resultMetric = {}
  score = 0
  rolling_win_size = 60
  isNotSatisfacEval = True
  nEpochs = 10
  lossOptimal = False
  history = ""
  cvscores = []
  curVLoss = 0.001
  result = []
  maxBatch = 10
  
  val_seq_array, val_label_array = gen_data_test_val(targetColName, val_df,sequence_length, sequence_cols)
  #val_seq_array, val_label_array, nb_features, nb_out = gen_data_train_val(targetColName, val_df,sequence_length, sequence_cols)
  
  ## Multiple Classifications
  #val_label_array = to_categorical(val_label_array, num_classes=3, dtype='int32')
    
  for nEpoch in range(nEpochs):
    countTrainSet = 1
    trainDataSetKeys = trainMap.keys()
    
    #Hyperparameters
    v_batch_size = 200
    v_validation_split = 0.05
    v_verbose = 2
    
    #verbose: Integer. 0, 1, or 2. Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch
    
    v_LSTMUnitLayer1 = 150
    v_LSTMUnitLayer2 = 60
    v_LSTMUnitLayer3 = 30
    
    v_Dropout = 0.2
    v_maxEpoch = 1
    scores_test = []
    
    for trainKey in trainDataSetKeys:
      if (trainKey>=startSet and nEpoch>=startEpoch):
        if isNotSatisfacEval is True:
          print("Starting Loop (cLoop) : ",str(cLoop))
          print("Train model using dataset {",str(trainKey),"}")
          isTrainSet = True
          train_df_new = getDataFromCSV(sqlContext, dbFSDir,trainMap[trainKey], selectedCols,isTrainSet,isBinary)
          
          ##Correct Sample Labels
          train_df_new = genSampleLabel(train_df_new)
          ##train_df = train_df.append(train_df)
          train_df_new = train_df_new.sort_values(['CODE','YEAR','EVENT_ID','CYCLE'])
          train_df_new = add_features(train_df_new, rolling_win_size , sensor_cols)
          train_df_new = train_df_new.drop(columns=columns_to_drop)
          #train_df_new,min_max_scaler = normalizeMaxMinTrain(train_df_new,min_max_scaler)
          train_df_new = train_df_new.sort_values(['EVENT_ID','CYCLE'])
          train_df_new = train_df_new.drop_duplicates(['EVENT_ID','CYCLE'], keep='last')
          
          printDFPortion(train_df_new, val_df, targetColName)
          seq_array, label_array, nb_features, nb_out = gen_data_train_val(targetColName, train_df_new,sequence_length, sequence_cols)
#           print("Finish Gen Train Data Sequence")
#           print("Finish Gen Validate Data Sequence")
          
          # Horovod: save checkpoints only on worker 0 to prevent other
          # workers from overwriting and corrupting them.
          ###checkpoint_dir = dataLake
          
          if mode=="HRV":
            if hvd.rank() == 0:
              callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_dir+ str(cLoop)+'checkpoint-hvd.hdf5', save_weights_only=True))
          
          original_label_array = label_array
          
          #Multiple Classification
          #label_array = to_categorical(label_array, num_classes=3, dtype='int32')
#          val_label_array = to_categorical(val_label_array, num_classes=3, dtype='int32')
          nb_classes=label_array.shape[1]
          vb_classes=val_label_array.shape[1]          
#           print("label_array : nb_classes: ",nb_classes)
#           print("val_label_array : vb_classes: ",vb_classes)

          if len(old_weights)==0 and classType=="MULTI":
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer1, return_sequences=True),input_shape=(sequence_length, nb_features),merge_mode='concat'))
#             print("Created Bidirectional 1")
            model.add(Dropout(v_Dropout))
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer2,return_sequences=True)))
#             print("Created Bidirectional 2")
            model.add(Dropout(v_Dropout))
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer3,return_sequences=False)))
#             print("Created Bidirectional 3")
            model.add(Dropout(v_Dropout))
            model.add(Dense(units=nb_classes,activation='softmax'))
          elif len(old_weights)==0 and classType=="BINARY":
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer1, return_sequences=True),input_shape=(sequence_length, nb_features),merge_mode='concat'))
            model.add(Dropout(v_Dropout))
#             print("Created Bidirectional 1")
            model.add(Bidirectional(LSTM(units=v_LSTMUnitLayer2,return_sequences=False)))
            model.add(Dropout(v_Dropout))
#             print("Created Bidirectional 2")
            model.add(Dense(units=nb_out, activation='sigmoid'))
            print("nb_out:",nb_out)
          else:
            print("Model Already Constructed.")
          try:
            
            if old_weights!="":
              model.set_weights(old_weights)
              print("Reset weights successfully.")
              
          except:
            print("Failed reset weights.")
            pass
          
#           try:
#             model = multi_gpu_model(model,gpus=2)
#             print("Training using multiple GPUs..")
#           except:
#             print("Training using single GPU or CPU..")
#             pass
          
          if nb_classes>2:
            model.compile(loss='categorical_crossentropy', optimizer=v_optimizer, metrics=['accuracy'])
            print("set loss: categorical_crossentropy ")
          else:
            model.compile(loss='binary_crossentropy', optimizer=v_optimizer, metrics=['accuracy'])
            print("set loss: binary_crossentropy ")
            
          print(model.summary())
          
          processCode = str(cLoop)+"_R_"+str(trainKey)
          
          if mode=="HRV":
            if hvd.rank() == 0:
                callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_dir + '/'+processCode+'_checkpoint-{epoch}.h5'))
          
          ### Utilizing Horovod
          history = model.fit(seq_array, label_array,
                              batch_size=v_batch_size,
                              epochs=v_maxEpoch, 
                              verbose=2,
                  #validation_data=(val_seq_array, val_label_array),
                  validation_split=v_validation_split,
                  callbacks = callbacks)
          
              
          try:
            old_weights = model.get_weights()
            # evaluate the model
          except:
            print("Error get_weights !")
          
          # list all data in history
          print(history.history.keys())
          
            
          #val_seq_array, val_label_array = gen_data_test_val(targetColName, val_df,sequence_length, sequence_cols)
          #val_label_array = to_categorical(val_label_array, num_classes=3, dtype='int32')
          # cm,precision_test,recall_test,f1_test, y_true_label, y_predicted = evaluationMetrics(val_seq_array,val_label_array,isBinary,model)
          
          #printProb(model,val_seq_array , val_label_array)
          
          try:
            
            #cm,precision_test,recall_test,f1_test, y_true_label, y_predicted, y_pred_prop, y_pred_prob_thrldeshod = evaluationMetrics(val_seq_array,val_label_array,isBinary,model)
            cm,precision_test,recall_test,f1_test, y_true_label, y_pred_class, y_pred_prop, y_pred_prob_threshold = evaluationMetrics(val_seq_array,val_label_array,isBinary,model)
            
          except:
            precision_test = 0
            recall_test = 0
            f1_test=0
            print("Error in evaluation performance [evaluationMetrics]!")
            #return model
            pass
          
          if len(old_weights)==0:
              print("Error Empty Weights!!")
          else:
              print("Has weights!!")
          
          if mode!="HRV":
            try:
              currentModelPath = processCode + "_"+model_path
              print("Trying to save model : "+currentModelPath)
              model.save(currentModelPath)

              try:
                fromPath = "file:/databricks/driver/"+currentModelPath
                print("Copying file [",fromPath,"] to Data Lake....")
                copyData(fromPath, dataLake+"/model",False)
                
              except:
                print("Error while trying to transfer file "+"file:/databricks/driver/"+currentModelPath," to ",dataLake+"/model")
                pass
              print("Model Saved >> ", currentModelPath)
            
            except:
              print("Error Saving Model",currentModelPath)
              pass
          
          try:
            lossOptimal, score, result, curVLoss = isOptimal(history,countTrainSet,score,curVLoss,nEpoch)
            #resultMetric[cLoop] = [cLoop, processCode] + result
            resultMetric[cLoop] = [cLoop, processCode] + result + [precision_test,recall_test,f1_test]
            print(resultMetric)
            saveFileToDataLake(resultMetric)
          except:
            print("Error writing metric file.")
            pass
            
          if lossOptimal is False:
            countTrainSet=countTrainSet+1
          else:
            break
          cLoop = cLoop+1
        else:
          print("Skip DataSet:",trainKey)
      else:
        print("Train and evaluation is satisfactory!")
        break
  return model
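One hedged refinement to the loop above: it appends a new ModelCheckpoint to the shared callbacks list on every iteration, so later fits run with an ever-growing stack of checkpoint callbacks. Rebuilding the list per iteration avoids this; base_callbacks is an illustrative name for the Horovod/TensorBoard callbacks created before the loop:

if mode == "HRV" and hvd.rank() == 0:
    ckpt = keras.callbacks.ModelCheckpoint(checkpoint_dir + '/' + processCode + '_checkpoint-{epoch}.h5')
    callbacks = base_callbacks + [ckpt]  # fresh list each iteration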
Example #23
    #import tensorflow as tf
    #config = tf.ConfigProto(log_device_placement=True)

    from EcalEnergyGan import generator, discriminator

    import tensorflow as tf
    import horovod.keras as hvd

    # Initialize Horovod.
    hvd.init()

    # Pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    import time
    time.sleep(10 * hvd.local_rank())
    import setGPU
    #config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.Session(config=config)

    g_weights = 'params_generator_epoch_'
    d_weights = 'params_discriminator_epoch_'

    nb_epochs = 25
    batch_size = 128
    latent_size = 200
    verbose = 'false'
    nb_classes = 2

    generator = generator(latent_size)
    discriminator = discriminator()
Example #24
            resume_training = False
        elif opt in ('-l', '--length'):
            predict_length = int(arg)
        elif opt in ('-m', '--mgpu'):
            multi_gpu = True
        elif opt in ('-e', '--epoch'):
            epoch = int(arg)

    if multi_gpu:
        import tensorflow as tf
        import horovod.keras as hvd

        hvd.init()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        print('hvd.local_rank: ', hvd.local_rank())
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        K.set_session(tf.Session(config=config))

Example #25
"""
Keras2 based WaveNet

Based originally on Bas Veeling's implementation at: https://github.com/basveeling/wavenet/
which is (c) Bas Veeling

Copyright (c) MUNICH ARTIFICIAL INTELLIGENCE LABORATORIES GmbH. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0
"""
def main(args):
    #initialize Horovod.
    hvd.init()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    
    fold = args.data_path.split("fold_")[1]
    if hvd.rank()==0:
        print("================================")
        if args.use_lovasz:
            print("Fine tuning with Lovasz loss")
        print("Fold {}".format(fold))
        
    #Find best saved model
    best_model_file = 'weights/{}/fold_{}_{epoch}_best.h5'.format(args.model, fold, epoch='{epoch}')
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(best_model_file.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    if hvd.rank()==0:
        print("Last model saved: {}".format(best_model_file.format(epoch=resume_from_epoch)))
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')
    #verbose mode for one node
    if hvd.rank()==0:
        verbose = 1
    else:
        verbose = 0
   
    #Create dataset
    
    dataset = TGSDataset(data_path=args.data_path, batch_size=args.batch_size)
    input_shape = (args.target_size, args.target_size)
    mask_shape = (101, 101)
    train_data_generator = dataset.get_train_data_generator(input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    val_data_generator = dataset.get_val_data_generator(input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    train_step_size = dataset.train_step_size // hvd.size()
    val_step_size = dataset.val_step_size // hvd.size()
    #Create model
    model = make_model(args.model, (args.target_size, args.target_size, 3), 2)

    #load weights
    if resume_from_epoch > 0:
        model.load_weights(best_model_file.format(epoch=resume_from_epoch))
        
    size = hvd.size()
    opt = hvd.DistributedOptimizer(SGD(lr=args.learning_rate * size, momentum=0.9, nesterov=True))

    #Loss
    loss = losses.c_lovasz_loss if args.use_lovasz else losses.c_binary_crossentropy
    
    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=[metrics.c_binary_accuracy, metrics.c_iou])

    #h5 model
    best_model = ModelCheckpointMGPU(model, filepath=best_model_file, monitor='val_loss',
                                     verbose=1,
                                     mode='min',
                                     period=1,
                                     save_best_only=True,
                                     save_weights_only=True)
    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=True)
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
        callbacks.append(best_model)
    
    #Fit model
    history = model.fit_generator(train_data_generator,
                        steps_per_epoch=train_step_size,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=4,
                        initial_epoch=resume_from_epoch,
                        validation_data=val_data_generator,
                        validation_steps=val_step_size)
  

    score = hvd.allreduce(model.evaluate_generator(val_data_generator, val_step_size, workers=4))
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
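The final hvd.allreduce call above averages the evaluation results across workers (Horovod's allreduce averages by default), so every rank reports the same aggregated score. A scalar sketch with an illustrative per-worker value:

import numpy as np
local_score = np.array([0.91], dtype=np.float32)  # hypothetical per-worker metric
global_score = hvd.allreduce(local_score)  # element-wise mean over hvd.size() workers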
Example #26
def main(args):
    if 'sourcedir.tar.gz' in args.tensorboard_dir:
        tensorboard_dir = re.sub('source/sourcedir.tar.gz', 'model',
                                 args.tensorboard_dir)
    else:
        tensorboard_dir = args.tensorboard_dir
    logging.info("Writing TensorBoard logs to {}".format(tensorboard_dir))

    if os.path.isdir(args.checkpoint_path):
        logging.info("Checkpointing directory {} exists".format(
            args.checkpoint_path))
    else:
        logging.info("Creating Checkpointing directory {}".format(
            args.checkpoint_path))
        os.mkdir(args.checkpoint_path)

    mpi = False
    if 'sagemaker_mpi_enabled' in args.fw_params:
        if args.fw_params['sagemaker_mpi_enabled']:
            import horovod.keras as hvd
            mpi = True
            # Horovod: initialize Horovod.
            hvd.init()

            # Horovod: pin GPU to be used to process local rank (one GPU per process)
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            config.gpu_options.visible_device_list = str(hvd.local_rank())
            K.set_session(tf.Session(config=config))
    else:
        hvd = None
    logging.info("Running with MPI={}".format(mpi))

    logging.info("getting data")
    train_dataset = train_input_fn()
    eval_dataset = eval_input_fn()
    validation_dataset = validation_input_fn()

    logging.info("configuring model")

    # Load model
    if not os.listdir(args.checkpoint_path):
        model = keras_model_fn(args.learning_rate, args.weight_decay,
                               args.optimizer, args.momentum, mpi, hvd)
        epoch_number = 0
    else:
        model, epoch_number = load_checkpoint_model(args.checkpoint_path)

    logging.info("Checkpointing to: {}".format(args.checkpoint_path))

    callbacks = []
    if mpi:
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(
            hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5,
                                                     verbose=1))
        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        if hvd.rank() == 0:
            callbacks.append(
                ModelCheckpoint(args.checkpoint_path +
                                '/checkpoint-{epoch}.h5'))
            callbacks.append(
                TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))
    else:
        callbacks.append(
            keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1))
        callbacks.append(
            ModelCheckpoint(args.checkpoint_path + '/checkpoint-{epoch}.h5'))
        callbacks.append(
            TensorBoard(log_dir=tensorboard_dir, update_freq='epoch'))

    logging.info("Starting training")
    size = 1
    if mpi:
        size = hvd.size()

    model.fit(
        x=train_dataset[0],
        y=train_dataset[1],
        steps_per_epoch=(num_examples_per_epoch('train') // args.batch_size) //
        size,
        epochs=args.epochs,
        initial_epoch=epoch_number,
        validation_data=validation_dataset,
        validation_steps=(num_examples_per_epoch('validation') //
                          args.batch_size) // size,
        callbacks=callbacks)

    score = model.evaluate(eval_dataset[0],
                           eval_dataset[1],
                           steps=num_examples_per_epoch('eval') //
                           args.batch_size,
                           verbose=0)

    logging.info('Test loss:{}'.format(score[0]))
    logging.info('Test accuracy:{}'.format(score[1]))

    # Horovod: Save model only on worker 0 (i.e. master)
    if mpi:
        if hvd.rank() == 0:
            save_model(model, args.model_output_dir)
    else:
        save_model(model, args.model_output_dir)
Example #27
def main():
    """Main function"""

    # Initialize horovod
    hvd.init()

    # Parse the command line
    args = parse_args()

    # Setup logging
    log_format = '%(asctime)s %(levelname)s %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    logging.info('Initializing')
    if args.show_config:
        logging.info('Command line config: %s' % args)

    logging.info('MPI rank %i, local rank %i, host %s' %
                 (hvd.rank(), hvd.local_rank(), socket.gethostname()))

    # Load configuration file
    with open(args.config) as f:
        config = yaml.safe_load(f)
    logging.info('Configuration: %s' % config)

    # Load the data files
    train_data, valid_data, test_data = load_dataset(**config['data_config'])
    train_input, train_labels, train_weights = train_data
    valid_input, valid_labels, valid_weights = valid_data
    test_input, test_labels, test_weights = test_data
    logging.info('train shape: %s Mean label %s' %
                 (train_input.shape, train_labels.mean()))
    logging.info('valid shape: %s Mean label %s' %
                 (valid_input.shape, valid_labels.mean()))
    logging.info('test shape:  %s Mean label %s' %
                 (test_input.shape, test_labels.mean()))

    # Configure the session (e.g. thread settings)
    keras.backend.set_session(configure_session(**config['session_config']))

    # Scale the learning rate
    model_config = config['model_config']
    if model_config.pop('scale_learning_rate'):
        model_config[
            'learning_rate'] = model_config['learning_rate'] * hvd.size()

    # Build the model
    logging.info(config)
    model = build_model(train_input.shape[1:],
                        use_horovod=True,
                        **model_config)
    if hvd.rank() == 0:
        model.summary()

    # Training hooks
    callbacks = []

    # Horovod model synchronization during initialization
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

    # Model checkpointing
    if hvd.rank() == 0:
        checkpoint_file = os.path.expandvars(config['checkpoint_file'])
        os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_file))

    # Batch size
    training_config = config['training_config']
    bsize = training_config['batch_size']
    per_node = training_config.pop('batch_size_per_node')
    training_config['batch_size'] = bsize if per_node else (bsize //
                                                            hvd.size())

    # Run the training
    logging.info('Final training config: %s' % training_config)
    history = model.fit(x=train_input,
                        y=train_labels,
                        validation_data=(valid_input, valid_labels),
                        callbacks=callbacks,
                        verbose=2,
                        **training_config)

    # Evaluate on the test set
    test_loss, test_acc = model.evaluate(test_input, test_labels, verbose=2)
    logging.info('Test loss:     %g' % test_loss)
    logging.info('Test accuracy: %g' % test_acc)

    # Drop to IPython interactive shell
    if args.interactive:
        logging.info('Starting IPython interactive session')
        import IPython
        IPython.embed()

    logging.info('All done!')
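A worked instance of the batch-size logic above: with batch_size 512 on 8 ranks, batch_size_per_node=True keeps 512 per worker (an effective global batch of 4096), while False splits it so the global batch stays 512:

bsize, n_ranks = 512, 8
per_worker = bsize // n_ranks           # per_node False: 64 samples per worker
assert per_worker * n_ranks == bsize    # global batch preserved at 512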
Example #28
                    help='Training dataset Name.',
                    dest='dataset_type')

args = parser.parse_args()

# Checkpoints will be written in the log directory.
args.checkpoint_format = os.path.join(args.output_path,
                                      'checkpoint-{epoch}.h5')

# Horovod: initialize Horovod.
hvd.init()

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                  0,
                                  name='resume_from_epoch')
Example #29
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import math
import tensorflow as tf
import horovod.keras as hvd

# Initialize Horovod
hvd.init()

# Pin GPU to be used to process local rank (one GPU per process)
gpus = tf.config.experimental.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

batch_size = 128
num_classes = 10

# Horovod: adjust number of epochs based on number of GPUs.
epochs = int(math.ceil(12.0 / hvd.size()))

# input image dimensions
img_rows, img_cols = 28, 28

# the data, split between train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
Example #30
    def init_callbacks(self) -> None:
        if self.config.trainer.use_lr_decay:
            # linear decay from the half of max_epochs
            def lr_scheduler(lr, epoch, max_epochs):
                return min(lr, 2 * lr * (1 - epoch / max_epochs))

            self.model_callbacks["combined"].append(
                LearningRateScheduler(schedule=lambda epoch: lr_scheduler(self.config.model.generator.lr, epoch,
                                                                          self.config.trainer.num_epochs)))
            for model_name in ['d_x', 'd_y']:
                self.model_callbacks[model_name].append(
                    LearningRateScheduler(schedule=lambda epoch: lr_scheduler(self.config.model.discriminator.lr, epoch,
                                                                              self.config.trainer.num_epochs)))
        # if horovod used, only worker 0 saves checkpoints
        is_master = True
        is_local_master = True
        if self.config.trainer.use_horovod:
            import horovod.keras as hvd

            is_master = hvd.rank() == 0
            is_local_master = hvd.local_rank() == 0

        # horovod callbacks
        if self.config.trainer.use_horovod:
            import horovod.keras as hvd

            self.model_callbacks["combined"].append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
            self.model_callbacks["combined"].append(hvd.callbacks.MetricAverageCallback())
            self.model_callbacks["combined"].append(
                hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1))

        if is_local_master:
            # model saver
            self.model_callbacks["serial_combined"].append(
                ModelCheckpointWithKeepFreq(
                    filepath=os.path.join(self.config.exp.checkpoints_dir, "{epoch:04d}-combined.hdf5"),
                    keep_checkpoint_freq=self.config.trainer.keep_checkpoint_freq,
                    save_checkpoint_freq=self.config.trainer.save_checkpoint_freq,
                    save_best_only=False,
                    save_weights_only=True,
                    verbose=1))

            # save optimizer weights
            for model_name in ['combined', 'd_x', 'd_y']:
                self.model_callbacks[model_name].append(OptimizerSaver(self.config, model_name))
        if is_master:
            # save individual models
            for model_name in ['g_xy', 'g_yx', 'd_x', 'd_y']:
                self.model_callbacks[model_name].append(
                    ModelSaver(
                        checkpoint_dir=self.config.exp.checkpoints_dir,
                        keep_checkpoint_freq=self.config.trainer.keep_checkpoint_freq,
                        model_name=model_name,
                        num_epochs=self.config.trainer.num_epochs,
                        verbose=1))

            # send notification to telegram channel on train start and end
            self.model_callbacks["combined"].append(TrainProgressAlertCallback(experiment_name=self.config.exp.name,
                                                                               total_epochs=self.config.trainer.num_epochs))

            # tensorboard callback
            self.model_callbacks["combined"].append(
                ScalarCollageTensorBoard(log_dir=self.config.exp.tensorboard_dir,
                                         batch_size=self.config.trainer.batch_size,
                                         write_images=True))

        # initialize callbacks by setting model and params
        epochs = self.config.trainer.num_epochs
        steps_per_epoch = self.data_loader.get_train_data_size() // self.config.trainer.batch_size
        for model_name in self.model_callbacks:
            model = eval(f"self.{model_name}")

            callbacks = self.model_callbacks[model_name]
            for callback in callbacks:
                callback.set_model(model)
                callback.set_params({
                    "batch_size": self.config.trainer.batch_size,
                    "epochs": epochs,
                    "steps": steps_per_epoch,
                    "samples": self.data_loader.get_train_data_size(),
                    "verbose": True,
                    "do_validation": False,
                    "model_name": model_name,
                })
Example #31
import os

import horovod.keras as hvd
from utils.device import configure_session

distributed = False

rank, n_ranks = 0, 1
if distributed:
    hvd.init()
    rank, n_ranks = hvd.rank(), hvd.size()
    if rank == 0:
        print('rank {}, n_ranks {}'.format(rank, n_ranks))

if n_ranks > 1:
    gpu = hvd.local_rank()
    configure_session(gpu=gpu)

profile_downsample = 2
'''
efit_type='EFITRT1'
input_profile_names = ['thomson_dens_{}'.format(efit_type), 'thomson_temp_{}'.format(efit_type)]
target_profile_names = ['temp', 'dens']
actuator_names = ['pinj', 'curr', 'tinj', 'gasA']
profile_lookback = 1
actuator_lookback = 10
'''

if True:
    processed_filename_base = '/global/cscratch1/sd/abbatej/processed_data/'