def main(args):
    # Horovod: initialize Horovod.
    hvd.init()
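    # Horovod: pin this process to a single GPU (one GPU per local rank).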
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    
    fold = args.data_path.split("fold_")[1]
    if hvd.rank()==0:
        print("================================")
        if args.use_lovasz:
            print("Fine tuning with ")
        print("Fold {}".format(fold))
        
    # Find the most recent saved checkpoint for this fold.
    best_model_file = 'weights/{}/fold_{}_{{epoch}}_best.h5'.format(args.model, fold)
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(best_model_file.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    if hvd.rank()==0:
        print("Last model saved: {}".format(best_model_file.format(epoch=resume_from_epoch)))
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')
    # Horovod: print logs only on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0
   
    # Create dataset.
    dataset = TGSDataset(data_path=args.data_path, batch_size=args.batch_size)
    input_shape = (args.target_size, args.target_size)
    mask_shape = (101, 101)
    train_data_generator = dataset.get_train_data_generator(input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    val_data_generator = dataset.get_val_data_generator(input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
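    # Each worker runs 1/N of the steps per epoch, so one epoch collectively
    # covers the full dataset across all N workers.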
    train_step_size = dataset.train_step_size // hvd.size()
    val_step_size = dataset.val_step_size // hvd.size()
    # Create model.
    model = make_model(args.model, (args.target_size, args.target_size, 3), 2)

    # Load weights from the last saved checkpoint, if resuming.
    if resume_from_epoch > 0:
        model.load_weights(best_model_file.format(epoch=resume_from_epoch))
        
    # Horovod: scale the learning rate by the number of workers and wrap the
    # optimizer so gradients are averaged across them.
    opt = hvd.DistributedOptimizer(SGD(lr=args.learning_rate * hvd.size(), momentum=0.9, nesterov=True))

    # Loss: Lovasz loss when fine-tuning, otherwise binary cross-entropy.
    loss = losses.c_lovasz_loss if args.use_lovasz else losses.c_binary_crossentropy
    
    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=[metrics.c_binary_accuracy, metrics.c_iou])

    # Checkpoint the best weights (by val_loss) to an .h5 file.
    best_model = ModelCheckpointMGPU(model, filepath=best_model_file, monitor='val_loss',
                                     verbose=1,
                                     mode='min',
                                     period=1,
                                     save_best_only=True,
                                     save_weights_only=True)
    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=True)
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
        callbacks.append(best_model)
    
    # Fit model.
    history = model.fit_generator(train_data_generator,
                        steps_per_epoch=train_step_size,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=4,
                        initial_epoch=resume_from_epoch,
                        validation_data=val_data_generator,
                        validation_steps=val_step_size)
  

    # Average the evaluation score across all workers and report on rank 0 only.
    score = hvd.allreduce(model.evaluate_generator(val_data_generator, val_step_size, workers=4))
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
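# A script like this is normally launched with Horovod's launcher, one process
# per GPU. A sketch of a typical invocation (script name and flag values are
# illustrative, not from this repo):
#   horovodrun -np 4 python train.py --data_path data/fold_0 --model unet --epochs 40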
Example #2
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())
K.set_session(tf.Session(config=config))

# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(args.epochs, 0, -1):
    if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                  0,
                                  name='resume_from_epoch')

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# Input image dimensions
img_rows, img_cols = 28, 28
num_classes = 10

# Load Fashion MNIST data.
(x_train, y_train), (x_test, y_test) = load_data(args.dataset_path)

if K.image_data_format() == 'channels_first':
    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
else:
    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
def main():
    verbose = 1
    logger = _get_logger()
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.
        hvd.init()
        logger.info("Runnin Distributed")
        verbose = 1 if hvd.rank() == 0 else 0

    logger.info("Tensorflow version {}".format(tf.__version__))
    K.set_session(tf.Session(config=_get_runconfig()))

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = 0
    if _DISTRIBUTED:
        resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                          0,
                                          name="resume_from_epoch")

    if _FAKE:
        train_iter = _fake_data_iterator_from()
    else:
        train_iter = _training_data_iterator_from()
        test_iter = _validation_data_iterator_from() if _VALIDATION else None

    model = _create_model()

    params = {"learning_rate": _LR, "momentum": 0.9}

    opt = _get_optimizer(params)
    model.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=opt,
        metrics=["accuracy", "top_k_categorical_accuracy"],
    )

    model_dir = _get_model_dir()
    checkpoint_format = os.path.join(model_dir, "checkpoint-{epoch}.h5")

    callbacks = _get_hooks()
    callbacks.append(LoggerCallback(logger, len(train_iter) * _BATCHSIZE))

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if _is_master():
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))
        # callbacks.append(keras.callbacks.TensorBoard(log_dir))

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast weights to other workers.
    if resume_from_epoch > 0 and _is_master():
        model.load_weights(checkpoint_format.format(epoch=resume_from_epoch))

    logger.info("Training...")
    # Train the model. The training will randomly sample 1 / N batches of training
    # data on every worker, where N is the number of workers.
    num_workers = hvd.size() if _DISTRIBUTED else 1
    model.fit_generator(
        train_iter,
        steps_per_epoch=len(train_iter) // num_workers,
        callbacks=callbacks,
        epochs=_EPOCHS,
        verbose=verbose,
        workers=_NUM_WORKERS,
        max_queue_size=_MAX_QUEUE_SIZE,
        use_multiprocessing=_MULTIPROCESSING,
        initial_epoch=resume_from_epoch,
    )

    if _FAKE is False and _VALIDATION:
        # Evaluate the model on the full data set.
        with Timer(output=logger.info, prefix="Testing"):
            logger.info("Testing...")
            score = hvd.allreduce(
                model.evaluate_generator(test_iter, len(test_iter),
                                         workers=10))
            if verbose:
                print("Test loss:", score[0])
                print("Test accuracy:", score[1])
def main(lossfunction="tversky", lossrate=1e-4, depth=7, optimizer="rms", n_filters=32,
         fixed=False, resnet=False, bands=[0, 1, 2, 3, 4, 5], batchnorm=True,
         dropout=False, dropout_rate=0.10, noise=False, noise_rate=0.1, ramp=False,
         earlystop=False):

  verbose = 0
  if not useHorovod:
    verbose = 1
  elif hvd.rank() == 0:
    verbose = 2
    
  if verbose > 0:
    logger.info("using bands %s", bands)

  train_file = '../../data/train'
  val_file = '../../data/val'
  test_file = '../../data/test'

  #basepath = '/scratch2/BMC/gsd-hpcs/Jebb.Q.Stewart/git/gsd-machine-learning/src/cwb/ci/data/'
  #train_file = basepath + '/cwbci_512_l30_7bands_radar_train'
  #val_file = basepath + '/cwbci_512_l30_7bands_radar_val'
  #test_file = basepath + '/cwbci_512_l30_7bands_radar_test'

  if verbose > 0:
     logger.info('reading in train data')

  x_train, y_train = readData(train_file)
  x_val, y_val = readData(val_file)
  x_test, y_test = readData(test_file)

  #sample = min(100, x_train.shape[0])
  sample = 100
  if verbose > 0:
     logger.info("Sample data: ")
     logger.info("  Training input : max[0]: %s", np.max(x_train[sample,:,:,0]))
     logger.info("                   min[0]: %s", np.min(x_train[sample,:,:,0]))
     logger.info("           shape : %s", x_train.shape)
     logger.info("  Train labels: max[0]: %s min[1]: %s  dtype: %s", np.max(y_train[sample,:,:,:]), np.min(y_train[sample,:,:,:]), y_train.dtype)
     logger.info("  using loss function %s", lossfunction)


  loss = tversky_loss(alpha=0.3, beta=0.7)
  if lossfunction == "dice":
     loss = dice_loss
  elif lossfunction == "tversky2":
     loss = tversky_loss2(alpha=0.7, beta=0.3)
  elif lossfunction == "tversky3":
     loss = tversky_loss2(alpha=0.2, beta=0.8)
  elif lossfunction == "bcedice":
     loss = bce_dice_loss
  elif lossfunction == "focal":
     loss = focal_loss3(gamma=2)
  elif lossfunction == "bce":
     loss = 'binary_crossentropy'
  elif lossfunction == "mse":
     loss = 'mse'
  elif lossfunction == "rmse":
     loss = 'rmse'


  # Input depth: three channels per selected band.
  channels = len(bands) * 3
  model = unet(img_rows=512, img_cols=512, channels=channels, output_channels=1, fixed=fixed, 
               batchnorm=batchnorm, resnet=resnet, n_filters=n_filters, depth=depth, 
               dropout=dropout, dropout_rate=dropout_rate, noise=noise, 
               noise_rate=noise_rate, final_activation='sigmoid', verbose=verbose)

  if useHorovod:
    opt = hvd.DistributedOptimizer(RMSprop(lr=lossrate*hvd.size()))
    if optimizer == "adam":
      opt = hvd.DistributedOptimizer(Adam(lr=lossrate*hvd.size()))
  else:
    opt = RMSprop(lr=lossrate)
    if optimizer == "adam":
      opt = Adam(lr=lossrate)

  model.compile(optimizer=opt, loss=loss, metrics=[tversky_coeff(alpha=0.3, beta=0.7), dice_coeff, 'accuracy'])

  if verbose > 0:
     # model.summary() prints to stdout and returns None, so route each line
     # through the logger instead of logging the return value.
     model.summary(print_fn=logger.info)
     logger.info("Estimated Model GPU usage: %s GB", get_model_memory_usage(batch_size, model))
     logger.info("Current host memory usage: %s", usage())

     # serialize model to JSON
     model_json = model.to_json()
     if not os.path.isdir("models"):
         os.makedirs("models")

     model_file = "models/" + model_name + ".json"
     with open(model_file, "w") as json_file:
        json_file.write(model_json)
     logger.info("saved model to %s", model_file)

  callbacks = []
  if useHorovod:
     callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
     callbacks.append(hvd.callbacks.MetricAverageCallback())
     callbacks.append(hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=verbose))

  if earlystop:
    callbacks.append(EarlyStopping(monitor='val_loss',
                                   patience=30,
                                   verbose=verbose,
                                   min_delta=1e-4,
                                   restore_best_weights=True))

  if ramp:
    if useHorovod:
      # Horovod: after the warmup, hold the learning rate until epoch 40, then
      # reduce it by 10x at epochs 40, 70, and 100.
      callbacks.append(hvd.callbacks.LearningRateScheduleCallback(start_epoch=15, end_epoch=40, multiplier=1.))
      callbacks.append(hvd.callbacks.LearningRateScheduleCallback(start_epoch=40, end_epoch=70, multiplier=1e-1))
      callbacks.append(hvd.callbacks.LearningRateScheduleCallback(start_epoch=70, end_epoch=100, multiplier=1e-2))
      callbacks.append(hvd.callbacks.LearningRateScheduleCallback(start_epoch=100, multiplier=1e-3))

      # Reduce the learning rate if training plateaus.
      #keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1)]
      #ReduceLROnPlateau(monitor='val_loss',
      #               factor=0.1,
      #               patience=4,
      #               verbose=1,
      #               min_delta=1e-4),

  # Batch generators over the train/val/test arrays, restricted to the selected bands.
  training_bg = generator(x_train, y_train, batch_size, limit=80000, bands=bands)
  val_bg = generator(x_val, y_val, batch_size, limit=40000, bands=bands)
  test_bg = generator(x_test, y_test, batch_size, limit=20000, bands=bands)

  if useHorovod:
    # Horovod: broadcast the shuffle order from rank 0 so every worker
    # iterates over the samples in the same order.
    training_bg.order = hvd.broadcast(training_bg.order, 0, name='training_bg_order').numpy()
    val_bg.order = hvd.broadcast(val_bg.order, 0, name='val_bg_order').numpy()
    test_bg.order = hvd.broadcast(test_bg.order, 0, name='test_bg_order').numpy()

  if verbose > 0:
     logger.info("Training size: %s : steps : %s", training_bg.length, (training_bg.length//batch_size))
     logger.info("Validation size: %s : steps : %s", val_bg.length, (val_bg.length//batch_size))

  # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
  if not useHorovod or hvd.rank() == 0:
    if not os.path.isdir("checkpoints"):
        os.makedirs("checkpoints")
    callbacks.append(keras.callbacks.ModelCheckpoint(
        './checkpoints/' + model_name + '_checkpoint-{epoch:02d}-{val_loss:.3f}.hdf5',
        monitor='val_loss', save_best_only=True, save_weights_only=True))
    #callbacks.append(keras.callbacks.TensorBoard(log_dir='tflogs'))

  size = 1
  if useHorovod:
    size = hvd.size()

  history = model.fit_generator(generator=training_bg, 
         steps_per_epoch=(training_bg.length//batch_size) // size,
         epochs=epochs,
         verbose=verbose,
         callbacks=callbacks,
         validation_data=val_bg,
         validation_steps=(val_bg.length // batch_size) // size,
         shuffle=True,
         use_multiprocessing=False,
         workers=2,
         max_queue_size=8)
  
  if not useHorovod or hvd.rank() == 0:
     # serialize weights to HDF5
     logger.info("saving weights")
     if not os.path.isdir("weights"):
        os.makedirs("weights")

     weights_file = "weights/" + model_name + ".h5"
     model.save_weights(weights_file)
     logger.info("Saved weights to disk %s", weights_file)

     logger.info("evaluating results")

  scores = model.evaluate_generator(generator=test_bg, steps=(test_bg.length//batch_size) // size, workers=2,
         max_queue_size=8, use_multiprocessing=False, verbose=verbose)

  if not useHorovod or hvd.rank() == 0:

    logger.info('Test scores: %s', scores)
    if not os.path.isdir("images"):
        os.makedirs("images")
  
    # plt.xkcd()

    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig("images/" + model_name +"_acc.png")
  
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.savefig("images/" + model_name +"_loss.png")
test_dir = os.path.expanduser('~/imagenet/validation')

# Checkpoint format and log directory.
checkpoint_format = './checkpoint-{epoch}.h5'
log_dir = './logs'

# If set > 0, will resume training from a given checkpoint.
resume_from_epoch = 0
for try_epoch in range(epochs, 0, -1):
    if os.path.exists(checkpoint_format.format(epoch=try_epoch)):
        resume_from_epoch = try_epoch
        break

# Horovod: broadcast resume_from_epoch from rank 0 (which will have
# checkpoints) to other ranks.
resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# Training data iterator.
train_gen = image.ImageDataGenerator(
    width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
    preprocessing_function=keras.applications.resnet50.preprocess_input)
train_iter = train_gen.flow_from_directory(train_dir, batch_size=batch_size,
                                           target_size=(224, 224))

# Validation data iterator.
test_gen = image.ImageDataGenerator(
    zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
test_iter = test_gen.flow_from_directory(test_dir, batch_size=batch_size,
                                         target_size=(224, 224))
Example #6
def main():
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                      0,
                                      name='resume_from_epoch')

    # Horovod: print logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Training data iterator.
    train_gen = image.ImageDataGenerator()
    #width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
    #preprocessing_function=keras.applications.resnet50.preprocess_input)
    train_iter = train_gen.flow_from_directory(args.train,
                                               batch_size=args.batch_size,
                                               target_size=(224, 224))

    # Validation data iterator.
    test_gen = image.ImageDataGenerator()
    #zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
    test_iter = test_gen.flow_from_directory(args.val,
                                             batch_size=args.val_batch_size,
                                             target_size=(224, 224))

    # train iterator for tfrecord
    train_iter_tf = iterator(args.train_dir)
    val_iter_tf = iterator(args.val_dir)

    # timeline
    #timeline = tf.train.ProfilerHook(save_steps=500, output_dir='./timeline')
    #run_options  = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
    #run_metadata = tf.compat.v1.RunMetadata()

    # Set up standard ResNet-50 model.
    model = keras.applications.resnet50.ResNet50(weights=None)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast both model and optimizer weights
    # to other workers.
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(
            args.checkpoint_format.format(epoch=resume_from_epoch),
            compression=compression)
    else:
        # ResNet-50 model that is included with Keras is optimized for inference.
        # Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5

        model = keras.models.Model.from_config(model_config)

        # Horovod: adjust learning rate based on number of GPUs.
        opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(),
                                   momentum=args.momentum)

        # Horovod: add Horovod Distributed Optimizer.
        opt = hvd.DistributedOptimizer(opt, compression=compression)

        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=opt,
                      metrics=['accuracy', 'top_k_categorical_accuracy'])
        #              options=run_options,
        #              run_metadata=run_metadata
        #              )

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=args.warmup_epochs, verbose=verbose),

        # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
        hvd.callbacks.LearningRateScheduleCallback(
            start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=30,
                                                   end_epoch=60,
                                                   multiplier=1e-1),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=60,
                                                   end_epoch=80,
                                                   multiplier=1e-2),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=80,
                                                   multiplier=1e-3),
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Train the model. The training will randomly sample 1 / N batches of training data and
    # 3 / N batches of validation data on every worker, where N is the number of workers.
    # Over-sampling of validation data helps to increase probability that every validation
    # example will be evaluated.

    if verbose:
        print('train batches per epoch:', len(train_iter))
        print('test batches per epoch:', len(test_iter))
    total_train_step = len(train_iter)
    total_val_step = len(test_iter)

    #model.fit_generator(train_iter,
    model.fit(
        train_iter_tf,
        #steps_per_epoch=40037 // hvd.size(),
        steps_per_epoch=total_train_step // hvd.size(),
        callbacks=callbacks,
        epochs=args.epochs,
        verbose=verbose,
        workers=8,
        initial_epoch=resume_from_epoch,
        validation_data=val_iter_tf,
        validation_steps=3 * total_val_step // hvd.size())

    # timeline tracing
    #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    #with open ('./timeline.keras.json','w') as f:
    #     f.write(trace.generate_chrome_trace_format())

    # Evaluate the model on the full data set.
    score = hvd.allreduce(
        model.evaluate_generator(test_iter, len(test_iter), workers=4))
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
Example #7
def run_cvae(cm_file_train,
             cm_file_val,
             batch_size=32,
             hyper_dim=3,
             epochs=100):
    hvd.init()

    gen_train = CVAEGenerator(cm_file_train,
                              hvd_size=hvd.size(),
                              batch_size=batch_size,
                              shuffle=True)
    gen_val = CVAEGenerator(cm_file_val,
                            hvd_size=hvd.size(),
                            batch_size=batch_size,
                            shuffle=True)
    input_shape = gen_train.get_shape()

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    #epochs = int(math.ceil(epochs / hvd.size()))

    #cvae = CVAE(input_shape[1:], hyper_dim, lr=0.001*hvd.size())
    cvae = CVAE(input_shape[1:], hyper_dim, lr=0.001)
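    # Horovod: wrap the model's optimizer so gradient updates are averaged
    # across workers before being applied.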
    cvae.optimizer = hvd.DistributedOptimizer(cvae.optimizer)
    cvae.model.compile(optimizer=cvae.optimizer, loss=cvae._vae_loss)

    model_weight = 'cvae_weight-{epoch}.h5'
    model_file = 'cvae_model-{epoch}.h5'
    loss_file = 'loss.npz'

    resume_from_epoch = 0
    for try_epoch in range(epochs, 0, -1):
        if os.path.exists(model_weight.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    resume_from_epoch = hvd.broadcast(resume_from_epoch,
                                      0,
                                      name='resume_from_epoch')

    if resume_from_epoch > 0:
        cvae.model.load_weights(model_weight.format(epoch=resume_from_epoch))

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    #callbacks.append(lms_callback)
    if hvd.rank() == 0:
        callbacks.append(cvae.history)
        #callbacks.append(keras.callbacks.TensorBoard('./logs'))
        #callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))

#     callback = EmbeddingCallback(cm_data_train, cvae)
    cvae.train(gen_train,
               validation_data=gen_val,
               batch_size=batch_size,
               epochs=epochs,
               initial_epoch=resume_from_epoch,
               callbacks=callbacks)
    if hvd.rank() == 0:
        cvae.model.save_weights(model_weight.format(epoch=epochs))
        cvae.save(model_file.format(epoch=epochs))
        losses = {'loss': [], 'val_loss': []}
        if resume_from_epoch > 0:
            losses = np.load(loss_file)
        train_losses = np.concatenate([losses['loss'], cvae.history.losses])
        val_losses = np.concatenate(
            [losses['val_loss'], cvae.history.val_losses])
        np.savez(loss_file, loss=train_losses, val_loss=val_losses)

    return cvae
def main():
    parser = argparse.ArgumentParser(
        description='Keras Fashion MNIST Example',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--log-dir', default='./logs',
                        help='tensorboard log directory')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='input batch size for training')
    parser.add_argument('--val-batch-size', type=int, default=32,
                        help='input batch size for validation')
    parser.add_argument('--epochs', type=int, default=40,
                        help='number of epochs to train')
    parser.add_argument('--base-lr', type=float, default=0.01,
                        help='learning rate for a single GPU')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='SGD momentum')
    parser.add_argument('--wd', type=float, default=0.000005,
                        help='weight decay')
    # TODO: Step 9 part 1: register `--warmup-epochs`
    parser.add_argument('--warmup-epochs', type=float, default=5,
                        help='number of warmup epochs')

    GRAPHDEF_FILE = 'graphdef'
    parser.add_argument(
        '--savegraph', action='store', nargs='?',
        const=GRAPHDEF_FILE,
        help='Save graphdef pb and pbtxt files. '
        '(default: {})'.format(GRAPHDEF_FILE))

    parser.add_argument(
        '--profrun', action='store_true',
        help='Run for nsys/dlprof profiling. Runs only a few steps.')

    args = parser.parse_args()

    # Checkpoints will be written in the log directory.
    args.checkpoint_format = \
        os.path.join(args.log_dir, 'checkpoint-{epoch}.h5')

    print('AMP MIXED', os.environ.get("TF_ENABLE_AUTO_MIXED_PRECISION"))

    # TODO: Step 2 work here: initialize horovod
    hvd.init()

    # TODO: Step 3 work here: pin GPUs
    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # TODO: Step 4 work here: broadcast `resume_from_epoch` from first process
    # to all others
    with tf.Session(config=config):
        resume_from_epoch = \
            hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # TODO: Step 5 work here: only set `verbose` to `1` if this is the
    # first worker
    verbose = 1 if hvd.rank() == 0 else 0

    # Input image dimensions
    img_rows, img_cols = 28, 28
    num_classes = 10

    # Download and load FASHION MNIST dataset.
    if hvd.rank() == 0:
        # Load Fashion MNIST data.
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    with tf.Session(config=config):
        # Horovod: a no-op allreduce acts as a barrier, so the other ranks
        # wait here until rank 0 has finished downloading/unzipping.
        hvd.allreduce([0], name="Barrier")

    if hvd.rank() != 0:
        # Load Fashion MNIST data.
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    if K.image_data_format() == 'channels_first':
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    # Convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # Training data iterator.
    train_gen = image.ImageDataGenerator(
        featurewise_center=True, featurewise_std_normalization=True,
        horizontal_flip=True, width_shift_range=0.2, height_shift_range=0.2)
    train_gen.fit(x_train)
    train_iter = train_gen.flow(x_train, y_train, batch_size=args.batch_size)

    # Validation data iterator.
    test_gen = image.ImageDataGenerator(
        featurewise_center=True, featurewise_std_normalization=True)
    test_gen.mean = train_gen.mean
    test_gen.std = train_gen.std
    test_iter = test_gen.flow(x_test, y_test, batch_size=args.val_batch_size)

    # Horovod: scale the single-GPU learning rate by the number of workers.
    base_lr = args.base_lr
    LR = base_lr * hvd.size()

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # if resume_from_epoch > 0 and hvd.rank() == 0:
    if resume_from_epoch > 0:
        # TODO: Step 6 work here: only execute the `if` statement if this is
        # the first worker
        # If this is only done in rank 0 get following errors:
        #     horovod/common/operations.cc:764] One or more tensors were
        #     submitted to be reduced, gathered or broadcasted by subset of
        #     ranks and are waiting for remainder of ranks
        model = keras.models.load_model(
            args.checkpoint_format.format(epoch=resume_from_epoch))
    else:
        # Set up standard WideResNet-16-10 model.
        model = WideResidualNetwork(
            depth=16, width=10, weights=None, input_shape=input_shape,
            classes=num_classes, dropout_rate=0.01)

        # WideResNet model that is included with Keras is optimized for
        # inference. Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5

        model = keras.models.Model.from_config(model_config)

        # TODO: Step 7 part 1 work here: increase the base learning rate by the
        # number of workers
        opt = keras.optimizers.SGD(
            lr=LR, momentum=args.momentum)

        # TODO: Step 7 part 2 work here: Wrap the optimizer in a Horovod
        # distributed optimizer
        opt_dist = hvd.DistributedOptimizer(opt)

        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=opt_dist,
                      metrics=['accuracy'])

    def lr_schedule(epoch):
        # global LR
        if epoch < 15:
            return LR
        if epoch < 25:
            return 1e-1 * LR
        if epoch < 35:
            return 1e-2 * LR
        return 1e-3 * LR
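    # Example arithmetic (assuming 4 workers and the default base_lr of 0.01):
    # LR = 0.04 up to epoch 15, then 0.004, 0.0004, and 0.00004.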

    warmup_epochs = args.warmup_epochs
    callbacks = [
        # TODO: Step 8: broadcast initial variable states from the first
        # worker to all others
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # TODO: Step 12: average the metrics among workers at the end of every
        # epoch
        hvd.callbacks.MetricAverageCallback(),

        # TODO: Step 9 part 2: implement a LR warmup over `args.warmup_epochs`
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=warmup_epochs, verbose=verbose),

        # TODO: Step 9 part 3: replace with the Horovod learning rate
        # scheduler, taking care not to start until after warmup is complete
        hvd.callbacks.LearningRateScheduleCallback(
            lr_schedule, start_epoch=warmup_epochs)
    ]

    if hvd.rank() == 0:
        # TODO: Step 10: only append these 2 callbacks to `callbacks` if they
        # are to be executed by the first worker
        callbacks.append(
            keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Train the model.
    number_of_workers = hvd.size()
    steps_per_epoch = len(train_iter) // number_of_workers
    validation_steps = 3 * len(test_iter) // number_of_workers

    # For profiling runs, train only a few steps.
    if args.profrun:
        steps_per_epoch = 4

    model.fit_generator(train_iter,
                        # TODO: Step 11 part 1: keep the total number of steps
                        # the same in spite of an increased number of workers
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=number_of_workers,
                        initial_epoch=resume_from_epoch,
                        validation_data=test_iter,
                        # TODO: Step 11 part 2: Set this value to be
                        # 3 * num_test_iterations / number_of_workers
                        validation_steps=validation_steps)

    # Evaluate the model on the full data set.
    score = model.evaluate_generator(test_iter, len(test_iter),
                                     workers=number_of_workers)

    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

    if hvd.rank() == 0 and args.savegraph:
        graphdef_file = args.savegraph

        session = K.get_session()
        graph_def = session.graph.as_graph_def()
        with open('{}.pb'.format(graphdef_file), 'wb') as f:
            f.write(graph_def.SerializeToString())
        with open('{}.pbtxt'.format(graphdef_file), 'w') as f:
            f.write(str(graph_def))
Example #9
def train_and_predict():
    
    # Horovod: initialize Horovod.
    hvd.init()

    # Configure CPU thread parallelism for this process (no GPU pinning here).
    config = tf.ConfigProto()
    config.intra_op_parallelism_threads = 10
    config.inter_op_parallelism_threads = 1
    K.set_session(tf.Session(config=config))

    print('-'*30)
    print('Loading and preprocessing train data...')
    print('-'*30)
    imgs_train, imgs_mask_train = load_train_data()
    imgs_mask_train = imgs_mask_train[..., np.newaxis]
    #imgs_train = preprocess(imgs_train,'I')
    #imgs_mask_train = preprocess(imgs_mask_train,'M')
    print(imgs_train.shape)
    print(imgs_mask_train.shape)
    imgs_train = imgs_train.astype('float32')
    #mean = np.mean(imgs_train)  # mean for data centering
    #std = np.std(imgs_train)  # std for data normalization

    #imgs_train -= mean
    #imgs_train /= std
    imgs_train /= 255.  # scale images to [0, 1]

    imgs_mask_train = imgs_mask_train.astype('float32')
    imgs_mask_train /= 255.  # scale masks to [0, 1]

    print('-'*30)
    print('Creating and compiling model...')
    print('-'*30)
   
    #resume_from_epoch = 0
    #for try_epoch in range(100, 0, -1):
    #    if os.path.exists('/workspace/checkpoint-{epoch}.h5'.format(epoch=try_epoch)):
    #       resume_from_epoch = try_epoch
    #       break
    resume_from_epoch = int(sys.argv[1])
    print('resume_from_epoch:', resume_from_epoch)
    # Horovod: broadcast resume_from_epoch from rank 0 to the other ranks.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')
    
    verbose = 1 if hvd.rank() == 0 else 0
    
    if resume_from_epoch > 0 and hvd.rank() == 0:
       model = hvd.load_model('/workspace/nddcheckpoint-{epoch}.h5'.format(epoch=resume_from_epoch),
                              custom_objects={'dice_coef': dice_coef, 'dice_coef_loss': dice_coef_loss})
    else:
       model = get_unet()
    

    print('hvd size:', hvd.size())
    print('learning rate:', 0.00013 * hvd.size())

    print('calculating data start and end indices to distribute data for each worker....')
    # Shard the training set contiguously across workers; the first `remainder`
    # ranks take one extra example. Computed unconditionally so it also works
    # when hvd.size() == 1.
    number_of_examples_per_rank = imgs_train.shape[0] // hvd.size()
    remainder = imgs_train.shape[0] % hvd.size()
    if hvd.rank() < remainder:
       start_index = hvd.rank() * (number_of_examples_per_rank + 1)
       end_index = start_index + number_of_examples_per_rank + 1
    else:
       start_index = hvd.rank() * number_of_examples_per_rank + remainder
       end_index = start_index + number_of_examples_per_rank
    print("Rank's start and end index:", hvd.rank(), start_index, end_index)

    callbacks = [
    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    if hvd.rank() == 0:
       callbacks.append(keras.callbacks.ModelCheckpoint('/workspace/nddcheckpoint-{epoch}.h5',monitor='val_loss', save_best_only=True))
    
    print('-'*30)
    print('Fitting model...')
    print('-'*30)
    model.fit(imgs_train[start_index:end_index], imgs_mask_train[start_index:end_index],
              batch_size=12, epochs=resume_from_epoch + 10, shuffle=True,
              validation_split=0.01, initial_epoch=resume_from_epoch,
              callbacks=callbacks,
              verbose=verbose)
    if hvd.rank() == 0:
       model.save('/workspace/unetmodelfdd.h5', include_optimizer=False)