def main(args):
    """Train (or fine-tune) a segmentation model on one TGS fold with Horovod.

    Reads from ``args``: ``data_path`` (must contain ``"fold_<id>"``),
    ``use_lovasz``, ``model``, ``epochs``, ``batch_size``, ``target_size``,
    ``learning_rate``, ``warmup_epochs`` and ``log_dir``.

    Training resumes from the newest ``weights/<model>/fold_<id>_<epoch>_best.h5``
    file visible to rank 0, if any.
    """
    # Initialize Horovod.
    hvd.init()

    # Pin this process to the GPU matching its local rank.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    is_master = hvd.rank() == 0

    fold = args.data_path.split("fold_")[1]
    if is_master:
        print("================================")
        if args.use_lovasz:
            print("Fine tuning with ")
        print("Fold {}".format(fold))

    # Find the best saved model; `{epoch}` is kept as a placeholder so the
    # same template serves both the lookup below and the checkpoint callback.
    best_model_file = 'weights/{}/fold_{}_{epoch}_best.h5'.format(
        args.model, fold, epoch='{epoch}')
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(best_model_file.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    if is_master:
        print("Last model saved: {}".format(
            best_model_file.format(epoch=resume_from_epoch)))
    # Rank 0's answer is authoritative: other ranks may not see the files.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # Verbose mode for one node only.
    verbose = 1 if is_master else 0

    # Create dataset.
    dataset = TGSDataset(data_path=args.data_path, batch_size=args.batch_size)
    input_shape = (args.target_size, args.target_size)
    mask_shape = (101, 101)
    train_data_generator = dataset.get_train_data_generator(
        input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    val_data_generator = dataset.get_val_data_generator(
        input_size=input_shape, mask_size=mask_shape, seed=np.random.rand())
    # Each worker processes 1/N of the steps.
    train_step_size = dataset.train_step_size // hvd.size()
    val_step_size = dataset.val_step_size // hvd.size()

    # Create model.
    model = make_model(args.model, (args.target_size, args.target_size, 3), 2)

    # Load weights on rank 0 only. The checkpoint is only guaranteed to exist
    # there (see the broadcast above); BroadcastGlobalVariablesCallback(0)
    # below copies the restored weights to every other worker.
    if resume_from_epoch > 0 and is_master:
        model.load_weights(best_model_file.format(epoch=resume_from_epoch))

    # Scale the learning rate by the number of workers.
    size = hvd.size()
    opt = hvd.DistributedOptimizer(
        SGD(lr=args.learning_rate * size, momentum=0.9, nesterov=True))

    # Loss.
    loss = losses.c_lovasz_loss if args.use_lovasz else losses.c_binary_crossentropy
    model.compile(loss=loss,
                  optimizer=opt,
                  metrics=[metrics.c_binary_accuracy, metrics.c_iou])

    # h5 model checkpoint (weights only, best val_loss only).
    best_model = ModelCheckpointMGPU(model,
                                     filepath=best_model_file,
                                     monitor='val_loss',
                                     verbose=1,
                                     mode='min',
                                     period=1,
                                     save_best_only=True,
                                     save_weights_only=True)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first warmup epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs,
                                                 verbose=verbose),
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers
    # from corrupting them.
    if is_master:
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))
        callbacks.append(best_model)

    # Fit model.
    model.fit_generator(train_data_generator,
                        steps_per_epoch=train_step_size,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=4,
                        initial_epoch=resume_from_epoch,
                        validation_data=val_data_generator,
                        validation_steps=val_step_size)

    # The allreduce is a collective: every rank must take part, but only
    # the first worker reports the averaged result.
    score = hvd.allreduce(
        model.evaluate_generator(val_data_generator, val_step_size, workers=4))
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
config = tf.ConfigProto() config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) K.set_session(tf.Session(config=config)) # If set > 0, will resume training from a given checkpoint. resume_from_epoch = 0 for try_epoch in range(args.epochs, 0, -1): if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)): resume_from_epoch = try_epoch break # Horovod: broadcast resume_from_epoch from rank 0 (which will have # checkpoints) to other ranks. resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch') # Horovod: print logs on the first worker. verbose = 1 if hvd.rank() == 0 else 0 # Input image dimensions img_rows, img_cols = 28, 28 num_classes = 10 # Load Fashion MNIST data. (x_train, y_train), (x_test, y_test) = load_data(args.dataset_path) if K.image_data_format() == 'channels_first': x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
def main():
    """Train the model, optionally distributed with Horovod, then evaluate it.

    Behaviour is driven by module-level settings: ``_DISTRIBUTED``, ``_FAKE``,
    ``_VALIDATION``, ``_LR``, ``_EPOCHS``, ``_BATCHSIZE``, ``_NUM_WORKERS``,
    ``_MAX_QUEUE_SIZE`` and ``_MULTIPROCESSING``.
    """
    verbose = 1
    logger = _get_logger()
    if _DISTRIBUTED:
        # Horovod: initialize Horovod.
        hvd.init()
        logger.info("Runnin Distributed")
        # Only the first worker prints progress.
        verbose = 1 if hvd.rank() == 0 else 0

    logger.info("Tensorflow version {}".format(tf.__version__))
    K.set_session(tf.Session(config=_get_runconfig()))

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = 0
    if _DISTRIBUTED:
        resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name="resume_from_epoch")

    if _FAKE:
        train_iter = _fake_data_iterator_from()
    else:
        train_iter = _training_data_iterator_from()
        test_iter = _validation_data_iterator_from() if _VALIDATION else None

    model = _create_model()

    params = {"learning_rate": _LR, "momentum": 0.9}
    opt = _get_optimizer(params)
    model.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=opt,
        metrics=["accuracy", "top_k_categorical_accuracy"],
    )

    model_dir = _get_model_dir()
    checkpoint_format = os.path.join(model_dir, "checkpoint-{epoch}.h5")

    callbacks = _get_hooks()
    callbacks.append(LoggerCallback(logger, len(train_iter) * _BATCHSIZE))

    # Horovod: save checkpoints only on the first worker to prevent other workers
    # from corrupting them.
    if _is_master():
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_format))

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast weights to other workers.
    if resume_from_epoch > 0 and _is_master():
        model.load_weights(checkpoint_format.format(epoch=resume_from_epoch))

    logger.info("Training...")
    # Train the model. Each worker runs 1 / N of the training steps per epoch,
    # where N is the number of workers. No validation data is passed to fit.
    num_workers = hvd.size() if _DISTRIBUTED else 1
    model.fit_generator(
        train_iter,
        steps_per_epoch=len(train_iter) // num_workers,
        callbacks=callbacks,
        epochs=_EPOCHS,
        verbose=verbose,
        workers=_NUM_WORKERS,
        max_queue_size=_MAX_QUEUE_SIZE,
        use_multiprocessing=_MULTIPROCESSING,
        initial_epoch=resume_from_epoch,
    )

    if _FAKE is False and _VALIDATION:
        # Evaluate the model on the full data set.
        with Timer(output=logger.info, prefix="Testing"):
            logger.info("Testing...")
            score = model.evaluate_generator(test_iter, len(test_iter), workers=10)
            # Horovod is only initialised in distributed mode; calling
            # hvd.allreduce without hvd.init() fails, so average across
            # workers only when there are workers to average over.
            if _DISTRIBUTED:
                score = hvd.allreduce(score)
        if verbose:
            print("Test loss:", score[0])
            print("Test accuracy:", score[1])
def main(lossfunction="tversky", lossrate=1e-4, depth=7, optimizer="rms",
         n_filters=32, fixed=False, resnet=False, bands=None, batchnorm=True,
         dropout=False, dropout_rate=0.10, noise=False, noise_rate=0.1,
         ramp=False, earlystop=False):
    """Build, train, evaluate and plot a U-Net, optionally with Horovod.

    Relies on module-level settings ``useHorovod``, ``batch_size``, ``epochs``
    and ``model_name``.

    Args:
        lossfunction: One of "tversky" (default), "dice", "tversky2",
            "tversky3", "bcedice", "focal", "bce", "mse", "rmse". Any other
            value falls back to the default Tversky loss.
        lossrate: Base learning rate; multiplied by ``hvd.size()`` under Horovod.
        depth, n_filters, fixed, resnet, batchnorm, dropout, dropout_rate,
            noise, noise_rate: Passed through to ``unet``.
        optimizer: "adam" selects Adam; anything else selects RMSprop.
        bands: Band indices handed to the data generators. Defaults to
            ``[0, 1, 2, 3, 4, 5]``.
        ramp: Under Horovod, add the stepwise learning-rate schedule.
        earlystop: Add early stopping on ``val_loss``.
    """
    # Avoid a shared mutable default; behaviour for callers is unchanged.
    if bands is None:
        bands = [0, 1, 2, 3, 4, 5]

    # 0 = silent worker, 1 = single process, 2 = Horovod rank 0.
    verbose = 0
    if not useHorovod:
        verbose = 1
    elif hvd.rank() == 0:
        verbose = 2

    if verbose > 0:
        logger.info("using bands %s", bands)

    train_file = '../../data/train'
    val_file = '../../data/val'
    test_file = '../../data/test'

    if verbose > 0:
        logger.info('reading in train data')

    x_train, y_train = readData(train_file)
    x_val, y_val = readData(val_file)
    x_test, y_test = readData(test_file)

    # Index of the example used for the sanity log below; clamped so a small
    # training set does not raise IndexError.
    sample = min(100, x_train.shape[0] - 1)

    if verbose > 0:
        logger.info("Sample data: ")
        logger.info("  Training input : max[0]: %s", np.max(x_train[sample, :, :, 0]))
        logger.info("          min[0]: %s", np.min(x_train[sample, :, :, 0]))
        logger.info("          shape : %s", x_train.shape)
        logger.info("  Train labels:  max[0]: %s  min[1]: %s  dtype: %s",
                    np.max(y_train[sample, :, :, :]),
                    np.min(y_train[sample, :, :, :]),
                    y_train.dtype)
        logger.info(" using loss function %s", lossfunction)

    if lossfunction == "dice":
        loss = dice_loss
    elif lossfunction == "tversky2":
        loss = tversky_loss2(alpha=0.7, beta=0.3)
    elif lossfunction == "tversky3":
        loss = tversky_loss2(alpha=0.2, beta=0.8)
    elif lossfunction == "bcedice":
        loss = bce_dice_loss
    elif lossfunction == "focal":
        loss = focal_loss3(gamma=2)
    elif lossfunction == "bce":
        loss = 'binary_crossentropy'
    elif lossfunction == "mse":
        loss = 'mse'
    elif lossfunction == "rmse":
        # NOTE(review): 'rmse' does not look like a built-in Keras loss
        # identifier; compile() will probably reject it - confirm.
        loss = 'rmse'
    else:
        loss = tversky_loss(alpha=0.3, beta=0.7)

    channels = len(bands) * 3
    model = unet(img_rows=512, img_cols=512, channels=channels, output_channels=1,
                 fixed=fixed, batchnorm=batchnorm, resnet=resnet,
                 n_filters=n_filters, depth=depth, dropout=dropout,
                 dropout_rate=dropout_rate, noise=noise, noise_rate=noise_rate,
                 final_activation='sigmoid', verbose=verbose)

    optimizer_cls = Adam if optimizer == "adam" else RMSprop
    if useHorovod:
        # Scale the learning rate by the number of workers.
        opt = hvd.DistributedOptimizer(optimizer_cls(lr=lossrate * hvd.size()))
    else:
        opt = optimizer_cls(lr=lossrate)

    model.compile(optimizer=opt, loss=loss,
                  metrics=[tversky_coeff(alpha=0.3, beta=0.7), dice_coeff, 'accuracy'])

    if verbose > 0:
        # model.summary() prints and returns None, so capture its output
        # through print_fn instead of logging the return value.
        summary_lines = []
        model.summary(print_fn=lambda line, *_, **__: summary_lines.append(line))
        logger.info("Model Summary:\n%s", "\n".join(summary_lines))
        logger.info("Estimated Model GPU usage: %s GB",
                    get_model_memory_usage(batch_size, model))
        logger.info("Current host memory usage: %s", usage())

        # Serialize model architecture to JSON (single writer only).
        model_json = model.to_json()
        if not os.path.isdir("models"):
            os.makedirs("models")
        model_file = "models/" + model_name + ".json"
        with open(model_file, "w") as json_file:
            json_file.write(model_json)
        logger.info("saved model to %s", model_file)

    callbacks = []
    if useHorovod:
        # Sync initial state from rank 0, average metrics across workers
        # (must precede metric-based callbacks), and warm the LR up.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
        callbacks.append(hvd.callbacks.MetricAverageCallback())
        callbacks.append(hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=5, verbose=verbose))

    if earlystop:
        callbacks.append(EarlyStopping(monitor='val_loss', patience=30,
                                       verbose=verbose, min_delta=1e-4,
                                       restore_best_weights=True))

    if ramp:
        if useHorovod:
            # Horovod: hold the full rate until epoch 40, then reduce it by
            # 10x at the 40th, 70th and 100th epochs.
            callbacks.append(hvd.callbacks.LearningRateScheduleCallback(
                start_epoch=15, end_epoch=40, multiplier=1.))
            callbacks.append(hvd.callbacks.LearningRateScheduleCallback(
                start_epoch=40, end_epoch=70, multiplier=1e-1))
            callbacks.append(hvd.callbacks.LearningRateScheduleCallback(
                start_epoch=70, end_epoch=100, multiplier=1e-2))
            callbacks.append(hvd.callbacks.LearningRateScheduleCallback(
                start_epoch=100, multiplier=1e-3))

    training_bg = generator(x_train, y_train, batch_size, limit=80000, bands=bands)
    val_bg = generator(x_val, y_val, batch_size, limit=40000, bands=bands)
    test_bg = generator(x_test, y_test, batch_size, limit=20000, bands=bands)

    if useHorovod:
        # Give every worker rank 0's ordering for each generator.
        training_bg.order = hvd.broadcast(
            training_bg.order, 0, name='training_bg_order').numpy()
        val_bg.order = hvd.broadcast(val_bg.order, 0, name='val_bg_order').numpy()
        test_bg.order = hvd.broadcast(test_bg.order, 0, name='test_bg_order').numpy()

    if verbose > 0:
        logger.info("Training size: %s : steps : %s",
                    training_bg.length, (training_bg.length // batch_size))
        logger.info("Validation size: %s : steps : %s",
                    val_bg.length, (val_bg.length // batch_size))

    is_master = not useHorovod or hvd.rank() == 0

    # Horovod: save checkpoints only on the first worker to prevent other
    # workers from corrupting them.
    if is_master:
        if not os.path.isdir("checkpoints"):
            os.makedirs("checkpoints")
        callbacks.append(keras.callbacks.ModelCheckpoint(
            './checkpoints/' + model_name + '_checkpoint-{epoch:02d}-{val_loss:.3f}.hdf5',
            monitor='val_loss', save_best_only=True, save_weights_only=True))

    size = 1
    if useHorovod:
        size = hvd.size()

    history = model.fit_generator(
        generator=training_bg,
        steps_per_epoch=(training_bg.length // batch_size) // size,
        epochs=epochs,
        verbose=verbose,
        callbacks=callbacks,
        validation_data=val_bg,
        validation_steps=(val_bg.length // batch_size) // size,
        shuffle=True,
        use_multiprocessing=False,
        workers=2,
        max_queue_size=8)

    if is_master:
        # Serialize weights to HDF5.
        logger.info("saving weights")
        if not os.path.isdir("weights"):
            os.makedirs("weights")
        weights_file = "weights/" + model_name + ".h5"
        model.save_weights(weights_file)
        logger.info("Saved weights to disk %s", weights_file)
        logger.info("evaluating results")

    # Every worker evaluates its 1/size share of the test steps.
    scores = model.evaluate_generator(
        generator=test_bg,
        steps=(test_bg.length // batch_size) // size,
        workers=2,
        max_queue_size=8,
        use_multiprocessing=False,
        verbose=verbose)

    if is_master:
        logger.info('Test scores: %s', scores)

        if not os.path.isdir("images"):
            os.makedirs("images")

        # Each plot gets its own figure; drawing both on the implicit current
        # figure would leave the accuracy curves inside the loss image.
        plt.figure()
        plt.plot(history.history['accuracy'])
        plt.plot(history.history['val_accuracy'])
        plt.title('model accuracy')
        plt.ylabel('accuracy')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.savefig("images/" + model_name + "_acc.png")
        plt.close()

        plt.figure()
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.savefig("images/" + model_name + "_loss.png")
        plt.close()
test_dir = os.path.expanduser('~/imagenet/validation') # Checkpoint format and log directory. checkpoint_format = './checkpoint-{epoch}.h5' log_dir = './logs' # If set > 0, will resume training from a given checkpoint. resume_from_epoch = 0 for try_epoch in range(epochs, 0, -1): if os.path.exists(checkpoint_format.format(epoch=try_epoch)): resume_from_epoch = try_epoch break # Horovod: broadcast resume_from_epoch from rank 0 (which will have # checkpoints) to other ranks. resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch') # Horovod: print logs on the first worker. verbose = 1 if hvd.rank() == 0 else 0 # Training data iterator. train_gen = image.ImageDataGenerator( width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True, preprocessing_function=keras.applications.resnet50.preprocess_input) train_iter = train_gen.flow_from_directory(train_dir, batch_size=batch_size, target_size=(224, 224)) # Validation data iterator. test_gen = image.ImageDataGenerator( zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input) test_iter = test_gen.flow_from_directory(test_dir, batch_size=batch_size,
def main():
    """Train ResNet-50 with Horovod, then evaluate it on the validation set.

    All settings are read from the module-level ``args`` namespace
    (``epochs``, ``checkpoint_format``, ``train``, ``val``, ``train_dir``,
    ``val_dir``, ``batch_size``, ``val_batch_size``, ``fp16_allreduce``,
    ``wd``, ``base_lr``, ``momentum``, ``warmup_epochs``, ``log_dir``).
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    session_config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    session_config.graph_options.optimizer_options.global_jit_level = \
        tf.compat.v1.OptimizerOptions.ON_1
    session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=session_config))

    # Newest epoch that has a checkpoint on disk, or 0 to start from scratch.
    resume_from_epoch = next(
        (epoch for epoch in range(args.epochs, 0, -1)
         if os.path.exists(args.checkpoint_format.format(epoch=epoch))),
        0)

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # Horovod: print logs on the first worker.
    is_first_worker = hvd.rank() == 0
    verbose = 1 if is_first_worker else 0

    # Directory iterators: only their lengths and the final evaluation use them.
    train_gen = image.ImageDataGenerator()
    train_iter = train_gen.flow_from_directory(
        args.train, batch_size=args.batch_size, target_size=(224, 224))

    test_gen = image.ImageDataGenerator()
    test_iter = test_gen.flow_from_directory(
        args.val, batch_size=args.val_batch_size, target_size=(224, 224))

    # TFRecord iterators: these are what model.fit actually consumes.
    train_iter_tf = iterator(args.train_dir)
    val_iter_tf = iterator(args.val_dir)

    # Set up standard ResNet-50 model.
    model = keras.applications.resnet50.ResNet50(weights=None)

    # Horovod: (optional) compression algorithm.
    if args.fp16_allreduce:
        compression = hvd.Compression.fp16
    else:
        compression = hvd.Compression.none

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast both model and
    # optimizer weights to other workers.
    if resume_from_epoch > 0 and is_first_worker:
        checkpoint_path = args.checkpoint_format.format(epoch=resume_from_epoch)
        model = hvd.load_model(checkpoint_path, compression=compression)
    else:
        # ResNet-50 model that is included with Keras is optimized for inference.
        # Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            overrides = layer_config['config']
            if hasattr(layer, 'kernel_regularizer'):
                l2_reg = keras.regularizers.l2(args.wd)
                overrides['kernel_regularizer'] = {
                    'class_name': l2_reg.__class__.__name__,
                    'config': l2_reg.get_config(),
                }
            if type(layer) is keras.layers.BatchNormalization:
                overrides['momentum'] = 0.9
                overrides['epsilon'] = 1e-5
        model = keras.models.Model.from_config(model_config)

        # Horovod: adjust learning rate based on number of GPUs, then wrap the
        # optimizer so gradients are averaged across workers.
        sgd = keras.optimizers.SGD(lr=args.base_lr * hvd.size(),
                                   momentum=args.momentum)
        distributed_sgd = hvd.DistributedOptimizer(sgd, compression=compression)

        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=distributed_sgd,
                      metrics=['accuracy', 'top_k_categorical_accuracy'])

    callbacks = []
    # Horovod: broadcast initial variable states from rank 0 to all other
    # processes, so every worker starts from the same (random or restored) state.
    callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))
    # Horovod: average metrics among workers at the end of every epoch. This
    # must come before any metrics-based callback such as TensorBoard.
    callbacks.append(hvd.callbacks.MetricAverageCallback())
    # Horovod: ramp the learning rate up to `lr * hvd.size()` over the warmup
    # epochs. See https://arxiv.org/abs/1706.02677 for details.
    callbacks.append(hvd.callbacks.LearningRateWarmupCallback(
        warmup_epochs=args.warmup_epochs, verbose=verbose))
    # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th
    # and 80th epochs.
    callbacks.append(hvd.callbacks.LearningRateScheduleCallback(
        start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.))
    callbacks.append(hvd.callbacks.LearningRateScheduleCallback(
        start_epoch=30, end_epoch=60, multiplier=1e-1))
    callbacks.append(hvd.callbacks.LearningRateScheduleCallback(
        start_epoch=60, end_epoch=80, multiplier=1e-2))
    callbacks.append(hvd.callbacks.LearningRateScheduleCallback(
        start_epoch=80, multiplier=1e-3))

    # Horovod: save checkpoints only on the first worker to prevent other
    # workers from corrupting them.
    if is_first_worker:
        callbacks.append(keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    print('---- train len------ :', len(train_iter))
    print('---- test len------ :', len(test_iter))
    total_train_step = len(train_iter)
    total_val_step = len(test_iter)

    # Train the model: each worker runs 1 / N of the training steps and
    # 3 / N of the validation steps, where N is the number of workers.
    # Over-sampling validation raises the chance every example is evaluated.
    world_size = hvd.size()
    model.fit(train_iter_tf,
              steps_per_epoch=total_train_step // world_size,
              callbacks=callbacks,
              epochs=args.epochs,
              verbose=verbose,
              workers=8,
              initial_epoch=resume_from_epoch,
              validation_data=val_iter_tf,
              validation_steps=3 * total_val_step // world_size)

    # Evaluate the model on the full data set and average across workers.
    local_score = model.evaluate_generator(test_iter, len(test_iter), workers=4)
    score = hvd.allreduce(local_score)
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
def run_cvae(cm_file_train, cm_file_val, batch_size=32, hyper_dim=3, epochs=100):
    """Train a CVAE with Horovod and persist weights, model and loss history.

    Args:
        cm_file_train: Passed to ``CVAEGenerator`` for the training data.
        cm_file_val: Passed to ``CVAEGenerator`` for the validation data.
        batch_size: Batch size for both generators and for training.
        hyper_dim: Second argument to the ``CVAE`` constructor.
        epochs: Final epoch number; also the upper bound when searching for a
            ``cvae_weight-{epoch}.h5`` file to resume from.

    Returns:
        The trained ``CVAE`` object.
    """
    hvd.init()

    gen_train = CVAEGenerator(cm_file_train, hvd_size=hvd.size(),
                              batch_size=batch_size, shuffle=True)
    gen_val = CVAEGenerator(cm_file_val, hvd_size=hvd.size(),
                            batch_size=batch_size, shuffle=True)
    input_shape = gen_train.get_shape()

    # Pin this process to the GPU matching its local rank.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    cvae = CVAE(input_shape[1:], hyper_dim, lr=0.001)
    # Wrap the optimizer so gradients are averaged across workers.
    cvae.optimizer = hvd.DistributedOptimizer(cvae.optimizer)
    cvae.model.compile(optimizer=cvae.optimizer, loss=cvae._vae_loss)

    model_weight = 'cvae_weight-{epoch}.h5'
    model_file = 'cvae_model-{epoch}.h5'
    loss_file = 'loss.npz'

    is_master = hvd.rank() == 0

    # Newest epoch with saved weights; rank 0's answer is authoritative.
    resume_from_epoch = 0
    for try_epoch in range(epochs, 0, -1):
        if os.path.exists(model_weight.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # Restore on rank 0 only: the weight file is only guaranteed to exist
    # there, and BroadcastGlobalVariablesCallback(0) below copies the restored
    # state to the other workers.
    if resume_from_epoch > 0 and is_master:
        cvae.model.load_weights(model_weight.format(epoch=resume_from_epoch))

    callbacks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    if is_master:
        callbacks.append(cvae.history)

    cvae.train(gen_train, validation_data=gen_val, batch_size=batch_size,
               epochs=epochs, initial_epoch=resume_from_epoch,
               callbacks=callbacks)

    # Only rank 0 registered cvae.history, so only it writes results.
    if is_master:
        cvae.model.save_weights(model_weight.format(epoch=epochs))
        cvae.save(model_file.format(epoch=epochs))

        # Prepend the loss history of the resumed run when it is available;
        # a missing file simply starts a fresh history instead of crashing
        # after training has already finished.
        previous = {'loss': [], 'val_loss': []}
        if resume_from_epoch > 0 and os.path.exists(loss_file):
            with np.load(loss_file) as stored:
                previous = {key: stored[key] for key in ('loss', 'val_loss')}
        train_losses = np.concatenate([previous['loss'], cvae.history.losses])
        val_losses = np.concatenate([previous['val_loss'], cvae.history.val_losses])
        np.savez(loss_file, loss=train_losses, val_loss=val_losses)

    return cvae
def main():
    """Train a WideResNet-16-10 on Fashion MNIST with Horovod.

    Parses command-line options, resumes from the newest checkpoint in the
    log directory when one exists, trains with learning-rate warmup and a
    stepwise schedule, evaluates, and optionally dumps the graph definition.
    """
    parser = argparse.ArgumentParser(
        description='Keras Fashion MNIST Example',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--log-dir', default='./logs',
                        help='tensorboard log directory')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='input batch size for training')
    parser.add_argument('--val-batch-size', type=int, default=32,
                        help='input batch size for validation')
    parser.add_argument('--epochs', type=int, default=40,
                        help='number of epochs to train')
    parser.add_argument('--base-lr', type=float, default=0.01,
                        help='learning rate for a single GPU')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='SGD momentum')
    parser.add_argument('--wd', type=float, default=0.000005,
                        help='weight decay')
    parser.add_argument('--warmup-epochs', type=float, default=5,
                        help='number of warmup epochs')

    GRAPHDEF_FILE = 'graphdef'
    parser.add_argument(
        '--savegraph', action='store', nargs='?',
        const=GRAPHDEF_FILE,
        help='Save graphdef pb and pbtxt files. '
        '(default: {})'.format(GRAPHDEF_FILE))
    parser.add_argument(
        '--profrun', action='store_true',
        help='Run for nsys/dlprof profiling. Runs only a few steps.')

    args = parser.parse_args()

    # Checkpoints will be written in the log directory.
    args.checkpoint_format = \
        os.path.join(args.log_dir, 'checkpoint-{epoch}.h5')

    print('AMP MIXED', os.environ.get("TF_ENABLE_AUTO_MIXED_PRECISION"))

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # Horovod: broadcast `resume_from_epoch` from the first process to all
    # others, since only rank 0 is guaranteed to see the checkpoints.
    with tf.Session(config=config):
        resume_from_epoch = \
            hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    # Horovod: print logs on the first worker only.
    verbose = 1 if hvd.rank() == 0 else 0

    # Input image dimensions
    img_rows, img_cols = 28, 28
    num_classes = 10

    # Download and load FASHION MNIST dataset: rank 0 goes first so the
    # download/unzip happens once, the allreduce acts as a barrier, and only
    # then do the remaining ranks load the data.
    if hvd.rank() == 0:
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    with tf.Session(config=config):
        hvd.allreduce([0], name="Barrier")

    if hvd.rank() != 0:
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

    if K.image_data_format() == 'channels_first':
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

    # Convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    # Training data iterator.
    train_gen = image.ImageDataGenerator(
        featurewise_center=True, featurewise_std_normalization=True,
        horizontal_flip=True, width_shift_range=0.2, height_shift_range=0.2)
    train_gen.fit(x_train)
    train_iter = train_gen.flow(x_train, y_train, batch_size=args.batch_size)

    # Validation data iterator; reuses the training-set statistics.
    test_gen = image.ImageDataGenerator(
        featurewise_center=True, featurewise_std_normalization=True)
    test_gen.mean = train_gen.mean
    test_gen.std = train_gen.std
    test_iter = test_gen.flow(x_test, y_test, batch_size=args.val_batch_size)

    # Horovod: scale the base learning rate by the number of workers.
    base_lr = args.base_lr
    LR = base_lr * hvd.size()

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker only, using hvd.load_model so the
    # restored optimizer is wrapped in a DistributedOptimizer. A model loaded
    # with plain keras.models.load_model keeps an unwrapped optimizer, so that
    # rank would not join the gradient allreduce (the likely cause of the
    # "submitted ... by subset of ranks" stall seen with rank-0-only loading).
    # The broadcast callback below syncs the other workers.
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(
            args.checkpoint_format.format(epoch=resume_from_epoch))
    else:
        # Set up standard WideResNet-16-10 model.
        model = WideResidualNetwork(
            depth=16, width=10, weights=None, input_shape=input_shape,
            classes=num_classes, dropout_rate=0.01)

        # WideResNet model that is included with Keras is optimized for
        # inference. Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if isinstance(layer, keras.layers.BatchNormalization):
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5

        model = keras.models.Model.from_config(model_config)

        opt = keras.optimizers.SGD(lr=LR, momentum=args.momentum)

        # Horovod: wrap the optimizer in a Horovod distributed optimizer.
        opt_dist = hvd.DistributedOptimizer(opt)

        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=opt_dist,
                      metrics=['accuracy'])

    def lr_schedule(epoch):
        """Return the multiplier applied to the initial learning rate.

        LearningRateScheduleCallback sets ``lr = initial_lr * multiplier(epoch)``,
        so this must return a factor, not an absolute rate (returning ``LR``
        here would make the effective rate ``LR * LR``).
        """
        if epoch < 15:
            return 1.
        if epoch < 25:
            return 1e-1
        if epoch < 35:
            return 1e-2
        return 1e-3

    warmup_epochs = args.warmup_epochs

    callbacks = [
        # Horovod: broadcast initial variable states from the first worker to
        # all others.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average the metrics among workers at the end of every epoch.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: LR warmup over `args.warmup_epochs`.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=warmup_epochs, verbose=verbose),

        # Horovod: stepwise schedule, starting only once warmup is complete.
        hvd.callbacks.LearningRateScheduleCallback(
            multiplier=lr_schedule, start_epoch=warmup_epochs),
    ]

    # Horovod: checkpoint and log on the first worker only, so other workers
    # cannot corrupt the files.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Keep the total number of steps the same regardless of worker count;
    # validation is over-sampled 3x per worker.
    number_of_workers = hvd.size()
    steps_per_epoch = len(train_iter) // number_of_workers
    validation_steps = 3 * len(test_iter) // number_of_workers

    if args.profrun:
        # Profiling runs only need a handful of steps.
        steps_per_epoch = 4

    # Train the model.
    model.fit_generator(train_iter,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        epochs=args.epochs,
                        verbose=verbose,
                        workers=number_of_workers,
                        initial_epoch=resume_from_epoch,
                        validation_data=test_iter,
                        validation_steps=validation_steps)

    # Evaluate the model on the full data set.
    score = model.evaluate_generator(test_iter, len(test_iter),
                                     workers=number_of_workers)
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])

    if hvd.rank() == 0 and args.savegraph:
        graphdef_file = args.savegraph
        session = K.get_session()
        graph_def = session.graph.as_graph_def()
        with open('{}.pb'.format(graphdef_file), 'wb') as f:
            f.write(graph_def.SerializeToString())
        with open('{}.pbtxt'.format(graphdef_file), 'w') as f:
            f.write(str(graph_def))
def train_and_predict():
    """Train the U-Net on this worker's shard of the training data.

    The epoch to resume from is read from ``sys.argv[1]`` (0 starts fresh).
    Training runs for 10 epochs beyond that, checkpointing on rank 0, which
    also saves the final model to ``/workspace/unetmodelfdd.h5``.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Configure the TensorFlow thread pools for this process (no GPU pinning
    # is done here).
    config = tf.ConfigProto()
    config.intra_op_parallelism_threads = 10
    config.inter_op_parallelism_threads = 1
    K.set_session(tf.Session(config=config))

    print('-'*30)
    print('Loading and preprocessing train data...')
    print('-'*30)
    imgs_train, imgs_mask_train = load_train_data()
    # Append a trailing axis to the masks.
    imgs_mask_train = imgs_mask_train[..., np.newaxis]

    print(imgs_train.shape)
    print(imgs_mask_train.shape)

    imgs_train = imgs_train.astype('float32')
    imgs_train /= 255.  # scale images to [0, 1]

    imgs_mask_train = imgs_mask_train.astype('float32')
    imgs_mask_train /= 255.  # scale masks to [0, 1]

    print('-'*30)
    print('Creating and compiling model...')
    print('-'*30)

    resume_from_epoch = int(sys.argv[1])
    print('resume_from_epoch:', resume_from_epoch)
    # Make every worker agree with rank 0 on the epoch to resume from.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0, name='resume_from_epoch')

    verbose = 1 if hvd.rank() == 0 else 0

    # Horovod: restore on the first worker only; the broadcast callback below
    # copies its state to the other workers.
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(
            '/workspace/nddcheckpoint-{epoch}.h5'.format(epoch=resume_from_epoch),
            custom_objects={'dice_coef': dice_coef, 'dice_coef_loss': dice_coef_loss})
    else:
        model = get_unet()

    print('hvd size:', hvd.size())
    print('learning rate:', .00013*hvd.size())

    print('calculating data start and end indices to distribute data for each worker....')
    # Default to the whole data set so a single-worker run has valid bounds
    # (previously the indices were undefined when hvd.size() == 1).
    start_index = 0
    end_index = imgs_train.shape[0]
    if hvd.size() > 1:
        # Split as evenly as possible: the first `remainder` ranks take one
        # extra example each.
        number_of_examples_per_rank = imgs_train.shape[0] // hvd.size()
        remainder = imgs_train.shape[0] % hvd.size()
        if hvd.rank() < remainder:
            start_index = hvd.rank() * (number_of_examples_per_rank + 1)
            end_index = start_index + number_of_examples_per_rank + 1
        else:
            start_index = hvd.rank() * number_of_examples_per_rank + remainder
            end_index = start_index + number_of_examples_per_rank
    print('Rank''s, Start and End Index:', hvd.rank(), start_index, end_index)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    # Checkpoint on the first worker only so workers do not overwrite each other.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint(
            '/workspace/nddcheckpoint-{epoch}.h5',
            monitor='val_loss', save_best_only=True))

    print('-'*30)
    print('Fitting model...')
    print('-'*30)

    model.fit(imgs_train[start_index:end_index],
              imgs_mask_train[start_index:end_index],
              batch_size=12,
              epochs=resume_from_epoch+10,
              shuffle=True,
              validation_split=0.01,
              initial_epoch=resume_from_epoch,
              callbacks=callbacks,
              verbose=verbose)

    if hvd.rank() == 0:
        model.save('/workspace/unetmodelfdd.h5', include_optimizer=False)