def test_load_model(self):
    """Save a model wrapped in DistributedOptimizer and verify hvd.load_model
    restores an optimizer of the same class, lr, and weights."""
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        dist_opt = hvd.DistributedOptimizer(keras.optimizers.RMSprop(lr=0.0001))

        # Small sequential model: (3,) -> (3, 3) so 'temporal' sample weights apply.
        model = keras.models.Sequential([
            keras.layers.Dense(2, input_shape=(3,)),
            keras.layers.RepeatVector(3),
            keras.layers.TimeDistributed(keras.layers.Dense(3)),
        ])
        model.compile(loss=keras.losses.MSE,
                      optimizer=dist_opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        inputs = np.random.random((1, 3))
        targets = np.random.random((1, 3, 3))
        # One training step so the optimizer has slot variables to round-trip.
        model.train_on_batch(inputs, targets)

        with temppath() as fname:
            model.save(fname)
            reloaded = hvd.load_model(fname)

        restored_opt = reloaded.optimizer
        self.assertEqual(type(restored_opt).__module__, 'horovod._keras')
        self.assertEqual(type(restored_opt).__name__, 'RMSprop')
        self.assertEqual(K.get_value(dist_opt.lr), K.get_value(restored_opt.lr))
        self._check_optimizer_weights(dist_opt, restored_opt)
def test_load_model_custom_optimizers(self):
    """hvd.load_model must resolve a user-defined optimizer class passed
    through the custom_optimizers argument."""
    class TestOptimizer(keras.optimizers.RMSprop):
        def __init__(self, **kwargs):
            super(TestOptimizer, self).__init__(**kwargs)

    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        dist_opt = hvd.DistributedOptimizer(TestOptimizer(lr=0.0001))

        model = keras.models.Sequential([
            keras.layers.Dense(2, input_shape=(3,)),
            keras.layers.RepeatVector(3),
            keras.layers.TimeDistributed(keras.layers.Dense(3)),
        ])
        model.compile(loss=keras.losses.MSE,
                      optimizer=dist_opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        features = np.random.random((1, 3))
        labels = np.random.random((1, 3, 3))
        # Train once so optimizer state exists before serialization.
        model.train_on_batch(features, labels)

        with temppath() as fname:
            model.save(fname)
            reloaded = hvd.load_model(fname, custom_optimizers=[TestOptimizer])

        restored_opt = reloaded.optimizer
        self.assertEqual(type(restored_opt).__module__, 'horovod._keras')
        self.assertEqual(type(restored_opt).__name__, 'TestOptimizer')
        self._check_optimizer_weights(dist_opt, restored_opt)
def test_load_model(self):
    """Save a model wrapped in DistributedOptimizer and verify hvd.load_model
    restores an optimizer of the same class, lr, and weights.

    Fix: the temporary checkpoint from tempfile.mkstemp was only removed on
    the success path; if save/load raised, the file leaked. The cleanup is
    now in a finally block.
    """
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        opt = keras.optimizers.RMSprop(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3, )))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        # One training step so the optimizer has slot variables to round-trip.
        model.train_on_batch(x, y)

        _, fname = tempfile.mkstemp('.h5')
        try:
            model.save(fname)
            new_model = hvd.load_model(fname)
            new_opt = new_model.optimizer
        finally:
            # Always remove the temp checkpoint, even when save/load fails.
            os.remove(fname)

        self.assertEqual(type(new_opt).__module__, 'horovod._keras')
        self.assertEqual(type(new_opt).__name__, 'RMSprop')
        self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
        self.assertEqual(len(opt.get_weights()), len(new_opt.get_weights()))
        for weights, new_weights in zip(opt.get_weights(),
                                        new_opt.get_weights()):
            self.assertListEqual(weights.tolist(), new_weights.tolist())
def test_load_model_broadcast(self):
    """Rank 0 saves/reloads a checkpoint while other ranks build a fresh model;
    BroadcastGlobalVariablesCallback must sync them without hanging."""
    def create_model():
        sgd = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
        wrapped = hvd.DistributedOptimizer(sgd)
        net = keras.models.Sequential([
            keras.layers.Dense(2, input_shape=(3,)),
            keras.layers.RepeatVector(3),
            keras.layers.TimeDistributed(keras.layers.Dense(3)),
        ])
        net.compile(loss=keras.losses.MSE,
                    optimizer=wrapped,
                    metrics=[keras.metrics.categorical_accuracy],
                    sample_weight_mode='temporal')
        return net

    with self.test_session(config=self.config) as sess:
        K.set_session(sess)
        model = create_model()
        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)
        if hvd.rank() == 0:
            # Only the first worker checkpoints.
            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

    K.clear_session()
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)
        if hvd.rank() == 0:
            model = hvd.load_model(fname)
            os.remove(fname)
        else:
            model = create_model()

        def generator():
            while 1:
                yield (x, y)

        # Rank 0 reloaded optimizer state (5 weights); others start empty.
        expected = 5 if hvd.rank() == 0 else 0
        self.assertEqual(len(model.optimizer.weights), expected)

        # No assertions on values; we only verify the broadcast doesn't hang.
        callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
        model.fit_generator(generator(),
                            steps_per_epoch=10,
                            callbacks=callbacks,
                            epochs=0,
                            verbose=0,
                            workers=4,
                            initial_epoch=1)

        # After broadcast, every rank has the full optimizer state.
        self.assertEqual(len(model.optimizer.weights), 5)
def test_load_model_custom_objects(self):
    """Verify hvd.load_model honors user-supplied custom_objects that wrap a
    custom optimizer in DistributedOptimizer.

    Fix: the temporary checkpoint from tempfile.mkstemp was only removed on
    the success path; if save/load raised, the file leaked. The cleanup is
    now in a finally block.
    """
    hvd.init()

    class TestOptimizer(keras.optimizers.RMSprop):
        def __init__(self, **kwargs):
            super(TestOptimizer, self).__init__(**kwargs)

    with self.test_session() as sess:
        K.set_session(sess)

        opt = TestOptimizer(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3, )))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        # One training step so the optimizer has slot variables to round-trip.
        model.train_on_batch(x, y)

        _, fname = tempfile.mkstemp('.h5')
        try:
            model.save(fname)
            custom_objects = {
                'TestOptimizer': lambda **kwargs: hvd.DistributedOptimizer(
                    TestOptimizer(**kwargs))
            }
            new_model = hvd.load_model(fname, custom_objects=custom_objects)
            new_opt = new_model.optimizer
        finally:
            # Always remove the temp checkpoint, even when save/load fails.
            os.remove(fname)

        self.assertEqual(type(new_opt).__module__, 'horovod.keras.impl')
        self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
        self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
        self.assertEqual(len(opt.get_weights()), len(new_opt.get_weights()))
        for weights, new_weights in zip(opt.get_weights(),
                                        new_opt.get_weights()):
            self.assertListEqual(weights.tolist(), new_weights.tolist())
def test_load_model_custom_objects(self):
    """Verify hvd.load_model honors user-supplied custom_objects that wrap a
    custom optimizer in DistributedOptimizer.

    Fix: the temporary checkpoint from tempfile.mkstemp was only removed on
    the success path; if save/load raised, the file leaked. The cleanup is
    now in a finally block.
    """
    hvd.init()

    class TestOptimizer(keras.optimizers.RMSprop):
        def __init__(self, **kwargs):
            super(TestOptimizer, self).__init__(**kwargs)

    with self.test_session() as sess:
        K.set_session(sess)

        opt = TestOptimizer(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        # One training step so the optimizer has slot variables to round-trip.
        model.train_on_batch(x, y)

        _, fname = tempfile.mkstemp('.h5')
        try:
            model.save(fname)
            custom_objects = {
                'TestOptimizer': lambda **kwargs: hvd.DistributedOptimizer(
                    TestOptimizer(**kwargs))
            }
            new_model = hvd.load_model(fname, custom_objects=custom_objects)
            new_opt = new_model.optimizer
        finally:
            # Always remove the temp checkpoint, even when save/load fails.
            os.remove(fname)

        self.assertEqual(type(new_opt).__module__, 'horovod.keras')
        self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
        self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
        self.assertEqual(len(opt.get_weights()), len(new_opt.get_weights()))
        for weights, new_weights in zip(opt.get_weights(),
                                        new_opt.get_weights()):
            self.assertListEqual(weights.tolist(), new_weights.tolist())
height_shift_range=0.2) train_gen.fit(x_train) train_iter = train_gen.flow(x_train, y_train, batch_size=args.batch_size) # Validation data iterator. test_gen = image.ImageDataGenerator(featurewise_center=True, featurewise_std_normalization=True) test_gen.mean = train_gen.mean test_gen.std = train_gen.std test_iter = test_gen.flow(x_test, y_test, batch_size=args.val_batch_size) # Restore from a previous checkpoint, if initial_epoch is specified. # Horovod: restore on the first worker which will broadcast both model and optimizer weights # to other workers. if resume_from_epoch > 0 and hvd.rank() == 0: model = hvd.load_model( args.checkpoint_format.format(epoch=resume_from_epoch)) else: # Set up standard WideResNet-16-10 model. model = WideResidualNetwork(depth=16, width=10, weights=None, input_shape=input_shape, classes=num_classes, dropout_rate=0.01) # WideResNet model that is included with Keras is optimized for inference. # Add L2 weight decay & adjust BN settings. model_config = model.get_config() for layer, layer_config in zip(model.layers, model_config['layers']): if hasattr(layer, 'kernel_regularizer'): regularizer = keras.regularizers.l2(args.wd)
target_size=(224, 224)) # Set up standard ResNet-50 model. model = keras.applications.resnet50.ResNet50(weights=None) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: adjust learning rate based on number of GPUs. initial_lr = args.base_lr * hvd.size() # Restore from a previous checkpoint, if initial_epoch is specified. # Horovod: restore on the first worker which will broadcast both model and optimizer weights # to other workers. if resume_from_epoch > 0 and hvd.rank() == 0: model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch), compression=compression) else: # ResNet-50 model that is included with Keras is optimized for inference. # Add L2 weight decay & adjust BN settings. model_config = model.get_config() for layer, layer_config in zip(model.layers, model_config['layers']): if hasattr(layer, 'kernel_regularizer'): regularizer = keras.regularizers.l2(args.wd) layer_config['config']['kernel_regularizer'] = \ {'class_name': regularizer.__class__.__name__, 'config': regularizer.get_config()} if type(layer) == keras.layers.BatchNormalization: layer_config['config']['momentum'] = 0.9 layer_config['config']['epsilon'] = 1e-5 model = keras.models.Model.from_config(model_config)
def test_load_model_broadcast(self):
    """Rank 0 saves/reloads a checkpoint while other ranks build a fresh model;
    BroadcastGlobalVariablesCallback must sync them without hanging."""
    hvd.init()

    def create_model():
        sgd = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
        wrapped = hvd.DistributedOptimizer(sgd)
        net = keras.models.Sequential([
            keras.layers.Dense(2, input_shape=(3,)),
            keras.layers.RepeatVector(3),
            keras.layers.TimeDistributed(keras.layers.Dense(3)),
        ])
        net.compile(loss=keras.losses.MSE,
                    optimizer=wrapped,
                    metrics=[keras.metrics.categorical_accuracy],
                    sample_weight_mode='temporal')
        return net

    with self.test_session() as sess:
        K.set_session(sess)
        model = create_model()
        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)
        if hvd.rank() == 0:
            # Only the first worker checkpoints.
            _, fname = tempfile.mkstemp('.h5')
            model.save(fname)

    K.clear_session()
    with self.test_session() as sess:
        K.set_session(sess)
        if hvd.rank() == 0:
            model = hvd.load_model(fname)
            os.remove(fname)
        else:
            model = create_model()

        def generator():
            while 1:
                yield (x, y)

        # Rank 0 reloaded optimizer state (5 weights); others start empty.
        expected = 5 if hvd.rank() == 0 else 0
        self.assertEqual(len(model.optimizer.weights), expected)

        # No assertions on values; we only verify the broadcast doesn't hang.
        callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
        model.fit_generator(generator(),
                            steps_per_epoch=10,
                            callbacks=callbacks,
                            epochs=0,
                            verbose=0,
                            workers=4,
                            initial_epoch=1)

        # After broadcast, every rank has the full optimizer state.
        self.assertEqual(len(model.optimizer.weights), 5)
target_size=(224, 224)) # Validation data iterator. test_gen = image.ImageDataGenerator( zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input) test_iter = test_gen.flow_from_directory(test_dir, batch_size=batch_size, target_size=(224, 224)) # Set up standard ResNet-50 model. model = keras.applications.resnet50.ResNet50(weights=None) # Restore from a previous checkpoint, if initial_epoch is specified. # Horovod: restore on the first worker which will broadcast both model and optimizer weights # to other workers. if resume_from_epoch > 0 and hvd.rank() == 0: model = hvd.load_model(checkpoint_format.format(epoch=resume_from_epoch)) else: # ResNet-50 model that is included with Keras is optimized for inference. # Add L2 weight decay & adjust BN settings. model_config = model.get_config() for layer, layer_config in zip(model.layers, model_config['layers']): if hasattr(layer, 'kernel_regularizer'): regularizer = keras.regularizers.l2(weight_decay) layer_config['config']['kernel_regularizer'] = \ {'class_name': regularizer.__class__.__name__, 'config': regularizer.get_config()} if type(layer) == keras.layers.BatchNormalization: layer_config['config']['momentum'] = 0.9 layer_config['config']['epsilon'] = 1e-5 model = keras.models.Model.from_config(model_config)
def main():
    """Distributed ResNet-50 training entry point.

    Pins one GPU per Horovod process, optionally resumes from the newest
    checkpoint, wraps SGD in DistributedOptimizer, trains from a TFRecord
    iterator, and finally allreduces the evaluation score across workers.
    Relies on module-level ``args``, ``image``, and ``iterator`` — defined
    elsewhere in this file.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.compat.v1.Session(config=config))

    # If set > 0, will resume training from a given checkpoint.
    # Scan from the last epoch backwards so the newest checkpoint wins.
    resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0,
                                      name='resume_from_epoch')

    # Horovod: print logs on the first worker.
    verbose = 1 if hvd.rank() == 0 else 0

    # Training data iterator.
    # NOTE(review): augmentation args below were deliberately commented out;
    # the generators currently apply no preprocessing.
    train_gen = image.ImageDataGenerator()
    #width_shift_range=0.33, height_shift_range=0.33, zoom_range=0.5, horizontal_flip=True,
    #preprocessing_function=keras.applications.resnet50.preprocess_input)
    train_iter = train_gen.flow_from_directory(args.train,
                                               batch_size=args.batch_size,
                                               target_size=(224, 224))

    # Validation data iterator.
    test_gen = image.ImageDataGenerator()
    #zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input)
    test_iter = test_gen.flow_from_directory(args.val,
                                             batch_size=args.val_batch_size,
                                             target_size=(224, 224))

    # train iterator for tfrecord
    # NOTE(review): training reads from these TFRecord iterators; the
    # directory iterators above are only used for step counts / evaluation.
    train_iter_tf = iterator(args.train_dir)
    val_iter_tf = iterator(args.val_dir)

    # timeline
    #timeline = tf.train.ProfilerHook(save_steps=500, output_dir='./timeline')
    #run_options = tf.compat.v1.RunOptions(trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
    #run_metadata = tf.compat.v1.RunMetadata()

    # Set up standard ResNet-50 model.
    model = keras.applications.resnet50.ResNet50(weights=None)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Restore from a previous checkpoint, if initial_epoch is specified.
    # Horovod: restore on the first worker which will broadcast both model and optimizer weights
    # to other workers.
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model(
            args.checkpoint_format.format(epoch=resume_from_epoch),
            compression=compression)
    else:
        # ResNet-50 model that is included with Keras is optimized for inference.
        # Add L2 weight decay & adjust BN settings.
        model_config = model.get_config()
        for layer, layer_config in zip(model.layers, model_config['layers']):
            if hasattr(layer, 'kernel_regularizer'):
                regularizer = keras.regularizers.l2(args.wd)
                layer_config['config']['kernel_regularizer'] = \
                    {'class_name': regularizer.__class__.__name__,
                     'config': regularizer.get_config()}
            if type(layer) == keras.layers.BatchNormalization:
                layer_config['config']['momentum'] = 0.9
                layer_config['config']['epsilon'] = 1e-5
        # Rebuild the model from the edited config so the new regularizers
        # and BN settings take effect.
        model = keras.models.Model.from_config(model_config)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = keras.optimizers.SGD(lr=args.base_lr * hvd.size(),
                               momentum=args.momentum)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt, compression=compression)

    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=opt,
                  metrics=['accuracy', 'top_k_categorical_accuracy'])
    # options=run_options,
    # run_metadata=run_metadata
    # )

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),

        # Horovod: average metrics among workers at the end of every epoch.
        #
        # Note: This callback must be in the list before the ReduceLROnPlateau,
        # TensorBoard, or other metrics-based callbacks.
        hvd.callbacks.MetricAverageCallback(),

        # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
        # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
        # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
        hvd.callbacks.LearningRateWarmupCallback(
            warmup_epochs=args.warmup_epochs, verbose=verbose),

        # Horovod: after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs.
        hvd.callbacks.LearningRateScheduleCallback(
            start_epoch=args.warmup_epochs, end_epoch=30, multiplier=1.),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=30,
                                                   end_epoch=60,
                                                   multiplier=1e-1),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=60,
                                                   end_epoch=80,
                                                   multiplier=1e-2),
        hvd.callbacks.LearningRateScheduleCallback(start_epoch=80,
                                                   multiplier=1e-3),
    ]

    # Horovod: save checkpoints only on the first worker to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(
            keras.callbacks.ModelCheckpoint(args.checkpoint_format))
        callbacks.append(keras.callbacks.TensorBoard(args.log_dir))

    # Train the model. The training will randomly sample 1 / N batches of training data and
    # 3 / N batches of validation data on every worker, where N is the number of workers.
    # Over-sampling of validation data helps to increase probability that every validation
    # example will be evaluated.
    print('---- train len------ :', len(train_iter))
    print('---- test len------ :', len(test_iter))
    total_train_step = len(train_iter)
    total_val_step = len(test_iter)
    #model.fit_generator(train_iter,
    model.fit(
        train_iter_tf,
        #steps_per_epoch=40037 // hvd.size(),
        steps_per_epoch=total_train_step // hvd.size(),
        callbacks=callbacks,
        epochs=args.epochs,
        verbose=verbose,
        workers=8,
        initial_epoch=resume_from_epoch,
        validation_data=val_iter_tf,
        validation_steps=3 * total_val_step // hvd.size())

    # timeline tracing
    #trace = timeline.Timeline(step_stats=run_metadata.step_stats)
    #with open ('./timeline.keras.json','w') as f:
    #    f.write(trace.generate_chrome_trace_format())

    # Evaluate the model on the full data set.
    # Horovod: average the per-worker evaluation scores across all ranks.
    score = hvd.allreduce(
        model.evaluate_generator(test_iter, len(test_iter), workers=4))
    if verbose:
        print('Test loss:', score[0])
        print('Test accuracy:', score[1])
zoom_range=(0.875, 0.875), preprocessing_function=keras.applications.resnet50.preprocess_input) test_iter = test_gen.flow_from_directory(args.val_dir, batch_size=args.val_batch_size, target_size=(224, 224)) # Set up standard ResNet-50 model. model = keras.applications.resnet50.ResNet50(weights=None) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Restore from a previous checkpoint, if initial_epoch is specified. # Horovod: restore on the first worker which will broadcast both model and optimizer weights # to other workers. if resume_from_epoch > 0 and hvd.rank() == 0: model = hvd.load_model(args.checkpoint_format.format(epoch=resume_from_epoch), compression=compression) else: # ResNet-50 model that is included with Keras is optimized for inference. # Add L2 weight decay & adjust BN settings. model_config = model.get_config() for layer, layer_config in zip(model.layers, model_config['layers']): if hasattr(layer, 'kernel_regularizer'): regularizer = keras.regularizers.l2(args.wd) layer_config['config']['kernel_regularizer'] = \ {'class_name': regularizer.__class__.__name__, 'config': regularizer.get_config()} if type(layer) == keras.layers.BatchNormalization: layer_config['config']['momentum'] = 0.9 layer_config['config']['epsilon'] = 1e-5 model = keras.models.Model.from_config(model_config)
def train_and_predict():
    """Train the U-Net segmentation model with Horovod data sharding.

    Loads the training images/masks, scales them to [0, 1], optionally
    resumes from a checkpoint (epoch taken from ``sys.argv[1]``), shards the
    training data across workers, trains, and saves the final model on
    rank 0. Relies on module-level helpers ``load_train_data``, ``get_unet``,
    ``dice_coef``, and ``dice_coef_loss`` defined elsewhere in this file.

    Fix: ``start_index``/``end_index`` were only assigned inside the
    ``hvd.size() > 1`` branch, so a single-worker run crashed with
    ``NameError`` at the following print/fit. A fallback covering the whole
    dataset is now provided for the single-worker case.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.intra_op_parallelism_threads = 10
    config.inter_op_parallelism_threads = 1
    K.set_session(tf.Session(config=config))

    print('-'*30)
    print('Loading and preprocessing train data...')
    print('-'*30)
    imgs_train, imgs_mask_train = load_train_data()
    # Masks need a trailing channel axis for the network.
    imgs_mask_train = imgs_mask_train[..., np.newaxis]
    #imgs_train = preprocess(imgs_train,'I')
    #imgs_mask_train = preprocess(imgs_mask_train,'M')
    # print(imgs_train.shape)
    print(imgs_mask_train.shape)

    imgs_train = imgs_train.astype('float32')
    #mean = np.mean(imgs_train)  # mean for data centering
    #std = np.std(imgs_train)  # std for data normalization
    #imgs_train -= mean
    #imgs_train /= std
    imgs_train /= 255.  # scale masks to [0, 1]

    imgs_mask_train = imgs_mask_train.astype('float32')
    imgs_mask_train /= 255.  # scale masks to [0, 1]

    print('-'*30)
    print('Creating and compiling model...')
    print('-'*30)

    #resume_from_epoch = 0
    #for try_epoch in range(100, 0, -1):
    #    if os.path.exists('/workspace/checkpoint-{epoch}.h5'.format(epoch=try_epoch)):
    #        resume_from_epoch = try_epoch
    #        break
    resume_from_epoch = int(sys.argv[1])
    print('resume_from_epoch:', resume_from_epoch)
    # resume from latest checkpoint file
    resume_from_epoch = hvd.broadcast(resume_from_epoch, 0,
                                      name='resume_from_epoch')
    verbose = 1 if hvd.rank() == 0 else 0
    if resume_from_epoch > 0 and hvd.rank() == 0:
        model = hvd.load_model('/workspace/nddcheckpoint-{epoch}.h5'.format(epoch=resume_from_epoch),
                               custom_objects={'dice_coef': dice_coef,
                                               'dice_coef_loss': dice_coef_loss})
    else:
        model = get_unet()

    print('hvd size:', hvd.size())
    print('learning rate:', .00013*hvd.size())

    print('calculating data start and end indices to distribute data for each worker....')
    if hvd.size() > 1:
        # Spread examples as evenly as possible: the first `remainder` ranks
        # take one extra example each.
        number_of_examples_per_rank = imgs_train.shape[0]//hvd.size()
        remainder = imgs_train.shape[0] % hvd.size()
        if hvd.rank() < remainder:
            start_index = hvd.rank() * (number_of_examples_per_rank+1)
            end_index = start_index + number_of_examples_per_rank + 1
        else:
            start_index = hvd.rank() * number_of_examples_per_rank + remainder
            end_index = start_index + number_of_examples_per_rank
    else:
        # BUGFIX: with a single worker the indices were never assigned,
        # causing a NameError below. Use the full dataset instead.
        start_index = 0
        end_index = imgs_train.shape[0]
    print('Rank''s, Start and End Index:', hvd.rank(), start_index, end_index)

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]
    # Horovod: checkpoint only on the first worker to avoid corruption.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint('/workspace/nddcheckpoint-{epoch}.h5',
                                                         monitor='val_loss',
                                                         save_best_only=True))

    print('-'*30)
    print('Fitting model...')
    print('-'*30)
    model.fit(imgs_train[start_index:end_index],
              imgs_mask_train[start_index:end_index],
              batch_size=12,
              epochs=resume_from_epoch+10,
              shuffle=True,
              validation_split=0.01,
              initial_epoch=resume_from_epoch,
              callbacks=callbacks,
              verbose=1 if hvd.rank() == 0 else 0)
    #verbose=1)

    if hvd.rank() == 0:
        model.save('/workspace/unetmodelfdd.h5', include_optimizer=False)