def create_model():
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu',
                            input_shape=(150, 150, 3)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Conv2D(128, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    # Horovod: adjust learning rate based on number of GPUs.
    opt = optimizers.SGD(0.01 * hvd.size())

    # Horovod: add Horovod DistributedOptimizer.
    opt = hvd.DistributedOptimizer(opt)

    # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'],
                  experimental_run_tf_function=False)
    return model
def _compile_graph(self, model, loss_func='mse', opt_func='adam'):
    loss_functions = {'mse': 'mean_squared_error',
                      'msle': 'mean_squared_logarithmic_error',
                      'cc': 'categorical_crossentropy',
                      'bce': 'binary_crossentropy'}
                      # 'bce': BinaryCrossentropy()}
                      # 'scc': 'sparse_categorical_crossentropy'} - wants a single output
    opt_functions = {'adam': Adam, 'sgd': SGD, 'rms': RMSprop}
    logger.debug(
        "Using the %s optimizer with a learning rate of %s and the %s loss function"
        % (opt_func, str(self.learning_rate), loss_func))

    if hvd:
        opt = opt_functions[opt_func](lr=self.learning_rate * hvd.size())
        if hvd.rank() == 0:
            logger.debug("Compiling distributed optimizer")
        opt = hvd.DistributedOptimizer(opt)
        self.callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
    else:
        opt = opt_functions[opt_func](lr=self.learning_rate)

    # compile
    model.compile(loss=loss_functions[loss_func], optimizer=opt, metrics=['accuracy'])
    # model.summary()
    plot_model(model, to_file=os.path.join(self.save_dir, '%s.png' % (self.param_name)))
def test_train_model_lr_schedule(self):
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        0.001 * hvd.size(),
        decay_steps=100000,
        decay_rate=0.96,
        staircase=True)
    opt = tf.keras.optimizers.Adam(lr_schedule)
    opt = hvd.DistributedOptimizer(opt)

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(2, input_shape=(3,)))
    model.add(keras.layers.RepeatVector(3))
    model.add(keras.layers.ThresholdedReLU(0.5))
    model.compile(loss=keras.losses.mean_squared_error,
                  optimizer=opt,
                  metrics=[keras.metrics.categorical_accuracy],
                  experimental_run_tf_function=False)

    x = np.random.random((1, 3))
    y = np.random.random((1, 3, 2))

    # No assertions, we just need to verify that it doesn't hang or error
    callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
    model.fit(x,
              y,
              steps_per_epoch=10,
              callbacks=callbacks,
              epochs=1)
def define_model():
    model = models.Sequential()
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(layers.MaxPooling2D((2, 2)))
    model.add(layers.Dropout(0.2))
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(10, activation='softmax'))

    # Horovod: scale the learning rate by the number of workers.
    scaled_lr = 0.001 * hvd.size()
    opt = tf.optimizers.Adam(scaled_lr)
    opt = hvd.DistributedOptimizer(opt,
                                   backward_passes_per_step=1,
                                   average_aggregated_gradients=True)

    # The final layer already applies softmax, so the loss must treat its
    # inputs as probabilities rather than logits.
    model.compile(
        optimizer=opt,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=['accuracy'],
        experimental_run_tf_function=False)
    return model
def create_resnet():
    # Build network
    import keras_resnet_single as networks
    resnet = networks.ResNet.build(
        len(channels), resblocks, [16, 32],
        (125 * granularity, 125 * granularity, len(channels)), granularity)

    # Load saved weights, if indicated
    if args.load_epoch != 0:
        directory = args.save_dir
        if args.save_dir == '':
            directory = expt_name
        model_name = glob.glob('../MODELS/%s/epoch%02d-*.hdf5' % (directory, args.load_epoch))[0]
        #assert len(model_name) == 2
        #model_name = model_name[0].split('.hdf5')[0]+'.hdf5'
        print('Loading weights from file:', model_name)
        resnet.load_weights(model_name)

    #opt = keras.optimizers.Adam(lr=lr_init, epsilon=1.e-5)  # changed eps to match pytorch value
    #opt = keras.optimizers.SGD(lr=lr_init * hvd.size())
    opt = NovoGrad(learning_rate=lr_init * hvd.size())

    # Wrap the optimizer in a Horovod distributed optimizer -> uses hvd.DistributedOptimizer() to compute gradients.
    opt = hvd.DistributedOptimizer(opt)

    # For Horovod: specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    resnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'],
                   experimental_run_tf_function=False)
    #resnet.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    resnet.summary()
    return resnet
def test_load_model_custom_optimizers(self):
    class TestOptimizer(keras.optimizers.RMSprop):
        def __init__(self, **kwargs):
            super(TestOptimizer, self).__init__(**kwargs)

    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        opt = TestOptimizer(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)

        with temppath() as fname:
            model.save(fname)

            custom_optimizers = [TestOptimizer]
            new_model = hvd.load_model(fname, custom_optimizers=custom_optimizers)
            new_opt = new_model.optimizer

        self.assertEqual(type(new_opt).__module__, 'horovod._keras')
        self.assertEqual(type(new_opt).__name__, 'TestOptimizer')
        self._check_optimizer_weights(opt, new_opt)
def check_tf_2(aggregation_frequency: int, average_aggregated_gradients: bool) -> None:
    gpus = tf.config.experimental.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    hvd_optimizer = hvd.DistributedOptimizer(
        optimizer=CustomOptimizer("mine"),
        backward_passes_per_step=aggregation_frequency,
        average_aggregated_gradients=average_aggregated_gradients,
    )
    _ = hvd_optimizer.iterations

    gradients = [tf.constant([float(hvd.rank())])]
    variables = [tf.Variable([0.0])]
    for idx in range(10):
        if _PRE_TF_2_4_0:
            # In TF < 2.4 `_aggregate_gradients()` is called outside of `apply_gradients()`.
            updated_gradients = hvd_optimizer._aggregate_gradients(
                zip(gradients, variables))
            hvd_optimizer.apply_gradients(
                zip(updated_gradients, variables),
                experimental_aggregate_gradients=False)
        else:
            hvd_optimizer.apply_gradients(zip(gradients, variables))

        updated_variable_value = variables[0][0].numpy()
        expected_value = compute_expected_variable_value(
            idx, aggregation_frequency, average_aggregated_gradients)
        assert expected_value == updated_variable_value
        assert idx + 1 == hvd_optimizer.iterations.numpy()
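# Note: `compute_expected_variable_value` is referenced by the check_tf_* helpers but its
# definition is not included in this collection. Below is a minimal sketch, assuming the
# custom optimizer simply adds the all-reduced gradient to the variable; it mirrors the
# inline `compute_expected_value` of the gradient-aggregation test further down. The exact
# name, signature, and rounding behaviour are assumptions.
import math


def compute_expected_variable_value(batch_id, aggregation_frequency,
                                    average_aggregated_gradients):
    # Each rank contributes a gradient equal to its own rank, so one completed aggregation
    # applies the worker-averaged sum of ranks, optionally divided by the aggregation frequency.
    sum_per_aggregation = 0.0
    for _ in range(aggregation_frequency):
        grads_for_batch = float(sum(range(hvd.size())))
        if average_aggregated_gradients:
            grads_for_batch /= float(aggregation_frequency)
        sum_per_aggregation += grads_for_batch / float(hvd.size())
    aggregations_completed = math.floor((batch_id + 1) / aggregation_frequency)
    return aggregations_completed * sum_per_aggregation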
def test_gradient_aggregation(self):
    with self.test_session(config=self.config) as sess:
        class TestingOptimizer(optimizer_v2.OptimizerV2):
            """
            Custom optimizer we use for testing gradient aggregation.
            """
            def get_config(self):
                config = super(TestingOptimizer, self).get_config()
                return config

            def _create_slots(self, var_list):
                pass

            def _resource_apply_dense(self, grad, var, apply_state=None):
                return var.assign_add(grad)

        K.set_session(sess)
        session = tf.compat.v1.keras.backend.get_session(op_input_list=())

        backward_passes_per_step = 4
        hvd_optimizer = hvd.DistributedOptimizer(
            optimizer=TestingOptimizer("test"),
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=True,
        )
        iterations = hvd_optimizer.iterations
        session.run(iterations.initializer)

        def compute_expected_value(batch_id):
            sum_per_aggregation = 0.0
            for _ in range(backward_passes_per_step):
                grads_for_batch = 0.0
                for rank in range(hvd.size()):
                    grads_for_batch += rank

                # Apply `average_aggregated_gradients`.
                grads_for_batch /= float(backward_passes_per_step)

                # Averages across workers.
                sum_per_aggregation += grads_for_batch / float(hvd.size())

            aggregations_completed = math.floor((batch_id + 1) / backward_passes_per_step)
            return aggregations_completed * sum_per_aggregation

        grads = [tf.constant([float(hvd.rank())])]
        variables = [tf.Variable([0.0])]
        session.run(variables[0].initializer)

        allreduce_op = hvd_optimizer._allreduce(grads)
        grads_and_vars = [(allreduce_op[0], variables[0])]
        apply_grads_op = hvd_optimizer.apply_gradients(grads_and_vars)

        for idx in range(10):
            _ = session.run(apply_grads_op)

            assert idx + 1 == session.run(hvd_optimizer.iterations)
            assert session.run(variables[0].read_value()) == compute_expected_value(idx)
def get_model(input_shape, learning_rate, weight_decay, optimizer, momentum, hvd):
    input_tensor = Input(shape=input_shape)
    base_model = keras.applications.resnet50.ResNet50(
        include_top=False,
        weights=None,
        input_tensor=input_tensor,
        input_shape=input_shape,
        classes=None)
    x = Flatten()(base_model.output)
    predictions = Dense(NUM_CLASSES, activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=predictions)

    size = hvd.size()
    if optimizer.lower() == 'sgd':
        opt = SGD(lr=learning_rate * size, decay=weight_decay, momentum=momentum)
    elif optimizer.lower() == 'rmsprop':
        opt = RMSprop(lr=learning_rate * size, decay=weight_decay)
    else:
        opt = Adam(lr=learning_rate * size, decay=weight_decay)

    opt = hvd.DistributedOptimizer(opt)

    model.compile(loss='categorical_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    return model
def model_compile(self):
    optimizer_fn = U.selectOptimizer_keras(self.optimizer_name)

    decay_rate = self.learning_rate / self.num_epochs if self.num_epochs > 0 else 1

    opti_parameters = signature(optimizer_fn).parameters
    params = {}
    if "lr" in opti_parameters:
        params["lr"] = self.learning_rate
    if "epsilon" in opti_parameters:
        params["epsilon"] = self.optimizer_eps
    if "decay" in opti_parameters:
        decay_rate = (
            self.learning_rate / self.num_epochs if self.num_epochs > 0 else 1
        )
        params["decay"] = decay_rate

    self.optimizer = hvd.DistributedOptimizer(optimizer_fn(**params))

    if type(self.loss_metrics) is dict:
        self.model.compile(
            optimizer=self.optimizer,
            loss=self.loss_metrics,
            loss_weights=self.loss_weights,
            metrics=self.metrics_name,
        )
    else:
        self.model.compile(
            optimizer=self.optimizer,
            loss=self.loss_metrics,
            metrics=self.metrics_name,
        )
def train_model(self, backward_passes_per_step):
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        opt = keras.optimizers.RMSprop(lr=0.0001)
        opt = hvd.DistributedOptimizer(
            opt,
            backward_passes_per_step=backward_passes_per_step,
            average_aggregated_gradients=True)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.ThresholdedReLU(0.5))
        model.compile(loss=keras.losses.mean_squared_error,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))

        def generator():
            while 1:
                yield (x, y)

        # No assertions, we just need to verify that it doesn't hang
        callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]
        model.fit_generator(generator(),
                            steps_per_epoch=10,
                            callbacks=callbacks,
                            epochs=0,
                            verbose=0,
                            workers=4,
                            initial_epoch=1)
def check_tf_1(aggregation_frequency: int, average_aggregated_gradients: bool) -> None:
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))
    session = tf.compat.v1.keras.backend.get_session(op_input_list=())

    hvd_optimizer = hvd.DistributedOptimizer(
        optimizer=CustomOptimizer("mine"),
        backward_passes_per_step=aggregation_frequency,
        average_aggregated_gradients=average_aggregated_gradients,
    )
    iterations = hvd_optimizer.iterations
    session.run(iterations.initializer)

    grads = [tf.constant([float(hvd.rank())])]
    variables = [tf.Variable([0.0])]
    session.run(variables[0].initializer)

    allreduce_op = hvd_optimizer._allreduce(grads)
    grads_and_vars = [(allreduce_op[0], variables[0])]
    apply_grads_op = hvd_optimizer.apply_gradients(grads_and_vars)

    for idx in range(10):
        _ = session.run(apply_grads_op)

        expected_value = compute_expected_variable_value(
            idx, aggregation_frequency, average_aggregated_gradients)

        assert idx + 1 == session.run(hvd_optimizer.iterations)
        assert expected_value == session.run(variables[0].read_value())
def test_load_model(self):
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        opt = keras.optimizers.RMSprop(lr=0.0001)
        opt = hvd.DistributedOptimizer(opt)

        model = keras.models.Sequential()
        model.add(keras.layers.Dense(2, input_shape=(3,)))
        model.add(keras.layers.RepeatVector(3))
        model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
        model.compile(loss=keras.losses.MSE,
                      optimizer=opt,
                      metrics=[keras.metrics.categorical_accuracy],
                      sample_weight_mode='temporal')

        x = np.random.random((1, 3))
        y = np.random.random((1, 3, 3))
        model.train_on_batch(x, y)

        with temppath() as fname:
            model.save(fname)

            new_model = hvd.load_model(fname)
            new_opt = new_model.optimizer

        self.assertEqual(type(new_opt).__module__, 'horovod._keras')
        self.assertEqual(type(new_opt).__name__, 'RMSprop')
        self.assertEqual(K.get_value(opt.lr), K.get_value(new_opt.lr))
        self._check_optimizer_weights(opt, new_opt)
def __init__(self, config: Seq2SeqConfig):
    """
    Initialize model for training.

    :param config: seq2seq config from input data
    """
    self.body_count = config.body_count
    self.max_body_length = config.max_body_length
    self.subject_count = config.subject_count
    self.max_subject_length = config.max_subject_length
    self.body_word_to_index = config.body_word_to_index
    self.body_index_to_word = config.body_index_to_word
    self.subject_word_to_index = config.subject_word_to_index
    self.subject_index_to_word = config.subject_index_to_word
    self.config = config.__dict__

    encoder_inputs: Input = Input(shape=(None,), name="encoder_inputs")
    encoder_embedding: Embedding = Embedding(
        input_dim=self.body_count,
        output_dim=self.hidden_units,
        input_length=self.max_body_length,
        name="encoder_embedding",
    )
    encoder_lstm: LSTM = LSTM(units=self.hidden_units, return_state=True, name="encoder_lstm")
    _, encoder_hidden_state, encoder_cell_state = encoder_lstm(encoder_embedding(encoder_inputs))
    encoder_states: List[np.ndarray] = [encoder_hidden_state, encoder_cell_state]

    decoder_inputs: Input = Input(shape=(None, self.subject_count), name="decoder_inputs")
    decoder_lstm: LSTM = LSTM(
        units=self.hidden_units, return_state=True, return_sequences=True, name="decoder_lstm"
    )
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(units=self.subject_count, activation="softmax", name="decoder_dense")
    decoder_outputs = decoder_dense(decoder_outputs)

    # Horovod: add Horovod Distributed Optimizer.
    try:
        optimizer = RMSprop(1.0 * hvd.size())
        optimizer = hvd.DistributedOptimizer(optimizer)
    except ValueError:
        print("Running outside Horovod.")
        optimizer = RMSprop(1.0)

    model: Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=optimizer,
        metrics=["accuracy"],
        experimental_run_tf_function=False,
    )
    self.model = model

    self.encoder_model = Model(encoder_inputs, encoder_states)

    decoder_state_inputs: List[Input] = [Input(shape=(self.hidden_units,)), Input(shape=(self.hidden_units,))]
    decoder_outputs, hidden_state, cell_state = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
    decoder_states: List[Dense] = [hidden_state, cell_state]
    decoder_outputs = decoder_dense(decoder_outputs)
    self.decoder_model = Model([decoder_inputs] + decoder_state_inputs, [decoder_outputs] + decoder_states)
def get_optimizer(self):
    """
    Model compile optimizer

    :return: Return model compile optimizer
    """
    opt = tf.keras.optimizers.SGD(learning_rate=self.learning_rate * hvd.size(),
                                  momentum=0.9)
    opt = hvd.DistributedOptimizer(opt)
    return opt
def get_optimizer(self):
    """
    Model compile optimizer

    :return: Return model compile optimizer
    """
    opt = tf.optimizers.Adam(self.learning_rate * hvd.size())
    opt = hvd.DistributedOptimizer(opt)
    return opt
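# A hypothetical caller for the two get_optimizer() methods above. `trainer` and the 'mse'
# loss are placeholders, not names from the original sources; passing
# `experimental_run_tf_function=False` matches the other compile() calls in this collection
# that ensure Horovod's DistributedOptimizer is used to compute gradients.
model.compile(loss='mse',
              optimizer=trainer.get_optimizer(),
              metrics=['accuracy'],
              experimental_run_tf_function=False)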
def adapt_optimizer(opt):
    if 'str' == opt.__class__.__name__:
        opt = get_optimizer_by_name(opt)
    opt_config = opt.get_config()
    try:
        opt_config['learning_rate'] *= hvd.size()
    except KeyError:
        opt_config['lr'] *= hvd.size()
    return hvd.DistributedOptimizer(opt.from_config(opt_config))
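# Hypothetical usage of adapt_optimizer() above: it accepts either an optimizer name or an
# optimizer instance, and in both cases the learning rate is multiplied by hvd.size() before
# the Horovod wrapper is applied. `get_optimizer_by_name` is assumed to map names such as
# 'adam' to Keras optimizer instances.
import tensorflow as tf

opt_from_name = adapt_optimizer('adam')
opt_from_instance = adapt_optimizer(tf.keras.optimizers.SGD(0.01))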
def wrap_optimizer(backbone_optimizer):
    """
    Wraps an optimizer for parallel GPU usage.

    :param backbone_optimizer:
    :return: wrapped optimizer
    """
    if hvd is None or not is_initialized():
        return backbone_optimizer
    optimizer = hvd.DistributedOptimizer(backbone_optimizer)
    return optimizer
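# The `is_initialized()` guard used above is defined elsewhere; here is a possible sketch,
# assuming it relies on hvd.size() raising ValueError until hvd.init() has been called, the
# same pattern used by the seq2seq constructor earlier in this collection.
def is_initialized():
    if hvd is None:
        return False
    try:
        hvd.size()
        return True
    except ValueError:
        return False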
def handle_fp16_and_distributed_optimizer(optimizer, lr_schedule, hvd_backend=None):
    if hvd_backend == "horovod":
        import horovod.tensorflow.keras as hvd
        from horovod.tensorflow import Compression
    elif hvd_backend == "byteps":
        import byteps.tensorflow.keras as hvd
        from byteps.tensorflow import Compression

    if hvd_backend:
        compression = Compression.none
        if compat.CUSTOM_GLOBAL_FLOATX == "float16":
            compression = Compression.fp16

    if lr_schedule is not None and hvd_backend is None:
        # TODO(ZhaoChengqi): pay attention to API changes
        optimizer._set_hyper("learning_rate", lr_schedule)

    # specify the following scenario
    if compat.CUSTOM_GLOBAL_FLOATX == "float16":
        if compat.IS_PREV_TF_2_4_0:
            from tensorflow.keras.mixed_precision.experimental import LossScaleOptimizer
            from tensorflow.python.keras import backend
            from tensorflow.python.training.experimental.loss_scale import get_loss_scale_weights

            revised_loss_scale = RevisedDynamicLossScale()
            if hvd_backend:
                opt = LossScaleOptimizer(optimizer, loss_scale=1)
                opt = hvd.DistributedOptimizer(opt, compression=compression, sparse_as_dense=True)
                opt._loss_scale = revised_loss_scale
                for weight in get_loss_scale_weights(opt._loss_scale):
                    backend.track_variable(weight)
                opt._track_trackable(opt._loss_scale, 'loss_scale', overwrite=True)
            else:
                opt = LossScaleOptimizer(optimizer, loss_scale=revised_loss_scale)
        else:
            if hvd_backend:
                opt = HorovodDistributedLossScaleOptimizer(
                    inner_optimizer=optimizer,
                    compression=compression,
                    sparse_as_dense=True,
                    hvd_backend=hvd_backend)
            else:
                opt = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
                opt._loss_scale = RevisedDynamicLossScale(
                    initial_loss_scale=2 ** 15, growth_steps=2000, multiplier=2)
                opt._track_trackable(opt._loss_scale, "loss_scale", overwrite=True)

        return opt

    return optimizer
def setup_horovod(self):
    import horovod.tensorflow.keras as hvd
    hvd.init()
    self.model = self.model_creator(self.config)
    compile_args = self.compile_args_creator(self.config)
    compile_args["optimizer"] = hvd.DistributedOptimizer(compile_args["optimizer"])
    self.model.compile(**compile_args)
    self.backend = "horovod"
def test_from_config(self):
    opt = keras.optimizers.Adam()
    hopt = hvd.DistributedOptimizer(opt)
    cfg = hopt.get_config()

    hopt_copy1 = hopt.from_config(cfg)
    self.assertEqual(cfg, hopt_copy1.get_config())

    hopt_copy2 = hopt.__class__.from_config(cfg)
    self.assertEqual(cfg, hopt_copy2.get_config())
def get_unet(lrate=1e-5):
    inputs = Input((IMG_SIZE, IMG_SIZE, 3))
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    conv1 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv1)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)

    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(pool1)
    conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv2)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)

    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(pool2)
    conv3 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv3)
    pool3 = MaxPooling2D(pool_size=(2, 2))(conv3)

    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool3)
    conv4 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv4)
    pool4 = MaxPooling2D(pool_size=(2, 2))(conv4)

    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(pool4)
    conv5 = Conv2D(512, (3, 3), activation='relu', padding='same')(conv5)

    up6 = concatenate([Conv2DTranspose(256, (2, 2), strides=(2, 2), padding='same')(conv5), conv4], axis=3)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(up6)
    conv6 = Conv2D(256, (3, 3), activation='relu', padding='same')(conv6)

    up7 = concatenate([Conv2DTranspose(128, (2, 2), strides=(2, 2), padding='same')(conv6), conv3], axis=3)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(up7)
    conv7 = Conv2D(128, (3, 3), activation='relu', padding='same')(conv7)

    up8 = concatenate([Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(conv7), conv2], axis=3)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(up8)
    conv8 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv8)

    up9 = concatenate([Conv2DTranspose(32, (2, 2), strides=(2, 2), padding='same')(conv8), conv1], axis=3)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(up9)
    conv9 = Conv2D(32, (3, 3), activation='relu', padding='same')(conv9)

    convLast = BatchNormalization()(conv9)
    conv10 = Conv2D(1, (1, 1), activation='sigmoid')(convLast)

    model = Model(inputs=[inputs], outputs=[conv10])
    model.compile(loss=d_loss,
                  metrics=[d_coef],
                  optimizer=hvd.DistributedOptimizer(Adam(lr=lrate)),
                  experimental_run_tf_function=False)

    if hvd.rank() == 0:
        model.summary()
    return model
def _build_model(self):
    # Build a three-layer neural network
    """Build Neural Net for Deep Q-learning Model"""
    model = Sequential()
    model.add(Dense(24, input_dim=self.state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(self.action_size, activation='linear'))
    # model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
    model.compile(loss='mse',
                  optimizer=htk.DistributedOptimizer(Adam(lr=self.learning_rate)))
    return model
def train_hvd(learning_rate=1.0):
    # Tensorflow has given up on pickling. We need to explicitly import its modules inside workers
    from tensorflow.keras import backend as K
    from tensorflow.keras.models import Sequential
    import tensorflow as tf
    from tensorflow import keras
    import horovod.tensorflow.keras as hvd

    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=config))

    (x_train, y_train), (x_test, y_test) = get_dataset(num_classes, hvd.rank(), hvd.size())
    model = get_model(num_classes)

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = keras.optimizers.Adadelta(lr=learning_rate * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    callbacks = [
        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    ]

    # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
    if hvd.rank() == 0:
        callbacks.append(keras.callbacks.ModelCheckpoint(checkpoint_dir + '/checkpoint-{epoch}.ckpt',
                                                         save_weights_only=True))

    model.fit(x_train, y_train,
              batch_size=batch_size,
              callbacks=callbacks,
              epochs=epochs,
              verbose=2,
              validation_data=(x_test, y_test))
def setup_horovod(self):
    import horovod.tensorflow.keras as hvd
    hvd.init()
    self.model = self.model_creator(self.config)
    compile_args = self.compile_args_creator(self.config)
    compile_args["optimizer"] = hvd.DistributedOptimizer(compile_args["optimizer"])
    self.model.compile(**compile_args)
    self.backend = "horovod"
    self.size = hvd.size()
    self.rank = hvd.rank()

    from tensorflow.python.distribute import distribution_strategy_context as ds_context
    self.strategy = ds_context.get_strategy()
def get_optimizer(name, distributed=False, **opt_args):  # lr, lr_scaling='linear', n_ranks=1,
    """Configure the optimizer"""

    # Construct the optimizer
    OptType = getattr(keras.optimizers, name)
    opt = OptType(**opt_args)

    # Distributed optimizer wrapper
    if distributed:
        opt = hvd.DistributedOptimizer(opt)

    return opt
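# Hypothetical call to get_optimizer() above: every extra keyword argument is forwarded to
# the resolved Keras optimizer class, so learning-rate scaling by hvd.size() remains the
# caller's responsibility.
opt = get_optimizer('Adam', distributed=True, learning_rate=0.001 * hvd.size())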
def test_from_config(self):
    with self.test_session(config=self.config) as sess:
        K.set_session(sess)

        opt = keras.optimizers.Adam()
        hopt = hvd.DistributedOptimizer(opt)
        cfg = hopt.get_config()

        hopt_copy1 = hopt.from_config(cfg)
        self.assertEqual(cfg, hopt_copy1.get_config())

        hopt_copy2 = hopt.__class__.from_config(cfg)
        self.assertEqual(cfg, hopt_copy2.get_config())
def train(data_dir=None, output_dir=None, model_dir=None, epochs=1, learning_rate=0.01,
          beta_1=0.9, beta_2=0.99, epsilon=1e-07, optimizer='Adam'):
    dataset = EMGrapheneDataset(data_dir=data_dir)

    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate,
                                   beta_1=beta_1,
                                   beta_2=beta_2,
                                   epsilon=epsilon,
                                   amsgrad=False,
                                   name=optimizer)
    opt = hvd.DistributedOptimizer(opt)

    loss = tf.keras.losses.MeanSquaredError()

    model = autoencoder(dataset.input_shape)
    model.compile(loss=loss, optimizer=opt, experimental_run_tf_function=False)

    hooks = [
        hvd.callbacks.BroadcastGlobalVariablesCallback(0),
        hvd.callbacks.MetricAverageCallback(),
    ]

    if hvd.rank() == 0:
        # These hooks only need to be called by one instance.
        # Therefore we need to only add them on rank == 0
        tracker_hook = TrackingCallback(output_dir, 256, False)
        hooks.append(tracker_hook)

    model.fit(dataset.to_dataset(), epochs=epochs, callbacks=hooks)

    if hvd.rank() == 0:
        model_dir = Path(model_dir)
        weight_path = str(model_dir / 'weights')
        os.mkdir(weight_path)
        weights_file = str(model_dir / 'weights/final_weights.h5')
        model.save_weights(weights_file)

        os.mkdir(model_dir / 'models')
        model_path = str(model_dir / "models")
        model.save(model_path)

        print("weight path: ", os.listdir(weight_path))
        print("models path: ", os.listdir(model_path))
def create_model():
    opt = keras.optimizers.SGD(lr=0.01 * hvd.size(), momentum=0.9)
    opt = hvd.DistributedOptimizer(opt)

    model = keras.models.Sequential()
    model.add(keras.layers.Dense(2, input_shape=(3,)))
    model.add(keras.layers.RepeatVector(3))
    model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
    model.compile(loss=keras.losses.MSE,
                  optimizer=opt,
                  metrics=[keras.metrics.categorical_accuracy],
                  sample_weight_mode='temporal')
    return model
def test_sparse_as_dense(self):
    opt = keras.optimizers.RMSprop(lr=0.0001)
    opt = hvd.DistributedOptimizer(opt, sparse_as_dense=True)

    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(1000, 64, input_length=10))
    model.compile(loss=keras.losses.mean_squared_error,
                  optimizer=opt,
                  experimental_run_tf_function=False)

    x = np.random.randint(1000, size=(32, 10))
    y = np.random.random((32, 10, 64))
    # No assertions, we just need to verify that it doesn't hang
    model.train_on_batch(x, y)