def init_horovod_if_needed(self):
    for _ in range(DEFAULT_MAX_ALLREDUCE_RETRY_NUM):
        rank_response = self._master_client.get_comm_rank()
        if rank_response.rank_id < 0:
            logger.warning(
                "The master has not added the worker host into "
                "rendezvous yet. Retrying to get rank"
            )
            time.sleep(5)
        else:
            break

    # If the rendezvous id from the master differs from
    # self._rendezvous_id, the worker should rebuild the communication
    # because the master has updated the communication group.
    if rank_response.rendezvous_id != self._rendezvous_id:
        os.environ[HorovodEnv.RENDEZVOUS_PORT] = str(
            rank_response.rendezvous_port
        )
        os.environ[HorovodEnv.RANK] = str(rank_response.rank_id)
        os.environ[HorovodEnv.SIZE] = str(rank_response.world_size)
        hvd.shutdown()
        hvd.init()
        self._world_size = hvd.size()
        self._rendezvous_id = rank_response.rendezvous_id
        self._need_broadcast = True
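# A minimal sketch of how a training loop might drive init_horovod_if_needed()
# above: re-check the rendezvous before every step and re-broadcast the model
# when the communication world has changed. The dataset argument and the
# _broadcast_model() helper are hypothetical and only illustrate the pattern;
# they are not part of the snippet above.
def train_loop_sketch(self, dataset):
    for features, labels in dataset:
        # Pick up a new rendezvous (and re-init Horovod) if the master
        # has changed the communication group.
        self.init_horovod_if_needed()
        if self._need_broadcast:
            # Hypothetical helper: broadcast variables from rank 0 so all
            # workers start the step from identical weights.
            self._broadcast_model()
            self._need_broadcast = False
        self.train_minibatch(features, labels)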
def test_train_minibatch(self):
    self._trainer.init_horovod_if_needed()
    features = tf.constant([[0.5], [0.6], [0.7]])
    labels = tf.constant([[1.0], [0.0], [1.0]])
    _, version, loss = self._trainer.train_minibatch(features, labels)
    self.assertEqual(version, 1)
    self.assertIsNotNone(loss)
    hvd.shutdown()
def test_training_process_with_fault_tolerance(self):
    self._trainer.init_horovod_if_needed()
    features = tf.constant([[0.5], [0.6], [0.7]])
    labels = tf.constant([[1.0], [0.0], [1.0]])
    version, _ = self._trainer.training_process_with_fault_tolerance(
        features, labels)
    # First, the model is called locally to create variables for the model
    # and optimizer. Then the model is trained with Horovod, so the
    # iteration step is 2.
    self.assertEqual(version, 2)
    hvd.shutdown()
def stop_train(rank):
    # hvd shutdown
    logging.info('hvd shutdown at rank %d', rank)
    hvd.shutdown()
def test_static(self):
    mpi_rank, mpi_size = mpi_env_rank_and_size()
    gloo_rank = int(os.getenv('HOROVOD_RANK', -1))
    gloo_size = int(os.getenv('HOROVOD_SIZE', -1))
    is_mpi = gloo_rank == -1

    rank = max(mpi_rank, gloo_rank)
    size = max(mpi_size, gloo_size)

    # This test does not apply if there is only one worker.
    if size == 1:
        self.skipTest("Only one worker available")

    if is_mpi:
        try:
            import mpi4py
            mpi4py.rc.initialize = False
        except ImportError:
            pass

    if rank == 0:
        my_process_sets = [
            hvd.ProcessSet([0]),
            hvd.ProcessSet(range(1, size)),
            hvd.ProcessSet(range(size - 1, -1, -1)),  # duplicate
            hvd.ProcessSet([0]),  # duplicate
        ]
    else:
        my_process_sets = [
            hvd.ProcessSet([0]),
            hvd.ProcessSet(reversed(range(
                1, size))),  # permuting a process set does not matter
            hvd.ProcessSet(range(size - 1, -1, -1)),  # duplicate
            hvd.ProcessSet([0]),  # duplicate
        ]
    with self.assertRaises(ValueError):
        hvd.init(process_sets=my_process_sets)

    if rank == 0:
        my_process_sets = [
            hvd.ProcessSet([0]),
            hvd.ProcessSet(range(1, size)),
        ]
    else:
        my_process_sets = [
            hvd.ProcessSet([0]),
            hvd.ProcessSet(reversed(range(
                1, size))),  # permuting a process set does not matter
        ]
    hvd.init(process_sets=my_process_sets)

    self.assertEqual(hvd.global_process_set.process_set_id, 0)
    self.assertListEqual(hvd.global_process_set.ranks, list(range(size)))

    # Here we test some implementation details (numeric process set id
    # values) using an internal function.
    ps = hvd.mpi_ops._basics._get_process_set_ids_and_ranks()
    self.assertDictEqual(ps, {
        0: list(range(size)),
        1: [0],
        2: list(range(1, size))
    })

    # If another process initiates shutdown while this process is still
    # processing _get_process_set_ids_and_ranks(), a race condition may be
    # triggered. Avoid it with a barrier.
    try:
        if is_mpi:
            # Barrier before shutdown.
            from mpi4py import MPI
            MPI.COMM_WORLD.barrier()
        else:
            time.sleep(0.1)
    except ImportError:
        time.sleep(0.1)
    hvd.shutdown()
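# A minimal, hedged sketch of how process sets like those registered in the
# test above are typically used after hvd.init(): collectives such as
# hvd.allreduce() accept a process_set argument (in Horovod releases that
# support process sets, roughly 0.23+), restricting the reduction to the
# ranks in that set. Nothing here is taken from the test itself; the set and
# tensor are illustrative, and the example assumes a world size of at least 4.
import tensorflow as tf
import horovod.tensorflow as hvd

subset = hvd.ProcessSet(range(1, 4))  # ranks 1, 2, 3
hvd.init(process_sets=[subset])

value = tf.constant([float(hvd.rank())])
if hvd.rank() in subset.ranks:
    # Only ranks belonging to the set may participate in this allreduce.
    reduced = hvd.allreduce(value, process_set=subset)
else:
    reduced = None

hvd.shutdown()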
def main(unused_argv):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race condition among
    # the workers that share the same filesystem. If the directory already
    # exists by the time this worker gets around to creating it, ignore the
    # resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    # (train_data, train_labels), (eval_data, eval_labels) = \
    #     keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())
    f = np.load("/opt/ml/input/data/training/mnist.npz")
    train_data, train_labels = f['x_train'], f['y_train']
    eval_data, eval_labels = f['x_test'], f['y_test']
    print('train_labels: ', train_labels[0], 'rank: ', hvd.rank())

    # The shape of the downloaded data is (-1, 28, 28), so we need to reshape
    # it into (-1, 784) to feed into our network. Also normalize the features
    # between 0 and 1.
    train_data = np.reshape(train_data, (-1, 784)) / 255.0
    eval_data = np.reshape(eval_data, (-1, 784)) / 255.0

    # Horovod: pin GPU to be used to process local rank (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    starttime = time.time()

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    model_dir = '/opt/ml/model/' if hvd.rank() == 0 else None

    # Create the Estimator.
    mnist_classifier = tf.estimator.Estimator(
        model_fn=cnn_model_fn,
        model_dir=model_dir,
        config=tf.estimator.RunConfig(session_config=config))

    # Set up logging for predictions.
    # Log the values in the "Softmax" tensor with label "probabilities".
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=1000)

    # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
    # from rank 0 to all other processes. This is necessary to ensure
    # consistent initialization of all workers when training is started with
    # random weights or restored from a checkpoint.
    bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Train the model.
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data},
                                                        y=train_labels,
                                                        batch_size=100,
                                                        num_epochs=None,
                                                        shuffle=True)

    # Horovod: adjust number of steps based on number of GPUs.
    mnist_classifier.train(input_fn=train_input_fn,
                           steps=1000 // hvd.size(),
                           hooks=[logging_hook, bcast_hook])

    # Evaluate the model and print results.
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data},
                                                       y=eval_labels,
                                                       num_epochs=1,
                                                       shuffle=False)
    eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)
    print('total time: ', time.time() - starttime, 'rank: ', hvd.rank())
    print(hvd.rank(), " shutdown")
    hvd.shutdown()
def test_multi_comm(self):
    gloo_size = int(os.getenv('HOROVOD_SIZE', -1))
    if gloo_size != -1:
        self.skipTest("This test is specific to MPI and does not apply with "
                      "Gloo controller.")

    try:
        from mpi4py import MPI
    except ImportError:
        self.skipTest("This test requires mpi4py.")

    # This will be our baseline world communicator.
    comm = MPI.COMM_WORLD
    size = comm.size
    if size < 2:
        self.skipTest("This test requires multiple workers.")

    # Split COMM_WORLD into subcommunicators.
    subcomm = MPI.COMM_WORLD.Split(color=MPI.COMM_WORLD.rank % 2,
                                   key=MPI.COMM_WORLD.rank)
    comm_clone = comm.Dup()
    subcomm_clone = subcomm.Dup()
    subcomm_effective_clone = hvd.ProcessSet(
        range(0, comm.size, 2))  # identified as a clone on even ranks

    # 3+ duplicates
    my_process_sets = [hvd.ProcessSet(subcomm),
                       hvd.ProcessSet(comm_clone),
                       hvd.ProcessSet(subcomm_clone),
                       subcomm_effective_clone,
                       hvd.ProcessSet([0]),
                       ]
    with self.assertRaises(ValueError):
        hvd.init(comm=comm, process_sets=my_process_sets)
    ## Internally Horovod has been initialized successfully, but we need to
    ## call hvd.init() with a valid list of process sets to proceed.

    # 2+ duplicates
    my_process_sets = [hvd.ProcessSet(subcomm),
                       hvd.ProcessSet(comm_clone),
                       subcomm_effective_clone,
                       hvd.ProcessSet([0]),
                       ]
    with self.assertRaises(ValueError):
        hvd.init(comm=comm, process_sets=my_process_sets)

    # 1+ duplicates
    my_process_sets = [hvd.ProcessSet(subcomm),
                       hvd.ProcessSet(comm_clone),
                       hvd.ProcessSet([0]),
                       ]
    with self.assertRaises(ValueError):
        hvd.init(comm=comm, process_sets=my_process_sets)

    # 1+ duplicates
    my_process_sets = [hvd.ProcessSet(subcomm),
                       subcomm_effective_clone,
                       hvd.ProcessSet([0]),
                       ]
    if hvd.size() == 2 or hvd.rank() % 2 == 0:
        with self.assertRaises(ValueError):
            hvd.init(comm=comm, process_sets=my_process_sets)
    else:
        hvd.init(comm=comm, process_sets=my_process_sets)

    # no duplicates
    if size > 2:
        my_process_sets = [hvd.ProcessSet(subcomm),
                           hvd.ProcessSet([0]),
                           ]
        hvd.init(comm=comm, process_sets=my_process_sets)
    else:
        my_process_sets = [hvd.ProcessSet(subcomm),
                           ]
        hvd.init(comm=comm, process_sets=my_process_sets)

    self.assertEqual(hvd.global_process_set.process_set_id, 0)
    self.assertListEqual(hvd.global_process_set.ranks, list(range(size)))
    self.assertEqual(hvd.global_process_set.mpi_comm, comm)

    # Here we test some implementation details (numeric process set id
    # values) using an internal function.
    ps = hvd.mpi_ops._basics._get_process_set_ids_and_ranks()
    if size > 2:
        self.assertDictEqual(ps, {0: list(range(size)),
                                  1: list(range(0, size, 2)),
                                  2: list(range(1, size, 2)),
                                  3: [0],
                                  })
    else:
        self.assertDictEqual(ps, {0: list(range(size)),
                                  1: list(range(0, size, 2)),
                                  2: list(range(1, size, 2)),
                                  })

    if hvd.rank() % 2 == 0:
        self.assertEqual(my_process_sets[0].process_set_id, 1)
    else:
        self.assertEqual(my_process_sets[0].process_set_id, 2)

    # If another process initiates shutdown while this process is still
    # processing _get_process_set_ids_and_ranks(), a race condition may be
    # triggered. Avoid it with a barrier.
    MPI.COMM_WORLD.barrier()
    hvd.shutdown()
def test_horovod_linear_regression(self):
    import horovod.tensorflow as hvd

    # Horovod: initialize Horovod.
    hvd.init()

    logdir = args.logdir + '/horovod_test/'
    if hvd.rank() == 0:
        if not os.path.exists(logdir):
            os.makedirs(logdir)

    assert args.training_size % hvd.size() == 0
    assert (args.training_size // hvd.size()) % args.batch_size == 0

    training_data_filename = logdir + 'training_data.npy'
    if hvd.rank() == 0:
        with open(training_data_filename, 'wb') as f:
            full_training_data = np.random.random(
                size=(args.training_size, ))
            full_training_data.tofile(f)
            print("Full training data:")
            print(full_training_data)
    hvd.allgather(tf.constant([0]))

    with open(training_data_filename, 'rb') as f:
        training_data = np.fromfile(f)

    training_data_size = training_data.shape[0]
    local_training_data_size = (training_data.shape[0] + hvd.size() -
                                1) // hvd.size()
    local_training_data_begin = hvd.rank() * local_training_data_size
    if hvd.rank() == hvd.size(
    ) - 1 and training_data.shape[0] % local_training_data_size != 0:
        local_training_data_size = training_data.shape[
            0] % local_training_data_size
    local_training_data = training_data[
        local_training_data_begin:local_training_data_begin +
        local_training_data_size]
    print("Local training data:")
    print(local_training_data)

    # Define TensorFlow graph.
    graph = tf.Graph()
    with graph.as_default():
        x_ph = tf.placeholder(tf.float32,
                              shape=[None, training_data_size],
                              name='x')
        y_ph = tf.placeholder(tf.float32, shape=[None], name='y')
        w = tf.Variable(np.zeros((training_data_size, )),
                        dtype=tf.float32,
                        name='w')
        loss_func = tf.constant(0.5) * tf.reduce_sum(
            tf.square(y_ph - tf.tensordot(x_ph, w, axes=1)))
        opt = tf.train.GradientDescentOptimizer(learning_rate=hvd.size() *
                                                1.0)

        # Horovod: wrap local optimizer in distributed Horovod optimizer.
        opt = hvd.DistributedOptimizer(opt)
        #train_step = opt.minimize(loss_func)
        #grads_and_vars = opt.compute_gradients(loss_func)
        train_step = opt.minimize(
            loss_func)  # apply_gradients(grads_and_vars)

        config = tf.ConfigProto()
        config.intra_op_parallelism_threads = 22
        config.inter_op_parallelism_threads = 8
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True  # Horovod

        # Add variable initializer.
        init = tf.global_variables_initializer()

        # Horovod: broadcast initial variable states from rank 0 to all other
        # processes. This is necessary to ensure consistent initialization of
        # all workers when training is started with random weights or
        # restored from a checkpoint.
        bcast = hvd.broadcast_global_variables(0)

    print("Local training data:")
    print(local_training_data)

    print('Opening tf.Session...')
    with tf.Session(graph=graph, config=config) as sess:
        # We must initialize all variables before we use them.
        init.run()
        bcast.run()
        print('Initialized all Horovod ranks.')
        print('Begin training - batch size = {}.'.format(args.batch_size),
              flush=True)

        if hvd.rank() == 0:
            training_writer = tf.summary.FileWriter(logdir,
                                                    graph=sess.graph)

        for i in range((local_training_data_size + args.batch_size - 1) //
                       args.batch_size):
            batch_begin = i * args.batch_size
            batch_size = args.batch_size \
                if i != (local_training_data_size + args.batch_size - 1) // args.batch_size else \
                local_training_data_size % args.batch_size
            x = np.zeros(shape=(batch_size, training_data_size))
            x[np.arange(x.shape[0]),
              local_training_data_begin + batch_begin +
              np.arange(x.shape[0])] = 1.0
            y = local_training_data[batch_begin:batch_begin + batch_size]
            feed_train = {x_ph: x, y_ph: y}

            # Only compute shuffle indices; do not compute the shuffled data
            # set, to avoid memory errors.
            time_before = time.time()
            #grad_op = grads_and_vars[0][0]
            loss, _ = sess.run([loss_func, train_step],
                               feed_dict=feed_train)
            time_after = time.time()
            print(
                'Step {0:5d} - loss: {1:6.2f} - latency: {2:6.2f} ms.'.
                format(i, loss, 1000 * (time_after - time_before)),
                flush=True)

        print('Finished training - local residual w - y_training is:')
        local_residual = sess.run(
            [
                w[local_training_data_begin:local_training_data_begin +
                  local_training_data_size] - y_ph
            ],
            feed_dict={y_ph: local_training_data})
        print(local_residual)
        print('Locally trained variable components')
        print(
            sess.run([
                w[local_training_data_begin:local_training_data_begin +
                  local_training_data_size]
            ]))
        print('Local training data')
        print(local_training_data)
        self.assertTrue(
            np.allclose(local_residual,
                        np.zeros(len(local_residual), ),
                        rtol=1e-7))

    if hvd.rank() == 0:
        os.remove(training_data_filename)
    hvd.allgather(tf.constant([0]))
    hvd.shutdown()
def run_gnn(args, model_ops, test_items, train_items=None, optimizer=None):
    # Split ops.
    inputs_p_ph, inputs_l_ph, targets_ph, inputs_p_op, inputs_l_op, \
        targets_op, output_ops, loss_op, step_op = model_ops

    # Create new TF session.
    banner_print("Create TF config / session.")
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False)
    config.gpu_options.allow_growth = True
    if args.hvd:
        import horovod.tensorflow as hvd
        config.gpu_options.visible_device_list = str(hvd.local_rank())
        # Some versions of TensorFlow (version < 1.15) have issues with
        # allocating all device memory. In this case, uncomment the
        # following line:
        # config.gpu_options.per_process_gpu_memory_fraction = 0.5
        checkpoint_dir = './checkpoints' if RANK == 0 else './checkpoints_test'
    else:
        config.gpu_options.visible_device_list = str(0)
        checkpoint_dir = './checkpoints'
    try:
        os.mkdir(checkpoint_dir)
    except OSError:
        print("Creation of directory %s failed!" % checkpoint_dir)
    else:
        print("Successfully created directory %s." % checkpoint_dir)
    sess = tf.Session(config=config)

    # Initialize model.
    if RANK == 0:
        print("All workers are initializing global variables...")
    # All ranks should initialize their variables before loading checkpoints
    # or getting broadcast variables from rank 0.
    sess.run(tf.global_variables_initializer())
    if RANK == 0:
        print("Done global variables init.")
    saver = tf.train.Saver()
    model_path = checkpoint_dir + '/model%s.ckpt' % RANK
    restore_path = model_path
    if RANK == 0:
        # Test save / restore model with rank 0.
        save_path = saver.save(sess, model_path)
        print("Coordinator test checkpoint saved to: %s" % save_path)
    else:
        save_path = saver.save(sess, model_path)
        print("Worker test checkpoint saved to: %s" % save_path)
    if args.restore is not None:
        restore_path = args.restore
        print("Restoring model from: %s" % restore_path)
        saver.restore(sess, restore_path)
        print("Model restored successfully.")
        print("To resume training use --restore %s"
              % str(os.getcwd() + "/" + restore_path))
    else:
        restore_path = model_path
        saver.restore(sess, restore_path)
        print("Worker fresh checkpoint restore test success.")
        print("To resume training use --restore %s"
              % str(os.getcwd() + "/" + restore_path))
        print("Training new model.")

    # Print total model parameters.
    if 0:
        total_parameters = 0
        for variable in tf.trainable_variables():
            variable_parameters = 1
            for dim in variable.get_shape():
                variable_parameters *= dim.value
            total_parameters += variable_parameters
        print("Total trainable params: ", total_parameters)

    if args.hvd:
        print("Broadcasting...")
        import horovod.tensorflow as hvd
        bcast_op = hvd.broadcast_global_variables(0)
        sess.run(bcast_op)
        time.sleep(10)
        print("Done broadcast")

    # Training / inference loop.
    banner_print("Start training / testing loop.")
    acc_best = 0.0
    epoch_best = 0
    log_epochs, solveds_tr, solveds_ge, losses_tr, losses_ge, lr_hist = \
        ([], [], [], [], [], [])
    for epoch in range(args.epochs):
        if RANK == 0:
            print("Epoch %d:" % (epoch))
        log_epochs.append(epoch)

        # Run training step.
        if not INFERENCE_ONLY:
            if RANK == 0:
                print(" Training.")
            elapsed, solved, loss, count = run_batches(
                sess, lambda: item_batch_iter(train_items, args.batch_size),
                inputs_p_ph, inputs_l_ph, targets_ph, inputs_p_op,
                inputs_l_op, targets_op, output_ops, step_op, loss_op)
            acc = solved / count
            loss = loss / count
            lr = sess.run(optimizer._learning_rate)
            if args.hvd:
                acc, solved, loss = average_distributed_metrics(
                    sess, acc, solved, loss)
                count = hvd.size() * count
                solved = hvd.size() * solved
            if RANK == 0:
                print(" Time: %.1fs" % (elapsed))
                print(" LrnR: %.6f" % lr)
                print(" Loss: %f" % (loss))
                print(" Acc.: %f (%.1f/%.1f)" % (acc, solved, count))
            solveds_tr.append(acc)
            losses_tr.append(loss)
            lr_hist.append(lr)

        # Run a test step.
        if RANK == 0:
            print(" Testing.")
        elapsed, solved, loss, count = run_batches(
            sess, lambda: item_batch_iter(test_items, args.batch_size_test,
                                          shuffle=False),
            inputs_p_ph, inputs_l_ph, targets_ph, inputs_p_op, inputs_l_op,
            targets_op, output_ops, None, loss_op)
        acc = solved / count
        loss = loss / count
        if args.hvd:
            acc, solved, loss = average_distributed_metrics(
                sess, acc, solved, loss)
            count = hvd.size() * count
            solved = hvd.size() * solved
        if RANK == 0:
            print(" Time: %.1fs" % (elapsed))
            print(" Loss: %f" % (loss))
            print(" Acc.: %f (%.1f/%.1f)" % (acc, solved, count))
        solveds_ge.append(acc)
        losses_ge.append(loss)
        if args.plot_history:
            plot_history(log_epochs, solveds_tr, solveds_ge,
                         'PharML-Accuracy', 'accuracy')
            plot_history(log_epochs, losses_tr, losses_ge,
                         'PharML-Loss', 'loss')
            plot_history(log_epochs, lr_hist, lr_hist,
                         'PharML-LR', 'learning rate')

        # Checkpoint if needed.
        if acc > acc_best and not INFERENCE_ONLY:
            acc_best = acc
            epoch_best = epoch
            if RANK == 0:
                print(" New Best Test Acc: ", acc_best)
                print(" -> Occurred at epoch ", epoch_best)
                sys.stdout.flush()
                save_path = saver.save(sess, model_path)
                print(" -> Saved checkpoint to %s" % (save_path))

        if INFERENCE_ONLY:
            # Exit the loop after the first pass in inference-only mode.
            print("Inference only mode, done with single pass so exiting...")
            hvd.shutdown()
            break

        # If test accuracy has not improved for more than 15 epochs, call it
        # converged and exit. 15 epochs is what was used in the paper, but we
        # have found that lowering this to 5 epochs is sufficient in some
        # cases.
        if (epoch - epoch_best) >= 15 and not INFERENCE_ONLY:
            print("Model Converged! Exiting Nicely...")
            #sys.exit(0)
            hvd.shutdown()
            break

    # Success!
    banner_print("Success!")