def test_horovod_adasum_multiple_allreduce_gpu_nccl(self):
    """Test on GPU using NCCL that Adasum correctly computes 2D tensors."""
    hvd.init()
    # TODO support non-MPI Adasum operation
    if not hvd.mpi_enabled() or not hvd.gpu_available(
            'tensorflow') or not hvd.nccl_built():
        self.skipTest("MPI, GPU or NCCL not available")

    rank = hvd.rank()
    rank_tensors = []
    size = hvd.size()
    # TODO support testing with non-power-of-2 rank counts
    if not is_power2(size):
        self.skipTest("Number of ranks is not a power of 2")

    local_size = hvd.local_size()

    # Only run on homogeneous clusters.
    if not hvd.is_homogeneous():
        self.skipTest("Horovod cluster is not homogeneous")

    num_nodes = int(size / local_size)
    for _ in range(size):
        rank_tensors.append([
            np.random.random_sample((2, 2)),
            np.random.random_sample((2, 2))
        ])

    # With NCCL, Adasum first sums within each node, then tree-reduces
    # across nodes. Build the per-node sums for the reference answer;
    # node i owns ranks i * local_size .. (i + 1) * local_size - 1.
    sum_local_ranks_tensor = []
    for i in range(num_nodes):
        sum_local_ranks_tensor.append([np.zeros((2, 2)), np.zeros((2, 2))])
        for j in range(local_size):
            sum_local_ranks_tensor[i] = np.add(
                sum_local_ranks_tensor[i],
                rank_tensors[i * local_size + j])

    answer = reference_tree_reduction(sum_local_ranks_tensor, num_nodes)
    answer = np.true_divide(answer, local_size)
    for dtype in [tf.float16, tf.float32, tf.float64]:
        with tf.device("/gpu:{}".format(hvd.local_rank())):
            tensors = map(tf.constant, rank_tensors[rank])
            # Cast to the corresponding dtype.
            tensors = map(lambda tensor: tf.cast(tensor, dtype), tensors)
            # And away we go: do the reduction.
            reduced_tensors = [
                self.evaluate(hvd.allreduce(tensor, op=hvd.Adasum))
                for tensor in tensors
            ]
            # Cast the expected result to the dtype of the TensorFlow values.
            np_type = dtype.as_numpy_dtype
            tmp = [t.astype(np_type) for t in answer]
            self.assertAllCloseAccordingToType(tmp, reduced_tensors)
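# The test above relies on two helpers that are not shown here. Below is a
# minimal sketch of what they could look like: `is_power2` and
# `reference_tree_reduction` match the names used above, but the pairwise
# Adasum reference (a' = (1 - a.b / (2|a|^2)) a + (1 - a.b / (2|b|^2)) b)
# is an assumption based on the published Adasum formula, not necessarily
# the exact code the test suite ships.
import numpy as np


def is_power2(num):
    # True for 1, 2, 4, 8, ... via the classic bit trick.
    return num != 0 and ((num & (num - 1)) == 0)


def adasum_reference_operation(a, b):
    # Pairwise Adasum combination of two equally-shaped numpy arrays.
    assert a.size == b.size and a.dtype == b.dtype
    anormsq = np.inner(a.ravel(), a.ravel())
    bnormsq = np.inner(b.ravel(), b.ravel())
    dot = np.dot(a.ravel(), b.ravel())
    acoeff = 1.0 - dot / (2.0 * anormsq) if anormsq != 0 else 1.0
    bcoeff = 1.0 - dot / (2.0 * bnormsq) if bnormsq != 0 else 1.0
    return acoeff * a + bcoeff * b


def reference_tree_reduction(tensors, hvd_size):
    # Tree-reduce a list of [tensor, tensor] pairs down to a single pair,
    # combining neighbors with the Adasum operation at every level.
    # Assumes hvd_size is a power of 2, as enforced by the test.
    if hvd_size == 1:
        return tensors[0]
    temp = list(tensors)
    levels = hvd_size.bit_length() - 1
    for level in range(levels):
        for i in range(hvd_size // 2 ** (level + 1)):
            temp[i] = [
                adasum_reference_operation(a, b)
                for a, b in zip(temp[2 * i], temp[2 * i + 1])
            ]
    return temp[0]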
def log_final_result(value, error):
    # Only the root rank reports the final result.
    if hvd.rank() > 0:
        return
    import horovod
    attrs = {
        'framework': 'horovod',
        'version': horovod.__version__,
        'np': hvd.size(),
        'bs': args.batch_size,
        'model': args.model,
    }
    try:
        attrs['nccl_built'] = hvd.nccl_built()
    except Exception:
        # Older Horovod builds may not expose nccl_built().
        pass
    log_detailed_result(value, error, attrs)
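# `log_detailed_result` is defined elsewhere in the benchmark harness. A
# hypothetical stand-in, purely for illustration; the real helper's output
# format and destination are assumptions:
import json


def log_detailed_result(value, error, attrs):
    # Emit one JSON line per result so downstream tooling can parse it.
    record = dict(attrs)
    record['value'] = value
    record['error'] = error  # e.g. the spread across benchmark trials
    print(json.dumps(record))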
def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race condition among
    # the workers that share the same filesystem. If the directory already
    # exists by the time this worker gets around to creating it, ignore the
    # resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % hvd.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape
    # it into (-1, 784) to feed into our network. Also, we need to normalize
    # the features to the range [0, 1].
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    lr_scaler = hvd.size()
    # By default, Adasum doesn't need scaling when increasing batch size.
    # If used with NCCL, scale the learning rate by local_size.
    if args.use_adasum:
        lr_scaler = hvd.local_size() if hvd.nccl_built() else 1

    # Horovod: adjust learning rate based on lr_scaler.
    opt = tf.train.AdamOptimizer(args.lr * lr_scaler)

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(
        opt, op=hvd.Adasum if args.use_adasum else hvd.Average)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=args.num_steps // hvd.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step,
                                            'loss': loss},
                                   every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
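# `train_input_generator` is assumed to be defined alongside main(). A
# minimal sketch of a shuffling batch generator with the signature used
# above (the real example may differ in details):
def train_input_generator(x_train, y_train, batch_size=64):
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle once per pass, then yield contiguous mini-batches.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield x_train[index:index + batch_size], \
                  y_train[index:index + batch_size]
            index += batch_size

# Typical launch, one process per GPU, using Horovod's horovodrun CLI
# (script name assumed):
#   horovodrun -np 4 python tensorflow_mnist.py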
else:
    os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    config.gpu_options.allow_growth = False
    config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
model = getattr(applications, args.model)(weights=None)

lr_scaler = hvd.size()
# By default, Adasum doesn't need scaling when increasing batch size.
# If used with NCCL, scale the learning rate by local_size.
if args.use_adasum:
    lr_scaler = hvd.local_size() if args.cuda and hvd.nccl_built() else 1

opt = tf.train.GradientDescentOptimizer(0.01 * lr_scaler)

# Horovod: (optional) compression algorithm.
compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

# Horovod: wrap optimizer with DistributedOptimizer.
opt = hvd.DistributedOptimizer(
    opt,
    compression=compression,
    op=hvd.Adasum if args.use_adasum else hvd.Average)

init = tf.global_variables_initializer()
bcast_op = hvd.broadcast_global_variables(0)
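# A sketch of how `init` and `bcast_op` could be consumed in the graph-mode
# path, assuming a `train_op` has been built from `opt` (e.g.
# `train_op = opt.minimize(loss)`) and `args.num_iters` exists: initialize
# variables first, then broadcast rank 0's values so every worker starts
# from identical weights before any gradients are exchanged.
with tf.Session(config=config) as session:
    session.run(init)
    session.run(bcast_op)
    for _ in range(args.num_iters):
        session.run(train_op)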