def test_horovod_adasum_multiple_allreduce_cpu(self):
    """Test on CPU that Adasum correctly reduces 2D tensors."""
    hvd.init()
    # TODO support non-MPI Adasum operation
    if not hvd.mpi_enabled():
        self.skipTest("MPI not enabled")

    size = hvd.size()
    # TODO support testing with non-power-of-2 ranks
    if not is_power2(size):
        self.skipTest("MPI size is not a power of 2")

    rank = hvd.rank()
    rank_tensors = []
    for _ in range(size):
        rank_tensors.append([np.random.random_sample((2, 2)),
                             np.random.random_sample((2, 2))])

    answer = reference_tree_reduction(rank_tensors, size)

    for dtype in [tf.float16, tf.float32, tf.float64]:
        with tf.device("/cpu:0"):
            tensors = map(tf.constant, rank_tensors[rank])
            # cast to the corresponding dtype
            tensors = map(lambda tensor: tf.cast(tensor, dtype), tensors)
            # and away we go: do reduction
            reduced_tensors = [
                self.evaluate(hvd.allreduce(tensor, op=hvd.Adasum))
                for tensor in tensors
            ]
            # cast expected result to the type of the tensorflow values
            np_type = dtype.as_numpy_dtype
            tmp = [t.astype(np_type) for t in answer]
            self.assertAllCloseAccordingToType(tmp, reduced_tensors)
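# ---------------------------------------------------------------------------
# Both tests in this section rely on two helpers defined elsewhere in this
# test module: is_power2 and reference_tree_reduction. For readers following
# along, below is a minimal sketch of the tree reduction, assuming the
# standard Adasum pairwise rule; the name adasum_reference_operation and the
# exact pairing order are illustrative, not necessarily this file's actual
# implementation. np is assumed to be numpy, imported at the top of the
# module.
import math

def adasum_reference_operation(a, b):
    # Adasum scales each operand down by half of its projection onto the
    # other, so correlated updates are not double-counted:
    #   result = (1 - a.b / (2*||a||^2)) * a + (1 - a.b / (2*||b||^2)) * b
    a_flat, b_flat = a.ravel(), b.ravel()
    anormsq = np.inner(a_flat, a_flat)
    bnormsq = np.inner(b_flat, b_flat)
    dot = np.inner(a_flat, b_flat)
    acoeff = 1.0 - 0.5 * dot / anormsq if anormsq != 0 else 1.0
    bcoeff = 1.0 - 0.5 * dot / bnormsq if bnormsq != 0 else 1.0
    return acoeff * a + bcoeff * b

def reference_tree_reduction(tensors, hvd_size):
    # Pairwise tree reduction over a power-of-two number of ranks: at each
    # level, fold the tensor list of participant 2i+1 into participant 2i
    # until a single list of tensors remains.
    temp = list(tensors)
    for level in range(int(math.log2(hvd_size))):
        for i in range(hvd_size // 2 ** (level + 1)):
            temp[i] = [adasum_reference_operation(a, b)
                       for a, b in zip(temp[2 * i], temp[2 * i + 1])]
    return temp[0]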
def test_horovod_adasum_multiple_allreduce_gpu_nccl(self):
    """Test on GPU using NCCL that Adasum correctly reduces 2D tensors."""
    hvd.init()
    # TODO support non-MPI Adasum operation
    if not hvd.mpi_enabled() or not hvd.gpu_available('tensorflow') \
            or not hvd.nccl_built():
        self.skipTest("MPI, GPU or NCCL not available")

    rank = hvd.rank()
    rank_tensors = []
    size = hvd.size()
    # TODO support testing with non-power-of-2 ranks
    if not is_power2(size):
        self.skipTest("MPI size is not a power of 2")

    local_size = hvd.local_size()
    # Only run on a homogeneous cluster
    if not hvd.is_homogeneous():
        self.skipTest("Horovod cluster is not homogeneous")

    num_nodes = int(size / local_size)
    for _ in range(size):
        rank_tensors.append([np.random.random_sample((2, 2)),
                             np.random.random_sample((2, 2))])

    # With NCCL, Adasum first sums within each node, then runs the Adasum
    # tree reduction across nodes and averages by local_size. Build the
    # expected per-node sums accordingly.
    sum_local_ranks_tensor = []
    for i in range(num_nodes):
        sum_local_ranks_tensor.append([np.zeros((2, 2)), np.zeros((2, 2))])
        for j in range(local_size):
            # sum the tensors of the ranks hosted on node i
            sum_local_ranks_tensor[i] = np.add(sum_local_ranks_tensor[i],
                                               rank_tensors[i * local_size + j])

    answer = reference_tree_reduction(sum_local_ranks_tensor, num_nodes)
    answer = np.true_divide(answer, local_size)

    for dtype in [tf.float16, tf.float32, tf.float64]:
        with tf.device("/gpu:{}".format(hvd.local_rank())):
            tensors = map(tf.constant, rank_tensors[rank])
            # cast to the corresponding dtype
            tensors = map(lambda tensor: tf.cast(tensor, dtype), tensors)
            # and away we go: do reduction
            reduced_tensors = [
                self.evaluate(hvd.allreduce(tensor, op=hvd.Adasum))
                for tensor in tensors
            ]
            # cast expected result to the type of the tensorflow values
            np_type = dtype.as_numpy_dtype
            tmp = [t.astype(np_type) for t in answer]
            self.assertAllCloseAccordingToType(tmp, reduced_tensors)
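# A sketch of the remaining helper used by the skip guards above. A power of
# two has exactly one bit set, so num & (num - 1) clears it to zero; the real
# definition lives in the shared test module and may differ in detail.
def is_power2(num):
    return num != 0 and (num & (num - 1)) == 0

# Both tests are meant to be launched under MPI with a power-of-two world
# size, e.g. (command is illustrative):
#   horovodrun -np 4 pytest test_tensorflow.py -k adasum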