    def test_horovod_adasum_multiple_allreduce_cpu(self):
        """Test on CPU that the Adasum correctly computes 2D tensors."""
        hvd.init()
        # TODO support non-MPI Adasum operation
        if not hvd.mpi_enabled():
            self.skipTest("MPI not enabled")

        size = hvd.size()
        # TODO support testing with a non-power-of-2 number of ranks
        if not is_power2(size):
            self.skipTest("MPI size is not a power of 2")

        rank = hvd.rank()
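        # Build one pair of random 2x2 tensors for every rank; this process
        # feeds rank_tensors[rank] into the allreduce, while the full list is
        # used to compute the expected Adasum result locally.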
        rank_tensors = []
        for _ in range(size):
            rank_tensors.append([
                np.random.random_sample((2, 2)),
                np.random.random_sample((2, 2))
            ])
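        # reference_tree_reduction is assumed to replay Adasum's recursive
        # pairwise reduction over all `size` inputs on the host, producing the
        # values the distributed allreduce should return.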
        answer = reference_tree_reduction(rank_tensors, size)

        for dtype in [tf.float16, tf.float32, tf.float64]:
            with tf.device("/cpu:0"):
                tensors = map(tf.constant, rank_tensors[rank])
                # cast to the corresponding dtype
                tensors = map(lambda tensor: tf.cast(tensor, dtype), tensors)
                # and away we go: do reduction
                reduced_tensors = [
                    self.evaluate(hvd.allreduce(tensor, op=hvd.Adasum))
                    for tensor in tensors
                ]
                # cast expected result to the type of the tensorflow values
                np_type = dtype.as_numpy_dtype
                tmp = [t.astype(np_type) for t in answer]
                self.assertAllCloseAccordingToType(tmp, reduced_tensors)
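
    # A minimal illustrative sketch (not exercised by the tests), assuming the
    # reference reduction combines each pair of tensors with the rule from the
    # Adasum paper; _adasum_pair_sketch is a hypothetical helper included only
    # to document the pairwise operation the tests verify.
    @staticmethod
    def _adasum_pair_sketch(a, b):
        """Assumed pairwise rule:
        Adasum(a, b) = (1 - a.b / (2 a.a)) * a + (1 - a.b / (2 b.b)) * b
        """
        dot = np.vdot(a, b)
        return ((1.0 - dot / (2.0 * np.vdot(a, a))) * a
                + (1.0 - dot / (2.0 * np.vdot(b, b))) * b)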

    def test_horovod_adasum_multiple_allreduce_gpu_nccl(self):
        """Test on GPU using NCCL that the Adasum correctly computes 2D tensors."""
        hvd.init()
        # TODO support non-MPI Adasum operation
        if not hvd.mpi_enabled() or not hvd.gpu_available(
                'tensorflow') or not hvd.nccl_built():
            self.skipTest("MPI, GPU or NCCL not available")

        rank = hvd.rank()
        rank_tensors = []
        size = hvd.size()
        # TODO support testing with a non-power-of-2 number of ranks
        if not is_power2(size):
            self.skipTest("MPI size is not a power of 2")

        local_size = hvd.local_size()

        # Only run on homogeneous cluster
        if not hvd.is_homogeneous():
            self.skipTest("Horovod cluster is not homogeneous")

        num_nodes = int(size / local_size)
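        # As in the CPU test above, every rank builds one pair of random 2x2
        # tensors per rank.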
        for _ in range(size):
            rank_tensors.append([
                np.random.random_sample((2, 2)),
                np.random.random_sample((2, 2))
            ])
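        # With NCCL, Adasum runs hierarchically: tensors are first summed
        # across the ranks of each node, the per-node sums are combined across
        # nodes with the Adasum tree reduction, and the result is divided by
        # local_size. The expected answer below mirrors those steps.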
        sum_local_ranks_tensor = []
        for i in range(num_nodes):
            sum_local_ranks_tensor.append([np.zeros((2, 2)), np.zeros((2, 2))])
            for j in range(local_size):
                sum_local_ranks_tensor[i] = np.add(sum_local_ranks_tensor[i],
                                                   rank_tensors[i * local_size + j])

        answer = reference_tree_reduction(sum_local_ranks_tensor, num_nodes)
        answer = np.true_divide(answer, local_size)
        for dtype in [tf.float16, tf.float32, tf.float64]:
            with tf.device("/gpu:{}".format(hvd.local_rank())):
                tensors = map(tf.constant, rank_tensors[rank])
                # cast to the corresponding dtype
                tensors = map(lambda tensor: tf.cast(tensor, dtype), tensors)
                # and away we go: do reduction
                reduced_tensors = [
                    self.evaluate(hvd.allreduce(tensor, op=hvd.Adasum))
                    for tensor in tensors
                ]
                # cast expected result to the type of the tensorflow values
                np_type = dtype.as_numpy_dtype
                tmp = [t.astype(np_type) for t in answer]
                self.assertAllCloseAccordingToType(tmp, reduced_tensors)