示例#1
0
    def test_failed_run(self):

        def fn():
            hvd.init()
            rank = hvd.rank()
            if rank == 1:
                raise RuntimeError()

        with pytest.raises(RuntimeError, match='Gloo job detected that one or more processes exited'):
            run(fn, np=2, use_gloo=True)

        with pytest.raises(RuntimeError, match='mpirun failed'):
            run(fn, np=2, use_mpi=True)
示例#2
0
    def test_failed_run(self):
        def fn():
            hvd.init()
            rank = hvd.rank()
            if rank == 1:
                raise RuntimeError()

        assert gloo_built() or mpi_built()

        if gloo_built():
            with pytest.raises(
                    RuntimeError,
                    match='Horovod detected that one or more processes exited'
            ):
                run(fn, np=2, use_gloo=True)

        if mpi_built():
            with pytest.raises(RuntimeError, match='mpirun failed'):
                run(fn, np=2, use_mpi=True)
示例#3
0
    def test_happy_run(self):

        def fn(a, b, c, d):
            hvd.init()
            rank = hvd.rank()
            v = a + b + c + d
            res = hvd.allgather(torch.tensor([rank, v])).tolist()
            if rank == 0:
                return res
            elif rank == 1:
                return "ret_val_of_rank_1"
            else:
                return None

        for use_gloo, use_mpi in [(True, False), (False, True)]:
            res1 = run(fn, (1, 20), {"c": 300, "d": 4000}, np=1, use_gloo=use_gloo, use_mpi=use_mpi)
            self.assertListEqual([[0, 4321]], res1)
            res2 = run(fn, (1, 20), {"c": 300, "d": 4000}, np=3, use_gloo=use_gloo, use_mpi=use_mpi)
            self.assertListEqual([[0, 4321, 1, 4321, 2, 4321],
                                  "ret_val_of_rank_1",
                                  None], res2)