예제 #1
0
    def test_failed_run(self):
        """Check that a failing worker makes ``run`` raise without a long wait.

        Rank 1 raises ``RuntimeError`` right after initialization while the
        other worker sleeps for two minutes. For every controller that is
        built (Gloo and/or MPI), ``run`` must raise ``RuntimeError`` with the
        controller-specific message, and all launches together must finish in
        under 60 seconds, i.e. well before the sleeping worker would exit on
        its own.
        """
        def fn():
            hvd.init()
            rank = hvd.rank()
            if rank == 1:
                raise RuntimeError()
            # The other worker waits a while before exiting.
            time.sleep(120)

        assert gloo_built() or mpi_built()

        # Use the monotonic clock: the elapsed-time check below must not be
        # skewed by wall-clock adjustments that happen while the test runs.
        start = time.monotonic()

        if gloo_built():
            with pytest.raises(
                    RuntimeError,
                    match='Horovod detected that one or more processes exited'
            ):
                run(fn, np=2, use_gloo=True)

        if mpi_built():
            with pytest.raises(RuntimeError, match='mpirun failed'):
                run(fn, np=2, use_mpi=True)

        # The controller should be terminating workers way before the 2-minute delay.
        elapsed = time.monotonic() - start
        assert elapsed < 60, 'run() took {:.1f}s to fail'.format(elapsed)
예제 #2
0
    def test_happy_run(self):
        """Check the values ``run`` returns for a successful worker function.

        The worker sums its four arguments, allgathers ``[rank, sum]`` and
        returns the gathered list on rank 0, a marker string on rank 1 and
        ``None`` on any other rank. The test launches it with 1 and with 3
        processes on each controller that is built (Gloo first, then MPI) and
        compares the list returned by ``run`` against the expected values.
        """
        def fn(a, b, c, d):
            hvd.init()
            rank = hvd.rank()
            total = a + b + c + d
            gathered = hvd.allgather(torch.tensor([rank, total])).tolist()
            if rank == 0:
                return gathered
            return "ret_val_of_rank_1" if rank == 1 else None

        assert gloo_built() or mpi_built()

        for use_gloo, use_mpi in ((True, False), (False, True)):
            # Skip a controller that is not available in this build.
            if (use_mpi and not mpi_built()) or (use_gloo and not gloo_built()):
                continue

            # (number of processes, expected return value of run)
            cases = (
                (1, [[0, 4321]]),
                (3, [[0, 4321, 1, 4321, 2, 4321], "ret_val_of_rank_1", None]),
            )
            for num_proc, expected in cases:
                outcome = run(fn, (1, 20), {"c": 300, "d": 4000},
                              np=num_proc, use_gloo=use_gloo, use_mpi=use_mpi)
                self.assertListEqual(expected, outcome)
예제 #3
0
    def test_failed_run(self):
        """Check that ``run`` raises ``RuntimeError`` when one worker fails.

        Rank 1 raises ``RuntimeError`` right after initialization. For each
        controller that is built (Gloo first, then MPI), ``run`` with two
        processes must raise ``RuntimeError`` whose message matches the
        controller-specific pattern.
        """
        def fn():
            hvd.init()
            if hvd.rank() == 1:
                raise RuntimeError()

        assert gloo_built() or mpi_built()

        # (availability check, expected error pattern, controller selection)
        controllers = (
            (gloo_built,
             'Horovod detected that one or more processes exited',
             {'use_gloo': True}),
            (mpi_built, 'mpirun failed', {'use_mpi': True}),
        )
        for is_built, expected_message, selection in controllers:
            if not is_built():
                continue
            with pytest.raises(RuntimeError, match=expected_message):
                run(fn, np=2, **selection)