def test_failed_run(self):
    """Check that ``run`` raises ``RuntimeError`` when one worker fails.

    The worker function raises on rank 1, so a two-process job is expected
    to fail. Each launcher is exercised only when its availability check
    (``gloo_built()`` / ``mpi_built()``) returns true, and at least one of
    the two must be available.
    """
    def fn():
        hvd.init()
        rank = hvd.rank()
        if rank == 1:
            # Only rank 1 fails; the launcher still has to surface the error.
            raise RuntimeError()

    # With neither launcher available both branches below would be skipped
    # and the test would pass vacuously, so fail loudly instead.
    assert gloo_built() or mpi_built()

    if gloo_built():
        # NOTE(review): the expected text is coupled to the launcher's error
        # wording -- confirm it matches the installed version.
        with pytest.raises(
            RuntimeError,
            match='Gloo job detected that one or more processes exited',
        ):
            run(fn, np=2, use_gloo=True)

    if mpi_built():
        with pytest.raises(RuntimeError, match='mpirun failed'):
            run(fn, np=2, use_mpi=True)
def test_failed_run(self):
    """A job in which rank 1 raises must make ``run`` raise ``RuntimeError``.

    The Gloo and MPI launchers are each checked only when ``gloo_built()``
    or ``mpi_built()`` respectively returns true; the test requires that at
    least one of them does.
    """
    def crash_on_rank_one():
        # Initialise first so the rank can be queried, then fail on rank 1 only.
        hvd.init()
        if hvd.rank() == 1:
            raise RuntimeError()

    assert gloo_built() or mpi_built()

    if gloo_built():
        expect_gloo_failure = pytest.raises(
            RuntimeError,
            match='Horovod detected that one or more processes exited',
        )
        with expect_gloo_failure:
            run(crash_on_rank_one, np=2, use_gloo=True)

    if mpi_built():
        expect_mpi_failure = pytest.raises(RuntimeError, match='mpirun failed')
        with expect_mpi_failure:
            run(crash_on_rank_one, np=2, use_mpi=True)
def test_happy_run(self):
    """Check that ``run`` returns each rank's result, ordered by rank.

    The worker sums its four arguments (1 + 20 + 300 + 4000 = 4321),
    all-gathers ``[rank, sum]`` across ranks, and returns a rank-dependent
    value: the gathered list on rank 0, a marker string on rank 1, and
    ``None`` on any other rank. The job is run with 1 and with 3 processes
    for every launcher whose availability check (``gloo_built()`` /
    ``mpi_built()``) returns true; at least one launcher must be available.
    """
    def fn(a, b, c, d):
        hvd.init()
        rank = hvd.rank()
        v = a + b + c + d
        res = hvd.allgather(torch.tensor([rank, v])).tolist()
        if rank == 0:
            return res
        if rank == 1:
            return "ret_val_of_rank_1"
        return None

    # With neither launcher available every iteration below would be
    # skipped and the test would pass vacuously, so fail loudly instead.
    assert gloo_built() or mpi_built()

    # Positional and keyword arguments are split on purpose so that both
    # forwarding paths of ``run`` are exercised.
    args = (1, 20)
    kwargs = {"c": 300, "d": 4000}

    for use_gloo, use_mpi in [(True, False), (False, True)]:
        # Skip launchers that are not available instead of failing on them.
        if use_gloo and not gloo_built():
            continue
        if use_mpi and not mpi_built():
            continue

        res1 = run(fn, args, kwargs, np=1,
                   use_gloo=use_gloo, use_mpi=use_mpi)
        self.assertListEqual([[0, 4321]], res1)

        res2 = run(fn, args, kwargs, np=3,
                   use_gloo=use_gloo, use_mpi=use_mpi)
        self.assertListEqual(
            [[0, 4321, 1, 4321, 2, 4321], "ret_val_of_rank_1", None],
            res2,
        )