def test_failed_run(self):
    """Check that a failing worker aborts the whole job promptly.

    The worker with rank 1 raises ``RuntimeError`` right after
    ``hvd.init()``; every other worker sleeps for 120 seconds. For each
    backend that is built (gloo and/or MPI), ``run`` is expected to raise
    ``RuntimeError`` with a backend-specific message, and the combined
    time spent across the exercised backends must stay below 60 seconds.
    """

    def fn():
        hvd.init()
        rank = hvd.rank()
        if rank == 1:
            raise RuntimeError()

        # The other worker waits a while before exiting.
        time.sleep(120)

    assert gloo_built() or mpi_built()

    # Use a monotonic clock: wall-clock time can jump (NTP, manual changes)
    # and would make the elapsed-time check below unreliable.
    start = time.monotonic()

    if gloo_built():
        with pytest.raises(
            RuntimeError,
            match='Horovod detected that one or more processes exited',
        ):
            run(fn, np=2, use_gloo=True)

    if mpi_built():
        with pytest.raises(RuntimeError, match='mpirun failed'):
            run(fn, np=2, use_mpi=True)

    # The controller should be terminating workers way before the 2-minute delay.
    elapsed = time.monotonic() - start
    assert elapsed < 60, f'failed run took {elapsed:.1f}s to terminate'
def test_happy_run(self):
    """Check the per-rank return values of ``run`` on each built backend.

    The worker sums its four arguments, allgathers ``[rank, sum]`` and
    returns the gathered list on rank 0, a marker string on rank 1 and
    ``None`` on any other rank. The job is launched with one and with
    three processes, and the list returned by ``run`` is compared against
    the expected per-rank results.
    """

    def fn(a, b, c, d):
        hvd.init()
        rank = hvd.rank()
        total = a + b + c + d
        gathered = hvd.allgather(torch.tensor([rank, total])).tolist()
        if rank == 0:
            return gathered
        if rank == 1:
            return "ret_val_of_rank_1"
        return None

    assert gloo_built() or mpi_built()

    # (number of processes, expected list of per-rank return values)
    scenarios = (
        (1, [[0, 4321]]),
        (3, [[0, 4321, 1, 4321, 2, 4321], "ret_val_of_rank_1", None]),
    )

    for use_gloo, use_mpi in [(True, False), (False, True)]:
        # Skip a backend that was requested but is not built.
        backend_ready = (not use_mpi or mpi_built()) and (not use_gloo or gloo_built())
        if not backend_ready:
            continue

        for num_proc, expected in scenarios:
            actual = run(
                fn,
                (1, 20),
                {"c": 300, "d": 4000},
                np=num_proc,
                use_gloo=use_gloo,
                use_mpi=use_mpi,
            )
            self.assertListEqual(expected, actual)
def test_failed_run(self):
    """Check that ``run`` raises when one worker fails.

    The worker with rank 1 raises ``RuntimeError`` after ``hvd.init()``;
    other ranks return normally. For every backend that is built, ``run``
    must raise ``RuntimeError`` whose message matches the backend-specific
    pattern.
    """

    def fn():
        hvd.init()
        if hvd.rank() == 1:
            raise RuntimeError()

    assert gloo_built() or mpi_built()

    # (availability probe, backend selector for run(), expected error pattern)
    backends = (
        (
            gloo_built,
            {'use_gloo': True},
            'Horovod detected that one or more processes exited',
        ),
        (mpi_built, {'use_mpi': True}, 'mpirun failed'),
    )

    for is_built, selector, pattern in backends:
        if not is_built():
            continue
        with pytest.raises(RuntimeError, match=pattern):
            run(fn, np=2, **selector)