        def _run(self, rank):
            self.rank = rank
            try:
                dist.init_process_group(init_method=INIT_METHOD,
                                        backend=BACKEND,
                                        world_size=int(WORLD_SIZE),
                                        rank=self.rank)
            except RuntimeError as e:
                if "recompile" in e.args[0]:
                    sys.exit(SKIP_IF_BACKEND_UNAVAILABLE)
                    # sys.exit(0)
                raise
            # self.id() == e.g. '__main__.TestDistributed.test_get_rank'
            # We're retrieving the corresponding test method and executing it.
            getattr(self, self.id().split(".")[2])()
            sys.exit(0)
                or first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE
                or first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE
                or first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE)

            if first_process.exitcode == SKIP_IF_NO_CUDA_EXIT_CODE:
                raise unittest.SkipTest("cuda is not available")
            if first_process.exitcode == SKIP_IF_NO_GPU_EXIT_CODE:
                raise unittest.SkipTest(
                    "One unique gpu per process is not available")
            if first_process.exitcode == SKIP_IF_SMALL_WORLDSIZE_EXIT_CODE:
                raise unittest.SkipTest(
                    "worldsize is too small to run group tests")

            self.assertEqual(first_process.exitcode, 0)

elif BACKEND == "mpi":
    WORLD_SIZE = os.environ["WORLD_SIZE"]
    dist.init_process_group(init_method=INIT_METHOD, backend="mpi")

    class TestMPI(TestCase, _DistTestBase):
        pass

if __name__ == "__main__":
    assert (
        not torch.cuda._initialized
    ), "test_distributed must not have initialized CUDA context on main process"

    unittest.main()