def test_xla_distrib_single_node_no_spawn():
    """Initialize the xla-tpu backend in-process (no child spawn) and check
    that idist reports a single-worker XLA configuration."""
    idist.initialize("xla-tpu")
    _test_distrib_config(local_rank=0, backend="xla-tpu", ws=1, true_device="xla")
    idist.finalize()
def _test_idist_methods_in_hvd_context(backend, device):
    """Force the idist model back to ``_SerialModel``, then verify that the
    idist.* helpers report the same values Horovod itself provides."""
    import horovod.torch as hvd

    from ignite.distributed.utils import _SerialModel, _set_model

    hvd.init()
    _set_model(_SerialModel())

    lrank = hvd.local_rank()
    if torch.cuda.is_available():
        # Pin this process to its own GPU before any CUDA work happens.
        torch.cuda.set_device(lrank)

    _test_distrib_config(lrank, backend=backend, ws=hvd.size(), true_device=device, rank=hvd.rank())
    hvd.shutdown()
def test_idist_methods_in_xla_context():
    """Force the idist model back to ``_SerialModel``, then verify that the
    idist.* helpers report a single-worker XLA configuration."""
    from ignite.distributed.utils import _SerialModel, _set_model

    _set_model(_SerialModel())

    _test_distrib_config(local_rank=0, backend="xla-tpu", ws=1, true_device="xla", rank=0)
def test_native_distrib_single_node_launch_tool_nccl(local_rank, world_size):
    """Single-node NCCL init as started by a launch tool: on one node the
    global rank equals the local rank, which we export via RANK."""
    import os

    os.environ["RANK"] = str(local_rank)
    idist.initialize("nccl")
    _test_distrib_config(local_rank, "nccl", world_size, "cuda", local_rank)
    idist.finalize()
def _test_idist_methods_in_native_context(backend, device, local_rank):
    """Force the idist model back to ``_SerialModel``, then verify that the
    idist.* helpers report the values torch.distributed itself provides."""
    from ignite.distributed.utils import _SerialModel, _set_model

    _set_model(_SerialModel())

    _test_distrib_config(
        local_rank, backend=backend, ws=dist.get_world_size(), true_device=device, rank=dist.get_rank()
    )
def test_native_distrib_single_node_launch_tool_gloo(local_rank, world_size):
    """Single-node Gloo init as started by a launch tool, with an explicit
    init timeout; on one node the global rank equals the local rank."""
    import os
    from datetime import timedelta

    os.environ["RANK"] = str(local_rank)
    idist.initialize("gloo", timeout=timedelta(seconds=20))
    _test_distrib_config(local_rank, "gloo", world_size, "cpu", local_rank)
    idist.finalize()
def test_hvd_distrib_single_node_single_device():
    """Initialize Horovod through idist and check the reported configuration
    matches what Horovod itself returns."""
    import horovod.torch as hvd

    idist.initialize("horovod")

    # Use CUDA whenever at least one device is visible, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.device_count() >= 1 else "cpu"

    _test_distrib_config(hvd.local_rank(), "horovod", hvd.size(), device, hvd.rank())
    idist.finalize()
def _test_idist_methods_in_xla_context_in_child_proc(index):
    """Child-process body: force the idist model back to ``_SerialModel`` and
    verify the idist.* helpers against the values torch_xla reports."""
    from ignite.distributed.utils import _SerialModel, _set_model

    _set_model(_SerialModel())

    import torch_xla.core.xla_model as xm

    _test_distrib_config(
        local_rank=index,
        backend="xla-tpu",
        ws=xm.xrt_world_size(),
        true_device="xla",
        rank=xm.get_ordinal(),
    )
def _test_idist_methods_in_native_context_set_local_rank(backend, device, local_rank):
    """Force the idist model back to ``_SerialModel``, remove LOCAL_RANK from the
    environment so that ``idist.set_local_rank`` is the only source of the local
    rank, and verify the idist.* helpers against torch.distributed.

    Fix: the original restored ``os.environ["LOCAL_RANK"]`` only after the
    assertions, so a failing ``_test_distrib_config`` left the variable deleted
    and polluted subsequent tests. The restore now runs in a ``finally`` block.
    """
    from ignite.distributed.utils import _SerialModel, _set_model

    _set_model(_SerialModel())

    # Stash and remove LOCAL_RANK so set_local_rank() is actually exercised.
    saved_lrank = int(os.environ["LOCAL_RANK"])
    del os.environ["LOCAL_RANK"]
    try:
        ws = dist.get_world_size()
        rank = dist.get_rank()

        idist.set_local_rank(local_rank)

        _test_distrib_config(local_rank=local_rank, backend=backend, ws=ws, true_device=device, rank=rank)
    finally:
        # Always restore the environment, even if an assertion above failed.
        os.environ["LOCAL_RANK"] = str(saved_lrank)
def _test_native_distrib_single_node_launch_tool(backend, device, local_rank, world_size, init_method=None, **kwargs):
    """Generic single-node launch-tool check: export RANK (equal to the local
    rank on one node), initialize the given backend, and verify the reported
    configuration, including the init method used."""
    import os

    os.environ["RANK"] = str(local_rank)
    idist.initialize(backend, init_method=init_method, **kwargs)
    _test_distrib_config(local_rank, backend, world_size, device, local_rank, true_init_method=init_method)
    idist.finalize()