def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_local_rank() > 0:
        # Ensure that only local rank 0 downloads the dataset
        # Thus each node will download a copy of the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"])

    if idist.get_local_rank() == 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )
    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    return train_loader, test_loader
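# Usage sketch: `get_dataflow` is meant to run inside a distributed context.
# A minimal sketch of driving it through `idist.Parallel`; the `training` body,
# backend, and config values below are illustrative assumptions.
import ignite.distributed as idist

def training(local_rank, config):
    # Each launched/spawned process builds its own loaders
    train_loader, test_loader = get_dataflow(config)
    ...

config = {"data_path": "/tmp/data", "batch_size": 64, "num_workers": 4}
with idist.Parallel(backend="nccl") as parallel:
    parallel.run(training, config)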
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None): assert idist.backend() == backend, f"{idist.backend()} vs {backend}" this_device = idist.device() assert isinstance(this_device, torch.device) if backend in ("nccl", "horovod") and "cuda" in this_device.type: true_device = torch.device(f"{true_device}:{local_rank}") assert this_device == true_device, f"{this_device} vs {true_device}" elif backend in ("gloo", "horovod"): assert this_device == torch.device(true_device) elif backend == "xla-tpu": assert true_device in this_device.type if rank is None: if idist.model_name() == "native-dist": rank = dist.get_rank() if rank is not None: assert idist.get_rank() == rank assert idist.get_world_size() == ws assert idist.get_local_rank() == local_rank assert idist.model_name() in ("native-dist", "xla-dist", "horovod-dist") _sanity_check()
def test_no_distrib(capsys):
    assert idist.backend() is None
    if torch.cuda.is_available():
        assert idist.device().type == "cuda"
    else:
        assert idist.device().type == "cpu"
    assert idist.get_rank() == 0
    assert idist.get_world_size() == 1
    assert idist.get_local_rank() == 0
    assert idist.model_name() == "serial"

    from ignite.distributed.utils import _model, _SerialModel

    _sanity_check()
    assert isinstance(_model, _SerialModel)

    idist.show_config()
    captured = capsys.readouterr()
    out = captured.err.split("\r")
    out = list(map(lambda x: x.strip(), out))
    out = list(filter(None, out))
    assert "ignite.distributed.utils INFO: distributed configuration: serial" in out[-1]
    assert "ignite.distributed.utils INFO: backend: None" in out[-1]
    if torch.cuda.is_available():
        assert "ignite.distributed.utils INFO: device: cuda" in out[-1]
    else:
        assert "ignite.distributed.utils INFO: device: cpu" in out[-1]
    assert "ignite.distributed.utils INFO: rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: local rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: world size: 1" in out[-1]
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None): assert idist.backend() == backend, "{} vs {}".format( idist.backend(), backend) this_device = idist.device() assert isinstance(this_device, torch.device) if backend == "nccl": true_device = torch.device("{}:{}".format(true_device, local_rank)) assert this_device == true_device, "{} vs {}".format( this_device, true_device) elif backend == "gloo": assert this_device == torch.device(true_device) elif backend == "xla-tpu": assert true_device in this_device.type if rank is None: if idist.model_name() == "native-dist": rank = dist.get_rank() assert idist.get_rank() == rank assert idist.get_world_size() == ws assert idist.get_local_rank() == local_rank assert idist.model_name() in ("native-dist", "xla-dist") _sanity_check()
def _test_func(index, ws, device, backend, true_init_method):
    assert 0 <= index < ws
    assert index == idist.get_local_rank()
    assert ws == idist.get_world_size()
    assert torch.device(device).type == idist.device().type
    assert backend == idist.backend()

    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        assert _model._init_method == true_init_method
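# Launch sketch: how such a per-process function is typically driven through
# `idist.spawn`. The world size, device, and init method are illustrative
# assumptions; "env://" is assumed here as the init method for spawned native
# torch.distributed processes.
import ignite.distributed as idist

nproc = 4
# idist.spawn passes the process index as the first argument;
# the remaining arguments come from `args`.
idist.spawn(
    "gloo",
    _test_func,
    args=(nproc, "cpu", "gloo", "env://"),
    nproc_per_node=nproc,
)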
def log_basic_info(logger: Logger, config: Any) -> None:
    """Log basic info about PyTorch, Ignite, the configuration, and the GPU/distributed setup.

    Parameters
    ----------
    logger
        Logger instance for logging
    config
        config object to log
    """
    import ignite

    logger.info("PyTorch version: %s", torch.__version__)
    logger.info("Ignite version: %s", ignite.__version__)
    if torch.cuda.is_available():
        # explicitly import cudnn as
        # torch.backends.cudnn can not be pickled with hvd spawning procs
        from torch.backends import cudnn

        logger.info("GPU device: %s", torch.cuda.get_device_name(idist.get_local_rank()))
        logger.info("CUDA version: %s", torch.version.cuda)
        logger.info("CUDNN version: %s", cudnn.version())

    logger.info("Configuration: %s", pformat(vars(config)))

    if idist.get_world_size() > 1:
        logger.info("distributed configuration: %s", idist.model_name())
        logger.info("backend: %s", idist.backend())
        logger.info("device: %s", idist.device().type)
        logger.info("hostname: %s", idist.hostname())
        logger.info("world size: %s", idist.get_world_size())
        logger.info("rank: %s", idist.get_rank())
        logger.info("local rank: %s", idist.get_local_rank())
        logger.info("num processes per node: %s", idist.get_nproc_per_node())
        logger.info("num nodes: %s", idist.get_nnodes())
        logger.info("node rank: %s", idist.get_node_rank())
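# Usage sketch for the function above; the logger name and the config fields
# are illustrative assumptions. `setup_logger` is Ignite's helper for creating
# a pre-configured logger.
from argparse import Namespace

from ignite.utils import setup_logger

config = Namespace(batch_size=64, lr=1e-3)  # any object supporting vars() works
logger = setup_logger(name="training")
log_basic_info(logger, config)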
def get_datasets(path):
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    train_ds = datasets.CIFAR10(root=path, train=True, download=True, transform=train_transform)
    eval_ds = datasets.CIFAR10(root=path, train=False, download=True, transform=eval_transform)

    if local_rank == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    return train_ds, eval_ds
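# Follow-on sketch: feeding these datasets to `idist.auto_dataloader`, which
# installs a DistributedSampler and adapts the batch size when a distributed
# configuration is active. The path, batch sizes, and worker count are
# illustrative assumptions.
train_ds, eval_ds = get_datasets("/tmp/cifar10")
train_loader = idist.auto_dataloader(train_ds, batch_size=128, num_workers=4, shuffle=True, drop_last=True)
eval_loader = idist.auto_dataloader(eval_ds, batch_size=256, num_workers=4, shuffle=False)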
def log_basic_info(logger, config):
    msg = "\n- PyTorch version: {}".format(torch.__version__)
    msg += "\n- Ignite version: {}".format(ignite.__version__)
    msg += "\n- CUDA device name: {}".format(torch.cuda.get_device_name(idist.get_local_rank()))

    logger.info(msg)

    if idist.get_world_size() > 1:
        msg = "\nDistributed setting:"
        msg += "\tbackend: {}".format(idist.backend())
        msg += "\trank: {}".format(idist.get_rank())
        msg += "\tworld size: {}".format(idist.get_world_size())
        logger.info(msg)
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_local_rank() > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    train_dataset, test_dataset = get_dataset(config.data_dir, config.model, config.tokenizer_dir, config.max_length)

    if idist.get_local_rank() == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        drop_last=True,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )
    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )
    return train_loader, test_loader
def training(local_rank, config, **kwargs): import time time.sleep(idist.get_rank() * 0.1) print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}") t = torch.tensor([idist.get_rank()], device=idist.device()) t = idist.all_reduce(t) t = t.item() ws = idist.get_world_size() assert t == ws * (ws - 1) / 2, f"{t} vs {ws}" assert local_rank == idist.get_local_rank()
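# Launch sketch for the function above via `idist.Parallel`; the backend,
# process count, and config values are illustrative assumptions.
import ignite.distributed as idist

config = {"learning_rate": 0.01}
# Spawns 4 "gloo" processes; each calls training(local_rank, config, extra="kwarg")
with idist.Parallel(backend="gloo", nproc_per_node=4) as parallel:
    parallel.run(training, config, extra="kwarg")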
def get_datasets(*args, **kwargs):
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    # CUSTOM DATASETS GO HERE
    train_dataset = ...
    eval_dataset = ...

    if local_rank == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    return train_dataset, eval_dataset
def training(local_rank, config, **kwargs): import time time.sleep(idist.get_rank() * 0.1) print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}") t = torch.tensor([idist.get_rank()], device=idist.device()) t = idist.all_reduce(t) t = t.item() ws = idist.get_world_size() assert t == ws * (ws - 1) / 2, f"{t} vs {ws}" assert local_rank == idist.get_local_rank() # Test init method: if idist.model_name() == "native-dist": from ignite.distributed.utils import _model true_init_method = config.get("true_init_method", None) assert true_init_method is not None, true_init_method assert _model._init_method == true_init_method
def get_datasets(dataset, dataroot):
    """
    Args:
        dataset (str): Name of the dataset to use. See CLI help for details
        dataroot (str): root directory where the dataset will be stored.

    Returns:
        dataset, num_channels
    """
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    resize = T.Resize(64)
    crop = T.CenterCrop(64)
    to_tensor = T.ToTensor()
    normalize = T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    if dataset in {"imagenet", "folder", "lfw"}:
        dataset = dset.ImageFolder(root=dataroot, transform=T.Compose([resize, crop, to_tensor, normalize]))
        nc = 3

    elif dataset == "lsun":
        dataset = dset.LSUN(
            root=dataroot, classes=["bedroom_train"], transform=T.Compose([resize, crop, to_tensor, normalize])
        )
        nc = 3

    elif dataset == "cifar10":
        dataset = dset.CIFAR10(root=dataroot, download=True, transform=T.Compose([resize, to_tensor, normalize]))
        nc = 3

    elif dataset == "mnist":
        dataset = dset.MNIST(root=dataroot, download=True, transform=T.Compose([resize, to_tensor, normalize]))
        nc = 1

    elif dataset == "fake":
        dataset = dset.FakeData(size=256, image_size=(3, 64, 64), transform=to_tensor)
        nc = 3

    else:
        raise RuntimeError(f"Invalid dataset name: {dataset}")

    if local_rank == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    return dataset, nc
def train(args):
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # initialize the distributed training process, every GPU runs in a process
    dist.init_process_group(backend="nccl", init_method="env://")

    if idist.get_local_rank() == 0 and not os.path.exists(args.dir):
        # create 40 random image/mask pairs for training
        print(f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(40):
            im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))
    idist.barrier()

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    train_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    train_transforms = Compose(
        [
            LoadImaged(keys=["image", "label"]),
            AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
            ScaleIntensityd(keys="image"),
            RandCropByPosNegLabeld(
                keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
            ),
            RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
            EnsureTyped(keys=["image", "label"]),
        ]
    )

    # create a training data loader
    train_ds = Dataset(data=train_files, transform=train_transforms)
    # create a training data sampler
    train_sampler = DistributedSampler(train_ds)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=train_sampler,
    )

    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{idist.get_local_rank()}")
    torch.cuda.set_device(device)
    net = monai.networks.nets.UNet(
        spatial_dims=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    loss = monai.losses.DiceLoss(sigmoid=True)
    opt = torch.optim.Adam(net.parameters(), 1e-3)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)
    # wrap the model with DistributedDataParallel module
    net = DistributedDataParallel(net, device_ids=[device])

    train_post_transforms = Compose(
        [
            EnsureTyped(keys="pred"),
            Activationsd(keys="pred", sigmoid=True),
            AsDiscreted(keys="pred", threshold=0.5),
            KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
        ]
    )
    train_handlers = [
        LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True),
    ]
    if idist.get_rank() == 0:
        train_handlers.extend(
            [
                StatsHandler(tag_name="train_loss", output_transform=from_engine(["loss"], first=True)),
                CheckpointSaver(save_dir="./runs/", save_dict={"net": net, "opt": opt}, save_interval=2),
            ]
        )

    trainer = SupervisedTrainer(
        device=device,
        max_epochs=5,
        train_data_loader=train_loader,
        network=net,
        optimizer=opt,
        loss_function=loss,
        inferer=SimpleInferer(),
        # AMP evaluation is enabled only when the GPU supports FP16 and PyTorch >= 1.6
        amp=True if monai.utils.get_torch_version_tuple() >= (1, 6) else False,
        postprocessing=train_post_transforms,
        key_train_metric={"train_acc": Accuracy(output_transform=from_engine(["pred", "label"]), device=device)},
        train_handlers=train_handlers,
    )
    trainer.run()
    dist.destroy_process_group()
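# Entry-point sketch: the argument name matches the `args.dir` used above, but
# the parser and launch command are illustrative assumptions. With
# init_method="env://", the script expects torchrun-style environment variables,
# e.g.: torchrun --nproc_per_node=2 this_script.py --dir ./testdata
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", default="./testdata", type=str, help="directory of the synthetic data")
    args = parser.parse_args()
    train(args)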