def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_train_test_datasets(
        config["data_path"],
        **{k: config[k] for k in ["rescale_size", "rand_aug", "rand_erasing"]},
    )

    if idist.get_rank() == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )
    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    return train_loader, test_loader

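# For context: get_dataflow variants like the one above come from PyTorch-Ignite
# examples and are typically called from a training function driven by
# idist.Parallel. A minimal driver sketch -- the config values and the "gloo"
# backend are illustrative, not part of the snippet above:
import ignite.distributed as idist

def training(local_rank, config):
    # get_dataflow performs the rank-0 download + barrier synchronization internally
    train_loader, test_loader = get_dataflow(config)
    # ... build model, trainer, etc.

if __name__ == "__main__":
    config = {"data_path": "/tmp/cifar10", "batch_size": 64, "num_workers": 4,
              "rescale_size": 32, "rand_aug": None, "rand_erasing": False}
    # backend: "nccl" for multi-GPU, "gloo" for CPU, "xla-tpu" for TPUs
    with idist.Parallel(backend="gloo") as parallel:
        parallel.run(training, config)
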
def __init__(self, logger: TrainsLogger = None, output_uri: str = None, dirname: str = None, *args, **kwargs):
    self._setup_check_trains(logger, output_uri)

    if not dirname:
        dirname = ""
        if idist.get_rank() == 0:
            dirname = tempfile.mkdtemp(
                prefix="ignite_checkpoints_{}".format(datetime.now().strftime("%Y_%m_%d_%H_%M_%S_"))
            )
        if idist.get_world_size() > 1:
            dirname = idist.all_gather(dirname)[0]

        warnings.warn("TrainsSaver created a temporary checkpoints directory: {}".format(dirname))
        idist.barrier()

    # Let's set non-atomic tmp dir saving behaviour
    if "atomic" not in kwargs:
        kwargs["atomic"] = False

    super(TrainsSaver, self).__init__(dirname=dirname, *args, **kwargs)

def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_local_rank() > 0:
        # Ensure that only local rank 0 downloads the dataset
        # Thus each node will download a copy of the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"])

    if idist.get_local_rank() == 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )
    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    return train_loader, test_loader

def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    train_dataset, test_dataset = get_train_test_datasets(config.get("data_path", "."))

    if idist.get_rank() == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.get("batch_size", 512),
        num_workers=config.get("num_workers", 8),
        shuffle=True,
        drop_last=True,
    )
    config["num_iters_per_epoch"] = len(train_loader)
    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config.get("batch_size", 512),
        num_workers=config.get("num_workers", 8),
        shuffle=False,
    )
    return train_loader, test_loader

def get_dataflow(config: ConfigSchema, wlm: WeakLabelManager) -> Dict[str, DataLoader]:
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    dataset = get_dataset(config.dataset, config.data_path)
    train_split = wlm.convert_targets(dataset["train"])

    if idist.get_rank() == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_split,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        drop_last=True,
    )
    val_loader = idist.auto_dataloader(
        dataset["val"],
        batch_size=2 * config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
    )
    test_loader = idist.auto_dataloader(
        dataset["test"],
        batch_size=2 * config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
    )
    return {"train": train_loader, "val": val_loader, "test": test_loader}

def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_train_test_datasets(config["data_path"])

    if idist.get_rank() == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        pin_memory="cuda" in idist.device().type,
        drop_last=True,
    )
    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
        pin_memory="cuda" in idist.device().type,
    )
    return train_loader, test_loader

def __init__(
    self,
    logger: Optional[ClearMLLogger] = None,
    output_uri: Optional[str] = None,
    dirname: Optional[str] = None,
    *args: Any,
    **kwargs: Any,
) -> None:
    self._setup_check_clearml(logger, output_uri)

    if not dirname:
        dirname = ""
        if idist.get_rank() == 0:
            dirname = tempfile.mkdtemp(prefix=f"ignite_checkpoints_{datetime.now().strftime('%Y_%m_%d_%H_%M_%S_')}")
        if idist.get_world_size() > 1:
            dirname = idist.all_gather(dirname)[0]  # type: ignore[index, assignment]

        warnings.warn(f"ClearMLSaver created a temporary checkpoints directory: {dirname}")
        idist.barrier()

    # Let's set non-atomic tmp dir saving behaviour
    if "atomic" not in kwargs:
        kwargs["atomic"] = False

    self._checkpoint_slots = defaultdict(list)  # type: DefaultDict[Union[str, Tuple[str, str]], List[Any]]

    super(ClearMLSaver, self).__init__(dirname=dirname, *args, **kwargs)  # type: ignore[misc]

def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_rank() > 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_dataset(
        config["data_dir"], config["model"], config["tokenizer_dir"], config["max_length"]
    )

    if idist.get_rank() == 0:
        # Ensure that only rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )
    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    return train_loader, test_loader

def _test_distrib_barrier(device):
    t = torch.tensor([idist.get_rank()], device=device, dtype=torch.float)
    true_res = sum([i for i in range(idist.get_world_size())])

    if idist.get_rank() == 0:
        t += 10.0
    idist.barrier()

    tt = idist.all_reduce(t)
    assert tt.item() == true_res + 10.0

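# A minimal sketch (not from the test suite) of how a barrier check like the one
# above can be launched locally with idist.spawn; the wrapper and process count
# are illustrative. With 4 ranks the rank values sum to 6, rank 0 adds 10 before
# the barrier, and all_reduce yields 16 on every rank.
import ignite.distributed as idist

def _wrapped(local_rank, device):
    _test_distrib_barrier(device)

if __name__ == "__main__":
    # Spawn 4 "gloo" processes on CPU; each rank enters the barrier together.
    idist.spawn("gloo", _wrapped, args=("cpu",), nproc_per_node=4)
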
def get_datasets(path):
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    train_ds = datasets.CIFAR10(root=path, train=True, download=True, transform=train_transform)
    eval_ds = datasets.CIFAR10(root=path, train=False, download=True, transform=eval_transform)

    if local_rank == 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    return train_ds, eval_ds

def get_datasets(*args, **kwargs):
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    # CUSTOM DATASETS GO HERE
    train_dataset = ...
    eval_dataset = ...

    if local_rank == 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    return train_dataset, eval_dataset

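# One hypothetical way to fill in the template above, using torchvision
# FashionMNIST; any dataset whose constructor downloads on demand fits the
# same pattern:
import ignite.distributed as idist
from torchvision import datasets, transforms

def get_datasets(path):
    local_rank = idist.get_local_rank()
    if local_rank > 0:
        # Non-zero local ranks wait here until local rank 0 has downloaded the data
        idist.barrier()
    tfm = transforms.ToTensor()
    train_dataset = datasets.FashionMNIST(path, train=True, download=True, transform=tfm)
    eval_dataset = datasets.FashionMNIST(path, train=False, download=True, transform=tfm)
    if local_rank == 0:
        # Local rank 0 releases the waiting ranks once the data is on disk
        idist.barrier()
    return train_dataset, eval_dataset
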
def _test_tpu_saves_to_cpu(device, dirname):
    torch.manual_seed(0)

    h = ModelCheckpoint(dirname, _PREFIX)
    engine = Engine(lambda e, b: None)
    engine.state = State(epoch=0, iteration=1)

    model = DummyModel().to(device)
    to_save = {"model": model}

    h(engine, to_save)

    idist.barrier()

    fname = h.last_checkpoint
    assert isinstance(fname, str)
    assert os.path.join(dirname, _PREFIX) in fname
    assert os.path.exists(fname)
    loaded_objects = torch.load(fname)
    assert loaded_objects == model.cpu().state_dict()

def get_model_weights(config, logger, with_clearml):
    path = ""
    if with_clearml:
        from clearml import Model

        if idist.get_rank() > 0:
            idist.barrier()
        else:
            model_id = config.weights_path
            logger.info(f"Loading trained model: {model_id}")
            model = Model(model_id)
            assert model is not None, f"{model_id}"
            path = model.get_local_copy()
            idist.barrier()
        # broadcast is collective, so it is called by every rank
        path = idist.broadcast(path, src=0)
    else:
        path = config.weights_path

    logger.info(f"Loading {path}")
    assert Path(path).exists(), f"{path} is not found"
    return torch.load(path)

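# The broadcast step above in isolation: idist.broadcast must be called by every
# rank, with non-zero ranks passing a placeholder of the same type (here an
# empty string, as in the function above). A small sketch -- the path is
# illustrative:
import ignite.distributed as idist

path = ""
if idist.get_rank() == 0:
    path = "/tmp/checkpoints/best_model.pt"  # hypothetical local copy on rank 0
path = idist.broadcast(path, src=0)  # every rank now holds rank 0's value
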
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_local_rank() > 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    train_dataset, test_dataset = get_dataset(
        config.data_dir, config.model, config.tokenizer_dir, config.max_length
    )

    if idist.get_local_rank() == 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    # Setup data loader also adapted to distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        drop_last=True,
{% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
{% endif %}
    )
    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
{% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
{% endif %}
    )
    return train_loader, test_loader

def _test_save_model_optimizer_lr_scheduler_with_state_dict(device, on_zero_rank=False):
    if idist.get_rank() == 0:
        clearml.Task.current_task = Mock(return_value=object())
        clearml.binding.frameworks.WeightsFileHandler.create_output_model = MagicMock()

    torch.manual_seed(23)

    model = DummyModel().to(device)
    optim = torch.optim.SGD(model.parameters(), lr=0.1)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)

    def update_fn(engine, batch):
        x = torch.rand((4, 2)).to(device)
        optim.zero_grad()
        y = model(x)
        # Below code raises: RuntimeError: torch_xla/csrc/tensor_impl.cpp:144 : XLA tensors do not have storage
        # Probably related to https://github.com/pytorch/xla/issues/2576
        # loss = y.pow(2.0).sum()
        loss = y.sum()
        loss.backward()
        if idist.has_xla_support:
            import torch_xla.core.xla_model as xm

            xm.optimizer_step(optim, barrier=True)
        else:
            optim.step()
        lr_scheduler.step()

    engine = Engine(update_fn)

    to_save = {"model": model, "optimizer": optim, "lr_scheduler": lr_scheduler}

    with pytest.warns(UserWarning, match=r"ClearMLSaver created a temporary checkpoints directory"):
        clearml_saver = ClearMLSaver()

    if (not on_zero_rank) or (on_zero_rank and idist.get_rank() == 0):
        checkpoint = Checkpoint(to_save=to_save, save_handler=clearml_saver, n_saved=1)
        engine.add_event_handler(Events.EPOCH_COMPLETED, checkpoint)

    engine.run([0], max_epochs=4)

    idist.barrier()

    saved_objects = sorted(os.listdir(clearml_saver.dirname))
    # saved object is ['PREFIX_checkpoint_3.pt', ]
    saved_checkpoint = os.path.join(clearml_saver.dirname, saved_objects[0])

    if idist.has_xla_support:
        device = "cpu"

    loaded_obj = torch.load(saved_checkpoint, map_location=device)
    for f in ["model", "optimizer", "lr_scheduler"]:
        assert f in loaded_obj
    loaded_model_state_dict = loaded_obj["model"]
    loaded_optimizer_state_dict = loaded_obj["optimizer"]
    loaded_lr_scheduler_state_dict = loaded_obj["lr_scheduler"]

    assert isinstance(loaded_model_state_dict, dict)
    assert isinstance(loaded_optimizer_state_dict, dict)
    assert isinstance(loaded_lr_scheduler_state_dict, dict)

    # Specifically move device to CPU first
    model_state_dict = model.cpu().state_dict()
    for key in model_state_dict.keys():
        assert key in loaded_model_state_dict
        model_value = model_state_dict[key]
        loaded_model_value = loaded_model_state_dict[key]
        assert (model_value.cpu().numpy() == loaded_model_value.cpu().numpy()).all()

    optim_state_dict = optim.state_dict()
    for key in optim_state_dict.keys():
        assert key in loaded_optimizer_state_dict
        optim_value = optim_state_dict[key]
        loaded_optim_value = loaded_optimizer_state_dict[key]
        if idist.get_rank() == 0:
            assert optim_value == loaded_optim_value

    lr_scheduler_state_dict = lr_scheduler.state_dict()
    for key in lr_scheduler_state_dict.keys():
        assert key in loaded_lr_scheduler_state_dict
        lr_scheduler_value = lr_scheduler_state_dict[key]
        loaded_lr_scheduler_value = loaded_lr_scheduler_state_dict[key]
        assert lr_scheduler_value == loaded_lr_scheduler_value

def get_datasets(dataset, dataroot):
    """
    Args:
        dataset (str): Name of the dataset to use. See CLI help for details
        dataroot (str): root directory where the dataset will be stored.

    Returns:
        dataset, num_channels
    """
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    resize = T.Resize(64)
    crop = T.CenterCrop(64)
    to_tensor = T.ToTensor()
    normalize = T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    if dataset in {"imagenet", "folder", "lfw"}:
        dataset = dset.ImageFolder(root=dataroot, transform=T.Compose([resize, crop, to_tensor, normalize]))
        nc = 3
    elif dataset == "lsun":
        dataset = dset.LSUN(
            root=dataroot, classes=["bedroom_train"], transform=T.Compose([resize, crop, to_tensor, normalize])
        )
        nc = 3
    elif dataset == "cifar10":
        dataset = dset.CIFAR10(root=dataroot, download=True, transform=T.Compose([resize, to_tensor, normalize]))
        nc = 3
    elif dataset == "mnist":
        dataset = dset.MNIST(root=dataroot, download=True, transform=T.Compose([resize, to_tensor, normalize]))
        nc = 1
    elif dataset == "fake":
        dataset = dset.FakeData(size=256, image_size=(3, 64, 64), transform=to_tensor)
        nc = 3
    else:
        raise RuntimeError(f"Invalid dataset name: {dataset}")

    if local_rank == 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    return dataset, nc

def _test_save_model_optimizer_lr_scheduler_with_state_dict(device, dirname, on_zero_rank=False):
    torch.manual_seed(23)

    model = DummyModel().to(device)
    optim = torch.optim.SGD(model.parameters(), lr=0.1)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=0.5)

    def update_fn(engine, batch):
        x = torch.rand((4, 1)).to(device)
        optim.zero_grad()
        y = model(x)
        loss = y.pow(2.0).sum()
        loss.backward()
        if idist.has_xla_support:
            import torch_xla.core.xla_model as xm

            xm.optimizer_step(optim, barrier=True)
        else:
            optim.step()
        lr_scheduler.step()

    engine = Engine(update_fn)

    if (not on_zero_rank) or (on_zero_rank and idist.get_rank() == 0):
        handler = ModelCheckpoint(dirname, _PREFIX, create_dir=True, n_saved=1)
        engine.add_event_handler(
            Events.EPOCH_COMPLETED, handler, {"model": model, "optimizer": optim, "lr_scheduler": lr_scheduler}
        )

    engine.run([0], max_epochs=4)

    idist.barrier()

    saved_objects = sorted(os.listdir(dirname))
    # saved object is ['PREFIX_checkpoint_3.pt', ]
    saved_checkpoint = os.path.join(dirname, saved_objects[0])

    if idist.has_xla_support:
        device = "cpu"

    loaded_obj = torch.load(saved_checkpoint, map_location=device)
    for f in ["model", "optimizer", "lr_scheduler"]:
        assert f in loaded_obj
    loaded_model_state_dict = loaded_obj["model"]
    loaded_optimizer_state_dict = loaded_obj["optimizer"]
    loaded_lr_scheduler_state_dict = loaded_obj["lr_scheduler"]

    assert isinstance(loaded_model_state_dict, dict)
    assert isinstance(loaded_optimizer_state_dict, dict)
    assert isinstance(loaded_lr_scheduler_state_dict, dict)

    # Specifically move device to CPU first
    model_state_dict = model.cpu().state_dict()
    for key in model_state_dict.keys():
        assert key in loaded_model_state_dict
        model_value = model_state_dict[key]
        loaded_model_value = loaded_model_state_dict[key]
        assert model_value.cpu().numpy() == loaded_model_value.cpu().numpy()

    optim_state_dict = optim.state_dict()
    for key in optim_state_dict.keys():
        assert key in loaded_optimizer_state_dict
        optim_value = optim_state_dict[key]
        loaded_optim_value = loaded_optimizer_state_dict[key]
        if idist.get_rank() == 0:
            assert optim_value == loaded_optim_value

    lr_scheduler_state_dict = lr_scheduler.state_dict()
    for key in lr_scheduler_state_dict.keys():
        assert key in loaded_lr_scheduler_state_dict
        lr_scheduler_value = lr_scheduler_state_dict[key]
        loaded_lr_scheduler_value = loaded_lr_scheduler_state_dict[key]
        assert lr_scheduler_value == loaded_lr_scheduler_value

def train(args):
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # initialize the distributed training process, every GPU runs in a process
    dist.init_process_group(backend="nccl", init_method="env://")

    if idist.get_local_rank() == 0 and not os.path.exists(args.dir):
        # create 40 random image, mask pairs for training
        print(f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(40):
            im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))
    idist.barrier()

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    train_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    train_transforms = Compose(
        [
            LoadImaged(keys=["image", "label"]),
            AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
            ScaleIntensityd(keys="image"),
            RandCropByPosNegLabeld(
                keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
            ),
            RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
            EnsureTyped(keys=["image", "label"]),
        ]
    )

    # create a training data loader
    train_ds = Dataset(data=train_files, transform=train_transforms)
    # create a training data sampler
    train_sampler = DistributedSampler(train_ds)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=train_sampler,
    )

    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{idist.get_local_rank()}")
    torch.cuda.set_device(device)
    net = monai.networks.nets.UNet(
        spatial_dims=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    loss = monai.losses.DiceLoss(sigmoid=True)
    opt = torch.optim.Adam(net.parameters(), 1e-3)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)
    # wrap the model with DistributedDataParallel module
    net = DistributedDataParallel(net, device_ids=[device])

    train_post_transforms = Compose(
        [
            EnsureTyped(keys="pred"),
            Activationsd(keys="pred", sigmoid=True),
            AsDiscreted(keys="pred", threshold=0.5),
            KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
        ]
    )
    train_handlers = [
        LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True),
    ]
    if idist.get_rank() == 0:
        train_handlers.extend(
            [
                StatsHandler(tag_name="train_loss", output_transform=from_engine(["loss"], first=True)),
                CheckpointSaver(save_dir="./runs/", save_dict={"net": net, "opt": opt}, save_interval=2),
            ]
        )

    trainer = SupervisedTrainer(
        device=device,
        max_epochs=5,
        train_data_loader=train_loader,
        network=net,
        optimizer=opt,
        loss_function=loss,
        inferer=SimpleInferer(),
        # if the GPU has no FP16 support or PyTorch version < 1.6, AMP evaluation will not be enabled
        amp=True if monai.utils.get_torch_version_tuple() >= (1, 6) else False,
        postprocessing=train_post_transforms,
        key_train_metric={"train_acc": Accuracy(output_transform=from_engine(["pred", "label"]), device=device)},
        train_handlers=train_handlers,
    )
    trainer.run()
    dist.destroy_process_group()

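# Launch note: this script calls dist.init_process_group(..., init_method="env://"),
# so it expects the environment variables set by torchrun / torch.distributed.launch,
# e.g. (the script name and data-directory flag are illustrative, not confirmed by
# the snippet above):
#   torchrun --nproc_per_node=2 unet_training_ddp.py --dir ./testdata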