Example #1
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_local_rank() > 0:
        # Ensure that only local rank 0 downloads the dataset;
        # thus each node downloads its own copy of the dataset
        idist.barrier()

    train_dataset, test_dataset = utils.get_train_test_datasets(
        config["data_path"])

    if idist.get_local_rank() == 0:
        # Local rank 0 is done downloading: release the waiting processes
        idist.barrier()

    # Set up data loaders adapted to the distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=True,
        drop_last=True,
    )

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config["batch_size"],
        num_workers=config["num_workers"],
        shuffle=False,
    )
    return train_loader, test_loader
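
A minimal sketch of how a dataflow helper like this is typically consumed (the training function, backend and config values below are illustrative, not part of the example above): idist.Parallel spawns or attaches to the distributed processes and invokes the given function on each of them with its local rank.

import ignite.distributed as idist

def training(local_rank, config):
    train_loader, test_loader = get_dataflow(config)
    # ... build model, optimizer and engines, then run the training loop ...

if __name__ == "__main__":
    config = {"data_path": "/tmp/data", "batch_size": 64, "num_workers": 4}
    # backend could also be "nccl", "xla-tpu" or "horovod" depending on the setup
    with idist.Parallel(backend="gloo", nproc_per_node=2) as parallel:
        parallel.run(training, config)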
Example #2
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None):
    assert idist.backend() == backend, f"{idist.backend()} vs {backend}"

    this_device = idist.device()
    assert isinstance(this_device, torch.device)
    if backend in ("nccl", "horovod") and "cuda" in this_device.type:
        true_device = torch.device(f"{true_device}:{local_rank}")
        assert this_device == true_device, f"{this_device} vs {true_device}"
    elif backend in ("gloo", "horovod"):
        assert this_device == torch.device(true_device)
    elif backend == "xla-tpu":
        assert true_device in this_device.type

    if rank is None:
        if idist.model_name() == "native-dist":
            rank = dist.get_rank()

    if rank is not None:
        assert idist.get_rank() == rank

    assert idist.get_world_size() == ws
    assert idist.get_local_rank() == local_rank

    assert idist.model_name() in ("native-dist", "xla-dist", "horovod-dist")

    _sanity_check()
Example #3
def test_no_distrib(capsys):

    assert idist.backend() is None
    if torch.cuda.is_available():
        assert idist.device().type == "cuda"
    else:
        assert idist.device().type == "cpu"
    assert idist.get_rank() == 0
    assert idist.get_world_size() == 1
    assert idist.get_local_rank() == 0
    assert idist.model_name() == "serial"

    from ignite.distributed.utils import _model, _SerialModel

    _sanity_check()
    assert isinstance(_model, _SerialModel)

    idist.show_config()
    captured = capsys.readouterr()
    out = captured.err.split("\r")
    out = list(map(lambda x: x.strip(), out))
    out = list(filter(None, out))
    assert "ignite.distributed.utils INFO: distributed configuration: serial" in out[
        -1]
    assert "ignite.distributed.utils INFO: backend: None" in out[-1]
    if torch.cuda.is_available():
        assert "ignite.distributed.utils INFO: device: cuda" in out[-1]
    else:
        assert "ignite.distributed.utils INFO: device: cpu" in out[-1]
    assert "ignite.distributed.utils INFO: rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: local rank: 0" in out[-1]
    assert "ignite.distributed.utils INFO: world size: 1" in out[-1]
Example #4
def _test_distrib_config(local_rank, backend, ws, true_device, rank=None):
    assert idist.backend() == backend, "{} vs {}".format(
        idist.backend(), backend)

    this_device = idist.device()
    assert isinstance(this_device, torch.device)
    if backend == "nccl":
        true_device = torch.device("{}:{}".format(true_device, local_rank))
        assert this_device == true_device, "{} vs {}".format(
            this_device, true_device)
    elif backend == "gloo":
        assert this_device == torch.device(true_device)
    elif backend == "xla-tpu":
        assert true_device in this_device.type

    if rank is None:
        if idist.model_name() == "native-dist":
            rank = dist.get_rank()
            assert idist.get_rank() == rank

    assert idist.get_world_size() == ws
    assert idist.get_local_rank() == local_rank

    assert idist.model_name() in ("native-dist", "xla-dist")

    _sanity_check()
Example #5
def _test_func(index, ws, device, backend, true_init_method):
    assert 0 <= index < ws
    assert index == idist.get_local_rank()
    assert ws == idist.get_world_size()
    assert torch.device(device).type == idist.device().type
    assert backend == idist.backend()

    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        assert _model._init_method == true_init_method
Example #6
def log_basic_info(logger: Logger, config: Any) -> None:
    """Logging about pytorch, ignite, configurations, gpu system
    distributed settings.

    Parameters
    ----------
    logger
        Logger instance for logging
    config
        config object to log
    """
    import ignite

    logger.info("PyTorch version: %s", torch.__version__)
    logger.info("Ignite version: %s", ignite.__version__)
    if torch.cuda.is_available():
        # explicitly import cudnn because
        # torch.backends.cudnn cannot be pickled when Horovod spawns processes
        from torch.backends import cudnn

        logger.info("GPU device: %s", torch.cuda.get_device_name(idist.get_local_rank()))
        logger.info("CUDA version: %s", torch.version.cuda)
        logger.info("CUDNN version: %s", cudnn.version())

    logger.info("Configuration: %s", pformat(vars(config)))

    if idist.get_world_size() > 1:
        logger.info("distributed configuration: %s", idist.model_name())
        logger.info("backend: %s", idist.backend())
        logger.info("device: %s", idist.device().type)
        logger.info("hostname: %s", idist.hostname())
        logger.info("world size: %s", idist.get_world_size())
        logger.info("rank: %s", idist.get_rank())
        logger.info("local rank: %s", idist.get_local_rank())
        logger.info("num processes per node: %s", idist.get_nproc_per_node())
        logger.info("num nodes: %s", idist.get_nnodes())
        logger.info("node rank: %s", idist.get_node_rank())
Example #7
def get_datasets(path):
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    train_ds = datasets.CIFAR10(root=path, train=True, download=True, transform=train_transform)
    eval_ds = datasets.CIFAR10(root=path, train=False, download=True, transform=eval_transform)

    if local_rank == 0:
        # Local rank 0 is done downloading: release the waiting processes
        idist.barrier()

    return train_ds, eval_ds
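
The download-guard pattern above recurs throughout these examples; a reusable sketch of it, written as a context manager (rank_zero_first is our name for it, not an ignite API), could look like this:

from contextlib import contextmanager

import ignite.distributed as idist

@contextmanager
def rank_zero_first():
    # every process except local rank 0 waits here
    if idist.get_local_rank() > 0:
        idist.barrier()
    yield
    # local rank 0 arrives last and releases the waiting processes
    if idist.get_local_rank() == 0:
        idist.barrier()

Usage: with rank_zero_first(): train_ds, eval_ds = get_datasets(path) — local rank 0 downloads while the others wait, after which they read the cached copy.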
Example #8
def log_basic_info(logger, config):

    msg = "\n- PyTorch version: {}".format(torch.__version__)
    msg += "\n- Ignite version: {}".format(ignite.__version__)
    msg += "\n- Cuda device name: {}".format(
        torch.cuda.get_device_name(idist.get_local_rank()))

    logger.info(msg)

    if idist.get_world_size() > 1:
        msg = "\nDistributed setting:"
        msg += "\tbackend: {}".format(idist.backend())
        msg += "\trank: {}".format(idist.get_rank())
        msg += "\tworld size: {}".format(idist.get_world_size())
        logger.info(msg)
Example #9
def get_dataflow(config):
    # - Get train/test datasets
    if idist.get_local_rank() > 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    train_dataset, test_dataset = get_dataset(
        config.data_dir, config.model, config.tokenizer_dir, config.max_length
    )

    if idist.get_local_rank() == 0:
        # Local rank 0 is done downloading: release the waiting processes
        idist.barrier()

    # Set up data loaders adapted to the distributed config: nccl, gloo, xla-tpu
    train_loader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        shuffle=True,
        drop_last=True,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )

    test_loader = idist.auto_dataloader(
        test_dataset,
        batch_size=2 * config.batch_size,
        num_workers=config.num_workers,
        shuffle=False,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )
    return train_loader, test_loader
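
Note: the {% if use_distributed_training and not use_distributed_launcher %} ... {% endif %} lines are Jinja2 template directives, so this snippet comes from a code-generation template rather than a directly runnable module; they are resolved when the project is generated and must be rendered (or removed) before the function is valid Python.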
Example #10
def training(local_rank, config, **kwargs):

    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs,
          f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws}"
    assert local_rank == idist.get_local_rank()
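
A hedged sketch of launching a check function like this one with idist.spawn, which starts nproc_per_node processes and passes each one its local rank as the first argument (the backend and process count here are illustrative):

import ignite.distributed as idist

config = {"batch_size": 32}
# each of the 2 spawned processes calls training(local_rank, config)
idist.spawn("gloo", training, args=(config,), nproc_per_node=2)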
Example #11
def get_datasets(*args, **kwargs):
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    # CUSTOM DATASETS GO HERE
    train_dataset = ...
    eval_dataset = ...

    if local_rank == 0:
        # Local rank 0 is done downloading: release the waiting processes
        idist.barrier()

    return train_dataset, eval_dataset
Example #12
def training(local_rank, config, **kwargs):

    import time

    time.sleep(idist.get_rank() * 0.1)

    print(idist.get_rank(), ": run with config:", config, "- kwargs:", kwargs, f"- backend={idist.backend()}")

    t = torch.tensor([idist.get_rank()], device=idist.device())
    t = idist.all_reduce(t)
    t = t.item()
    ws = idist.get_world_size()
    assert t == ws * (ws - 1) / 2, f"{t} vs {ws}"
    assert local_rank == idist.get_local_rank()

    # Test init method:
    if idist.model_name() == "native-dist":
        from ignite.distributed.utils import _model

        true_init_method = config.get("true_init_method", None)
        assert true_init_method is not None, true_init_method
        assert _model._init_method == true_init_method
Example #13
def get_datasets(dataset, dataroot):
    """

    Args:
        dataset (str): Name of the dataset to use. See CLI help for details
        dataroot (str): root directory where the dataset will be stored.

    Returns:
        dataset, num_channels
    """
    local_rank = idist.get_local_rank()

    if local_rank > 0:
        # Ensure that only local rank 0 downloads the dataset
        idist.barrier()

    resize = T.Resize(64)
    crop = T.CenterCrop(64)
    to_tensor = T.ToTensor()
    normalize = T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))

    if dataset in {"imagenet", "folder", "lfw"}:
        dataset = dset.ImageFolder(root=dataroot,
                                   transform=T.Compose(
                                       [resize, crop, to_tensor, normalize]))
        nc = 3

    elif dataset == "lsun":
        dataset = dset.LSUN(root=dataroot,
                            classes=["bedroom_train"],
                            transform=T.Compose(
                                [resize, crop, to_tensor, normalize]))
        nc = 3

    elif dataset == "cifar10":
        dataset = dset.CIFAR10(root=dataroot,
                               download=True,
                               transform=T.Compose(
                                   [resize, to_tensor, normalize]))
        nc = 3

    elif dataset == "mnist":
        dataset = dset.MNIST(root=dataroot,
                             download=True,
                             transform=T.Compose(
                                 [resize, to_tensor, normalize]))
        nc = 1

    elif dataset == "fake":
        dataset = dset.FakeData(size=256,
                                image_size=(3, 64, 64),
                                transform=to_tensor)
        nc = 3

    else:
        raise RuntimeError(f"Invalid dataset name: {dataset}")

    if local_rank == 0:
        # Local rank 0 is done downloading: release the waiting processes
        idist.barrier()

    return dataset, nc
Example #14
def train(args):
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    # initialize the distributed training process, every GPU runs in a process
    dist.init_process_group(backend="nccl", init_method="env://")

    if idist.get_local_rank() == 0 and not os.path.exists(args.dir):
        # create 40 random image/mask pairs for training
        print(f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(40):
            im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))
    idist.barrier()

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    train_files = [{"image": img, "label": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    train_transforms = Compose(
        [
            LoadImaged(keys=["image", "label"]),
            AsChannelFirstd(keys=["image", "label"], channel_dim=-1),
            ScaleIntensityd(keys="image"),
            RandCropByPosNegLabeld(
                keys=["image", "label"], label_key="label", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4
            ),
            RandRotate90d(keys=["image", "label"], prob=0.5, spatial_axes=[0, 2]),
            EnsureTyped(keys=["image", "label"]),
        ]
    )

    # create a training data loader
    train_ds = Dataset(data=train_files, transform=train_transforms)
    # create a training data sampler
    train_sampler = DistributedSampler(train_ds)
    # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training
    train_loader = DataLoader(
        train_ds,
        batch_size=2,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=train_sampler,
    )

    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{idist.get_local_rank()}")
    torch.cuda.set_device(device)
    net = monai.networks.nets.UNet(
        spatial_dims=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    loss = monai.losses.DiceLoss(sigmoid=True)
    opt = torch.optim.Adam(net.parameters(), 1e-3)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=2, gamma=0.1)
    # wrap the model with DistributedDataParallel module
    net = DistributedDataParallel(net, device_ids=[device])

    train_post_transforms = Compose(
        [
            EnsureTyped(keys="pred"),
            Activationsd(keys="pred", sigmoid=True),
            AsDiscreted(keys="pred", threshold=0.5),
            KeepLargestConnectedComponentd(keys="pred", applied_labels=[1]),
        ]
    )
    train_handlers = [
        LrScheduleHandler(lr_scheduler=lr_scheduler, print_lr=True),
    ]
    if idist.get_rank() == 0:
        train_handlers.extend(
            [
                StatsHandler(tag_name="train_loss", output_transform=from_engine(["loss"], first=True)),
                CheckpointSaver(save_dir="./runs/", save_dict={"net": net, "opt": opt}, save_interval=2),
            ]
        )

    trainer = SupervisedTrainer(
        device=device,
        max_epochs=5,
        train_data_loader=train_loader,
        network=net,
        optimizer=opt,
        loss_function=loss,
        inferer=SimpleInferer(),
        # AMP is used only with PyTorch >= 1.6 (and requires FP16 support on the GPU)
        amp=monai.utils.get_torch_version_tuple() >= (1, 6),
        postprocessing=train_post_transforms,
        key_train_metric={"train_acc": Accuracy(output_transform=from_engine(["pred", "label"]), device=device)},
        train_handlers=train_handlers,
    )
    trainer.run()
    dist.destroy_process_group()
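
Because this script initializes the process group with init_method="env://", it relies on the standard torch.distributed environment variables. A minimal, hypothetical entry point (the script name below is a placeholder; the --dir argument matches the usage above):

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dir", default="./testdata", help="directory for the synthetic data")
    args = parser.parse_args()
    # launch with, e.g.:
    #   torchrun --nproc_per_node=2 script.py --dir ./testdata
    # torchrun sets RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT,
    # which init_method="env://" reads
    train(args)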