Exemplo n.º 1
0
def test_segmentation_to_one_hot(use_gpu: bool, input_on_gpu: bool) -> None:
    """
    Checks the shape, dtype, device and contents of the result of segmentation_to_one_hot, for an input
    segmentation in which every element carries the same class index.

    :param use_gpu: Passed on to segmentation_to_one_hot as its second argument.
    :param input_on_gpu: If True, the input segmentation is created on the "cuda" device, otherwise on "cpu".
    """
    # Settings to test on large scale:
    # B = 16
    # C = 2
    # dim = (50, 400, 400)
    B = 2
    C = 3
    dim = (4, 1, 2)
    input_size = (B, C) + dim
    actual_class = 5
    # This is deliberately replicated from get_datatype_for_image_tensors
    dtype = torch.float16 if is_gpu_available() else torch.float32
    device = "cuda" if input_on_gpu else "cpu"
    seg = torch.ones(input_size, dtype=torch.uint8, device=device) * actual_class
    start_time = time.time()
    one_hot = segmentation_to_one_hot(seg, use_gpu, result_dtype=dtype)
    elapsed = time.time() - start_time
    print(f"Computed one-hot in {elapsed:0.2f}sec")
    assert one_hot.shape == (B, C * HDF5_NUM_SEGMENTATION_CLASSES) + dim
    assert one_hot.dtype == dtype
    # The result must be on the same device as the input. In particular, that means we can feed in a CPU
    # tensor, do the computation on the GPU, and still get back a CPU tensor.
    assert seg.device == one_hot.device
    for i in range(C * HDF5_NUM_SEGMENTATION_CLASSES):
        # Compare in float32 so that the check works for both possible result dtypes.
        channel = one_hot[:, i, ...].float()
        if i % HDF5_NUM_SEGMENTATION_CLASSES == actual_class:
            # The output channels that correspond to actual_class must be all ones ...
            expected = torch.ones((B,) + dim, device=one_hot.device)
            assert channel.allclose(expected), f"Dimension {i} should have all ones"
        else:
            # ... and every other output channel must be all zeros.
            expected = torch.zeros((B,) + dim, device=one_hot.device)
            assert channel.allclose(expected), f"Dimension {i} should have all zeros"
Exemplo n.º 2
0
 def use_gpu(self) -> bool:  # type: ignore
     """
     Tells whether a CUDA capable GPU is present and should be used.

     The answer is cached in self._use_gpu: while that field is None, it is filled in from
     is_gpu_available() on first access. The stored value is returned from then on.
     """
     cached = self._use_gpu
     if cached is not None:
         return cached
     # Imported lazily, so that the whole file does not depend on pytorch.
     from InnerEye.ML.utils.ml_util import is_gpu_available
     self._use_gpu = is_gpu_available()
     return self._use_gpu
Exemplo n.º 3
0
 def use_gpu(self) -> bool:
     """
     Tells whether a GPU should be used: True only if self.max_num_gpus is not 0 and is_gpu_available()
     reports a GPU. False otherwise (self.max_num_gpus == 0, or no GPU available).
     """
     gpus_permitted = self.max_num_gpus != 0
     if gpus_permitted:
         # Only query the hardware when the configuration allows GPUs at all.
         from InnerEye.ML.utils.ml_util import is_gpu_available
         return is_gpu_available()
     return False
Exemplo n.º 4
0
 def use_gpu(self, value: bool) -> None:
     """
     Stores the flag that controls the use of the GPU in self._use_gpu.

     :param value: The new flag value. A false value is stored without any hardware check.
     :raises ValueError: If value is True, but is_gpu_available() reports that no GPU is present.
     """
     gpu_missing = False
     if value:
         # Imported lazily, so that the whole file does not depend on pytorch.
         from InnerEye.ML.utils.ml_util import is_gpu_available
         gpu_missing = not is_gpu_available()
     if gpu_missing:
         raise ValueError("Can't set use_gpu to True if there is not CUDA capable GPU present.")
     self._use_gpu = value
    def run(self) -> None:
        """
        Logs GPU and CPU utilization in an endless loop, sleeping self._interval_seconds between iterations.

        In every iteration, the load and memory utilization of each GPU reported by GPUtil (plus their
        averages across GPUs) and the CPU load and memory utilization reported by psutil are written to a
        tensorboardX SummaryWriter, and also to the AzureML run context unless that is an offline run.
        If self._interval_seconds is not larger than 0, a warning is logged and the method returns
        without monitoring anything.
        """
        if self._interval_seconds <= 0:
            logging.warning(
                "Resource monitoring requires an interval that is larger than 0 seconds, but "
                "got: {}. Exiting.".format(self._interval_seconds))
            # Really exit here: continuing would call time.sleep with a non-positive interval below.
            return
        logging.info("Process ({}) started with pid: {}".format(
            self.name, self.pid))
        # create the TB writers and AML run context for this process
        writer = tensorboardX.SummaryWriter(self._tb_log_file_path)
        run_context = Run.get_context()
        is_offline_run = is_offline_run_context(run_context)
        current_iteration = 0

        def log_to_azure_and_tb(label: str, value: float) -> None:
            # current_iteration is read from the enclosing scope at call time, hence each call
            # uses the iteration counter of the loop iteration that is in progress.
            writer.add_scalar(label, value, current_iteration)
            if not is_offline_run:
                run_context.log(label, value)

        gpu_available = is_gpu_available()
        while True:
            if gpu_available:
                gpus: List[GPU] = GPUtil.getGPUs()
                # Skip GPU logging if no devices are reported: the averages are undefined then.
                if gpus:
                    for gpu in gpus:
                        log_to_azure_and_tb(
                            'Diagnostics/GPU_{}_Load_Percent'.format(gpu.id),
                            gpu.load * 100)
                        log_to_azure_and_tb(
                            'Diagnostics/GPU_{}_MemUtil_Percent'.format(
                                gpu.id), gpu.memoryUtil * 100)
                    # log the average GPU usage
                    log_to_azure_and_tb(
                        'Diagnostics/Average_GPU_Load_Percent',
                        statistics.mean(gpu.load for gpu in gpus) * 100)
                    log_to_azure_and_tb(
                        'Diagnostics/Average_GPU_MemUtil_Percent',
                        statistics.mean(gpu.memoryUtil for gpu in gpus) * 100)

            # log the CPU util
            log_to_azure_and_tb('Diagnostics/CPU_Util_Percent',
                                psutil.cpu_percent(interval=None))
            # Field 2 of psutil.virtual_memory() is logged as the memory utilization percentage.
            log_to_azure_and_tb('Diagnostics/CPU_MemUtil_Percent',
                                psutil.virtual_memory()[2])

            current_iteration += 1
            # pause the thread for the requested delay
            time.sleep(self._interval_seconds)
def load_checkpoint(model: torch.nn.Module,
                    path_to_checkpoint: Path,
                    optimizer: Optional[Optimizer] = None,
                    optimizer_to_gpu: Optional[bool] = False) -> Optional[int]:
    """
    Restores the model weights, and optionally the optimizer state, from a checkpoint file.
    The checkpoint is read via torch.load. Its entries 'state_dict' and 'epoch' are used, plus 'opt_dict'
    if an optimizer is given. The provided model must match the stored model.

    :param model: The network to restore. If it is a torch.nn.DataParallel object, the weights are loaded
    into the wrapped module, otherwise into the model itself.
    :param path_to_checkpoint: The path to the checkpoint file.
    :param optimizer: The optimizer used for training. If not None, its state is restored from the checkpoint.
    :param optimizer_to_gpu: If true, tensors among the top-level values of the stored optimizer state are
    moved to the GPU before the state is loaded into the optimizer.
    :return: The epoch stored in the checkpoint, or None if the checkpoint file does not exist.
    """
    checkpoint_exists = path_to_checkpoint.is_file()
    if not checkpoint_exists:
        logging.warning(
            f'No checkpoint found at {path_to_checkpoint} current working dir {os.getcwd()}'
        )
        return None

    logging.info(f"Loading checkpoint {path_to_checkpoint}")
    # To help with model debugging, a checkpoint that was written on a GPU machine can be read on a
    # machine without GPU: all tensors are then mapped to the CPU. Only feasible for small models.
    if is_gpu_available():
        map_location = None
    else:
        map_location = 'cpu'
    checkpoint = torch.load(str(path_to_checkpoint), map_location=map_location)

    # A DataParallel wrapper holds the actual network in its .module attribute.
    network = model.module if isinstance(model, torch.nn.DataParallel) else model
    network.load_state_dict(checkpoint['state_dict'])

    if optimizer is not None:
        optimizer_state = checkpoint['opt_dict']
        if optimizer_to_gpu:
            # https://github.com/pytorch/pytorch/issues/2830
            # NOTE(review): only top-level entries of the optimizer state are inspected here, tensors
            # nested further down are left where they are - confirm that this is intended.
            tensor_keys = [key for key, val in optimizer_state.items() if isinstance(val, torch.Tensor)]
            for key in tensor_keys:
                optimizer_state[key] = optimizer_state[key].cuda()
        optimizer.load_state_dict(optimizer_state)

    stored_epoch = checkpoint['epoch']
    logging.info("Loaded checkpoint (epoch: {})".format(stored_epoch))
    return stored_epoch
 def run(self) -> None:
     """
     Collects GPU and CPU utilization metrics in an endless loop, sleeping self._interval_seconds
     between iterations.

     In every iteration, the GPUs reported by GPUtil are passed to self.update_metrics (only if
     is_gpu_available() returned True at startup), the CPU load and memory utilization reported by psutil
     are passed to self.log_to_tensorboard, self.step is incremented, and self.store_to_file is called.
     If self._interval_seconds is not larger than 0, a warning is logged, self.kill() is called, and the
     method returns without monitoring anything.
     """
     if self._interval_seconds <= 0:
         logging.warning(
             "Resource monitoring requires an interval that is larger than 0 seconds, but "
             f"got: {self._interval_seconds}. Exiting.")
         self.kill()
         # If kill() returns, do not fall through into the monitoring loop: it would call
         # time.sleep with a non-positive interval.
         return
     logging.info(f"Process '{self.name}' started with pid: {self.pid}")
     gpu_available = is_gpu_available()
     while True:
         if gpu_available:
             self.update_metrics(GPUtil.getGPUs())
         # log the CPU utilization
         self.log_to_tensorboard('CPU/Load_Percent',
                                 psutil.cpu_percent(interval=None))
         # Field 2 of psutil.virtual_memory() is logged as the memory utilization percentage.
         self.log_to_tensorboard('CPU/MemUtil_Percent',
                                 psutil.virtual_memory()[2])
         self.step += 1
         self.store_to_file()
         # pause the thread for the requested delay
         time.sleep(self._interval_seconds)
Exemplo n.º 8
0
    config.local_dataset = Path()
    config.dataset_data_frame = pd.read_csv(StringIO(dataset_contents), sep=",", dtype=str)
    # Patch the load_images function that will be called once we access a dataset item
    image_and_seg = ImageAndSegmentations[np.ndarray](images=np.zeros(scan_size, dtype=np.float32),
                                                      segmentations=np.ones(scan_size, dtype=np.uint8))
    with mock.patch('InnerEye.ML.utils.io_util.load_image_in_known_formats', return_value=image_and_seg):
        azure_config = get_default_azure_config()
        azure_config.train = True
        MLRunner(config, azure_config).run()
        # No further asserts here because the models are still in experimental state. Most errors would come
        # from having invalid model architectures, which would throw runtime errors during training.
        # Verified manually that the cross entropy on the Val set that appears during training, and the
        # cross entropy when running on the Val set in test mode are the same.


@pytest.mark.parametrize("use_gpu", [True, False] if is_gpu_available() else [False])
@pytest.mark.parametrize("input_on_gpu", [True, False] if is_gpu_available() else [False])
@pytest.mark.gpu
def test_segmentation_to_one_hot(use_gpu: bool, input_on_gpu: bool) -> None:
    # Settings to test on large scale:
    # B = 16
    # C = 2
    # dim = (50, 400, 400)
    B = 2
    C = 3
    dim = (4, 1, 2)
    input_size = (B, C) + dim
    actual_class = 5
    # This is deliberately replicated from get_datatype_for_image_tensors
    dtype = torch.float16 if is_gpu_available() else torch.float32
    device = "cuda" if input_on_gpu else "cpu"
Exemplo n.º 9
0
#  ------------------------------------------------------------------------------------------
from typing import Any, List

import pytest
import torch
from torch import Tensor

from InnerEye.Common import common_util
from InnerEye.ML.models.architectures.base_model import BaseModel, CropSizeConstraints
from InnerEye.ML.models.losses.soft_dice import SoftDiceLoss
from InnerEye.ML.models.parallel.data_parallel import DataParallelCriterion
from InnerEye.ML.models.parallel.model_parallel import group_layers_with_balanced_memory, \
    move_to_device, partition_layers
from InnerEye.ML.utils.ml_util import is_gpu_available, set_random_seed

# Flags describing the GPU setup of the machine, evaluated once when the module is imported.
no_gpu = not is_gpu_available()
# True unless CUDA is available and reports more than one device.
no_or_single_gpu = not (torch.cuda.is_available() and torch.cuda.device_count() > 1)


class SimpleModel(BaseModel):
    """
    A simple neural network model to test model parallelisation functions. 
    """
    def __init__(self, input_channels: Any, channels: Any, n_classes: int,
                 kernel_size: int):
        # minimum crop size: Network first reduces size by 4, then halves, then multiplies by 2 and adds 1
        # 64 -> 62 -> 30 -> 61 -> 61
        super().__init__(
            name='SimpleModel',
            input_channels=input_channels,
Exemplo n.º 10
0
from InnerEye.Common.type_annotations import PathOrString, TupleInt3
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.dataset.full_image_dataset import PatientDatasetSource
from InnerEye.ML.dataset.sample import PatientMetadata, Sample
from InnerEye.ML.photometric_normalization import PhotometricNormalization
from InnerEye.ML.utils import io_util
from InnerEye.ML.utils.config_util import ModelConfigLoader
from InnerEye.ML.utils.io_util import ImageHeader, ImageWithHeader
from InnerEye.ML.utils.ml_util import is_gpu_available
from Tests.fixed_paths_for_tests import full_ml_test_data_path

# Identifiers for image channels, mask and ground truth that are shared by the tests.
# NOTE(review): presumably these match the names used in the test dataset files - confirm.
TEST_CHANNEL_IDS = ["channel1", "channel2"]
TEST_MASK_ID = "mask"
TEST_GT_ID = "region"

# GPU availability is queried once, when this module is imported.
machine_has_gpu = is_gpu_available()
no_gpu_available = not machine_has_gpu


def create_dataset_csv_file(csv_string: str, dst: str) -> Path:
    """
    Writes csv_string to a file "dataset.csv" inside of the folder dst.

    :param csv_string: The text to write to the file.
    :param dst: The folder in which the file is created.
    :return: The folder dst, as a Path object.
    """
    folder = Path(dst)
    csv_file = folder / "dataset.csv"
    csv_file.write_text(csv_string)
    return folder


def content_mismatch(actual: Any, expected: Any) -> str:
    """
    Builds the error message for a content mismatch.

    :param actual: The value that was found.
    :param expected: The value that should have been found.
    :return: A message that shows both values.
    """
    template = "Content mismatch. \nActual:\n {}\nExpected:\n {}"
    return template.format(actual, expected)

Exemplo n.º 11
0
    with mock.patch("InnerEye.ML.run_ml.is_offline_run_context",
                    return_value=True):
        with mock.patch(
                'InnerEye.ML.utils.io_util.load_image_in_known_formats',
                return_value=image_and_seg):
            azure_config = get_default_azure_config()
            azure_config.train = True
            MLRunner(config, azure_config=azure_config).run()
            # No further asserts here because the models are still in experimental state. Most errors would come
            # from having invalid model architectures, which would throw runtime errors during training.
            # Verified manually that the cross entropy on the Val set that appears during training, and the
            # cross entropy when running on the Val set in test mode are the same.


@pytest.mark.parametrize("use_gpu",
                         [True, False] if is_gpu_available() else [False])
@pytest.mark.parametrize("input_on_gpu",
                         [True, False] if is_gpu_available() else [False])
@pytest.mark.gpu
def test_segmentation_to_one_hot(use_gpu: bool, input_on_gpu: bool) -> None:
    # Settings to test on large scale:
    # B = 16
    # C = 2
    # dim = (50, 400, 400)
    B = 2
    C = 3
    dim = (4, 1, 2)
    input_size = (B, C) + dim
    actual_class = 5
    # This is deliberately replicated from get_datatype_for_image_tensors
    dtype = torch.float16 if is_gpu_available() else torch.float32