Example #1
    @classmethod
    def _adjust_for_gpus(
            cls, model: DeviceAwareModule, config: ModelConfigBase,
            model_execution_mode: ModelExecutionMode) -> DeviceAwareModule:
        """
        Updates a torch model so that input mini-batches are parallelized across the batch dimension to utilise
        multiple gpus. If model parallel is set to True and execution is in test mode, then model is partitioned to
        perform full volume inference.
        This assumes the model has been created, that the optimizer has not yet been created, and the the model has not
        been adjusted twice. This method should not be called externally. Use instead adjust_model_for_gpus
        or adjust_mean_teacher_model_for_gpus
        :returns Adjusted model
        """
        if config.use_gpu:
            model = model.cuda()
            logging.info(
                "Adjusting the model to use mixed precision training.")
            # If model parallel is set to True, then partition the network across all available gpus.
            if config.use_model_parallel:
                devices = config.get_cuda_devices()
                assert devices is not None  # for mypy
                model.partition_model(devices=devices)  # type: ignore
        else:
            logging.info(
                "Making no adjustments to the model because no GPU was found.")

        # Update model-related config attributes (after model parallel has been activated).
        config.adjust_after_mixed_precision_and_parallel(model)

        # DataParallel enables running the model with multiple gpus by splitting samples across GPUs
        # If the model is used in training mode, data parallel is activated by default.
        # Similarly, if model parallel is not activated, data parallel is used as a backup option
        use_data_parallel = (model_execution_mode == ModelExecutionMode.TRAIN
                             or not config.use_model_parallel)
        if config.use_gpu and use_data_parallel:
            logging.info("Adjusting the model to use DataParallel")
            # Move all layers to the default GPU before activating data parallel.
            # This needs to happen even though we moved the model to the GPU at the beginning of the method,
            # because we may have spread it across multiple GPUs later.
            model = model.cuda()
            model = DataParallelModel(model,
                                      device_ids=config.get_cuda_devices())

        return model
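For context, here is a minimal runnable sketch of the same batch-splitting idea in plain PyTorch. DataParallelModel in these examples appears to wrap torch.nn.DataParallel; the toy model below is an assumption for illustration only, not part of the example code.

import torch
from torch import nn

# Toy stand-in for the real model; purely illustrative.
model = nn.Linear(16, 2)
if torch.cuda.is_available():
    model = model.cuda()
    # Split each input mini-batch across all visible GPUs along dim 0.
    model = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
output = model(torch.randn(8, 16))  # replicas run in parallel, outputs are gathered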
Example #2
    def set_data_parallel(self, device_ids: Optional[List[Any]]) -> None:
        assert self.model is not None
        self.model = DataParallelModel(self.model, device_ids=device_ids)
Example #3
    def set_data_parallel(self, device_ids: Optional[List[Any]]) -> None:
        if self._model is None:
            raise ValueError(
                "Model must be created before it can be moved to DataParallel.")
        self._model = DataParallelModel(self._model, device_ids=device_ids)
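The two set_data_parallel variants above differ only in how they guard against a missing model: Example #2 uses an assert, which is stripped when Python runs with -O, while Example #3 raises a ValueError that survives optimized runs and carries a clearer message. The latter is generally the safer choice for a method that may be called from external code.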
Example #4
class ModelAndInfo:
    """
    This class contains the model and optional associated information, as well as methods to create
    models and optimizers, move these to GPU and load state from checkpoints. Attributes are:
      config: the model configuration information
      model: the model created based on the config
      optimizer: the optimizer created based on the config and associated with the model
      checkpoint_path: the path to load a checkpoint from; can be None
      is_mean_teacher: whether this is (intended to be) a mean teacher model
      is_adjusted: whether model adjustments (which cannot be done twice) have been applied
      checkpoint_epoch: the training epoch at which this model's checkpoint was saved, if loaded from disk
      model_execution_mode: mode this model will be run in
    """
    def __init__(self,
                 config: ModelConfigBase,
                 model_execution_mode: ModelExecutionMode,
                 is_mean_teacher: bool = False,
                 checkpoint_path: Optional[Path] = None):
        """
        :param config: the model configuration information
        :param model_execution_mode: mode this model will be run in
        :param is_mean_teacher: whether this is (intended to be) a mean teacher model
        :param checkpoint_path: the path to load a checkpoint from; can be None
        """
        self.config = config
        self.is_mean_teacher = is_mean_teacher
        self.checkpoint_path = checkpoint_path
        self.model_execution_mode = model_execution_mode

        self._model = None
        self._optimizer = None
        self.checkpoint_epoch = None
        self.is_adjusted = False

    @property
    def model(self) -> DeviceAwareModule:
        if not self._model:
            raise ValueError("Model has not been created.")
        return self._model

    @property
    def optimizer(self) -> Optimizer:
        if not self._optimizer:
            raise ValueError("Optimizer has not been created.")
        return self._optimizer

    def to_cuda(self) -> None:
        """
        Moves the model to GPU
        """
        if self._model is None:
            raise ValueError(
                "Model must be created before it can be moved to GPU.")
        self._model = self._model.cuda()

    def set_data_parallel(self, device_ids: Optional[List[Any]]) -> None:
        if self._model is None:
            raise ValueError(
                "Model must be created before it can be moved to Data Parellel."
            )
        self._model = DataParallelModel(self._model, device_ids=device_ids)

    def create_model(self) -> None:
        """
        Creates a model (with temperature scaling) according to the config given.
        """
        self._model = create_model_with_temperature_scaling(self.config)

    def try_load_checkpoint_for_model(self) -> bool:
        """
        Loads a checkpoint of a model. The provided model checkpoint must match the stored model.
        :return: True if the checkpoint exists and was loaded, False otherwise.
        """
        if self._model is None:
            raise ValueError(
                "Model must be created before it can be adjusted.")

        if not self.checkpoint_path:
            raise ValueError("No checkpoint provided")

        if not self.checkpoint_path.is_file():
            logging.warning(f"No checkpoint found at {self.checkpoint_path}; "
                            f"current working dir is {os.getcwd()}")
            return False

        logging.info(f"Loading checkpoint {self.checkpoint_path}")
        # For model debugging, allow loading a GPU trained model onto the CPU. This will clearly only work
        # if the model is small.
        map_location = None if self.config.use_gpu else 'cpu'
        checkpoint = torch.load(str(self.checkpoint_path),
                                map_location=map_location)

        if isinstance(self._model, torch.nn.DataParallel):
            self._model.module.load_state_dict(checkpoint['state_dict'])
        else:
            self._model.load_state_dict(checkpoint['state_dict'])

        logging.info(
            f"Loaded model from checkpoint (epoch: {checkpoint['epoch']})")
        self.checkpoint_epoch = checkpoint['epoch']
        return True

    def adjust_model_for_gpus(self) -> None:
        """
        Updates the torch model so that input mini-batches are parallelized across the batch dimension to utilise
        multiple GPUs. If model parallel is set to True and execution is in test mode, the model is partitioned to
        perform full volume inference.
        """
        if self._model is None:
            raise ValueError(
                "Model must be created before it can be adjusted.")

        # Applying the adjustments twice would cause errors, so flag repeated calls.
        if self.is_adjusted:
            logging.debug("model_and_info.is_adjusted is already True")

        if self._optimizer:
            raise ValueError(
                "Create an optimizer only after creating and adjusting the model."
            )

        if self.config.use_gpu:
            self.to_cuda()
            logging.info(
                "Adjusting the model to use mixed precision training.")
            # If model parallel is set to True, then partition the network across all available gpus.
            if self.config.use_model_parallel:
                devices = self.config.get_cuda_devices()
                assert devices is not None  # for mypy
                self._model.partition_model(devices=devices)  # type: ignore
        else:
            logging.info(
                "Making no adjustments to the model because no GPU was found.")

        # Update model-related config attributes (after model parallel has been activated).
        self.config.adjust_after_mixed_precision_and_parallel(self._model)

        # DataParallel enables running the model with multiple gpus by splitting samples across GPUs
        # If the model is used in training mode, data parallel is activated by default.
        # Similarly, if model parallel is not activated, data parallel is used as a backup option
        use_data_parallel = (self.model_execution_mode == ModelExecutionMode.TRAIN
                             or not self.config.use_model_parallel)
        if self.config.use_gpu and use_data_parallel:
            logging.info("Adjusting the model to use DataParallel")
            # Move all layers to the default GPU before activating data parallel.
            # This needs to happen even though we moved the model to the GPU at the beginning of the method,
            # because we may have spread it across multiple GPUs later.
            self.to_cuda()
            self.set_data_parallel(device_ids=self.config.get_cuda_devices())

        self.is_adjusted = True
        logging.debug("model_and_info.is_adjusted set to True")

    def create_summary_and_adjust_model_for_gpus(self) -> None:
        """
        Generates the model summary, which is required for model partitioning across GPUs, and then moves the model to
        GPU with data parallel/model parallel by calling adjust_model_for_gpus.
        """
        if self._model is None:
            raise ValueError(
                "Model must be created before it can be adjusted.")

        if self.config.is_segmentation_model:
            summary_for_segmentation_models(self.config, self._model)
        # Prepare for mixed precision training and data parallelization (no-op if already done).
        # This relies on the information generated in the model summary.
        self.adjust_model_for_gpus()

    def try_create_model_and_load_from_checkpoint(self) -> bool:
        """
        Creates a model as per the config, and loads the parameters from the given checkpoint path.
        Also updates the checkpoint_epoch.
        :return: True if the checkpoint exists and was loaded, False otherwise.
        """
        self.create_model()

        # for mypy
        assert self._model

        if self.checkpoint_path:
            # Load the stored model. If there is no checkpoint present, return immediately.
            return self.try_load_checkpoint_for_model()
        return True

    def try_create_model_load_from_checkpoint_and_adjust(self) -> bool:
        """
        Creates a model as per the config, and loads the parameters from the given checkpoint path.
        The model is then adjusted for data parallelism and mixed precision, running in TEST mode.
        Also updates the checkpoint_epoch.
        :return: True if the checkpoint exists and was loaded, False otherwise.
        """
        success = self.try_create_model_and_load_from_checkpoint()
        self.create_summary_and_adjust_model_for_gpus()
        return success

    def create_optimizer(self) -> None:
        """
        Creates a torch optimizer for the given model, and stores it as an instance variable in the current object.
        """
        # Make sure model is created before we create optimizer
        if self._model is None:
            raise ValueError(
                "Model checkpoint must be created before optimizer checkpoint can be loaded."
            )

        # Select optimizer type
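        # Note: lr, betas/momentum/alpha, eps and weight_decay are passed positionally
        # below, matching the parameter order of the respective torch.optim constructors.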
        if self.config.optimizer_type in [
                OptimizerType.Adam, OptimizerType.AMSGrad
        ]:
            self._optimizer = torch.optim.Adam(
                self._model.parameters(),
                self.config.l_rate,
                self.config.adam_betas,
                self.config.opt_eps,
                self.config.weight_decay,
                amsgrad=self.config.optimizer_type == OptimizerType.AMSGrad)
        elif self.config.optimizer_type == OptimizerType.SGD:
            self._optimizer = torch.optim.SGD(
                self._model.parameters(),
                self.config.l_rate,
                self.config.momentum,
                weight_decay=self.config.weight_decay)
        elif self.config.optimizer_type == OptimizerType.RMSprop:
            self._optimizer = RMSprop(self._model.parameters(),
                                      self.config.l_rate,
                                      self.config.rms_alpha,
                                      self.config.opt_eps,
                                      self.config.weight_decay,
                                      self.config.momentum)
        else:
            raise NotImplementedError(
                f"Optimizer type {self.config.optimizer_type.value} is not implemented"
            )

    def try_load_checkpoint_for_optimizer(self) -> bool:
        """
        Loads a checkpoint of an optimizer.
        :return: True if the checkpoint exists and the optimizer state was loaded, False otherwise.
        """

        if self._optimizer is None:
            raise ValueError(
                "Optimizer must be created before optimizer checkpoint can be loaded."
            )

        if not self.checkpoint_path:
            logging.warning("No checkpoint path provided.")
            return False

        if not self.checkpoint_path.is_file():
            logging.warning(f"No checkpoint found at {self.checkpoint_path}; "
                            f"current working dir is {os.getcwd()}")
            return False

        logging.info(f"Loading checkpoint {self.checkpoint_path}")
        # For model debugging, allow loading a GPU trained model onto the CPU. This will clearly only work
        # if the model is small.
        map_location = None if self.config.use_gpu else 'cpu'
        checkpoint = torch.load(str(self.checkpoint_path),
                                map_location=map_location)

        if self._optimizer:
            self._optimizer.load_state_dict(checkpoint['opt_dict'])

        logging.info(
            f"Loaded optimizer from checkpoint (epoch: {checkpoint['epoch']})")
        self.checkpoint_epoch = checkpoint['epoch']
        return True

    def try_create_optimizer_and_load_from_checkpoint(self) -> bool:
        """
        Creates an optimizer and loads its state from a checkpoint.
        :return: True if the checkpoint exists and the optimizer state was loaded, False otherwise.
        """
        self.create_optimizer()
        if self.checkpoint_path:
            return self.try_load_checkpoint_for_optimizer()
        return True
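A hypothetical end-to-end use of ModelAndInfo, sketching the intended call order. MySegmentationConfig and the checkpoint path are stand-ins for illustration, not part of the examples above.

from pathlib import Path

# MySegmentationConfig is a hypothetical ModelConfigBase subclass.
model_and_info = ModelAndInfo(config=MySegmentationConfig(),
                              model_execution_mode=ModelExecutionMode.TEST,
                              checkpoint_path=Path("checkpoints/best.pth.tar"))  # assumed path

# Create the model, restore its weights, and apply the one-time GPU adjustments.
if model_and_info.try_create_model_load_from_checkpoint_and_adjust():
    logging.info(f"Restored weights from epoch {model_and_info.checkpoint_epoch}")

# The optimizer must only be created after the model has been adjusted.
model_and_info.try_create_optimizer_and_load_from_checkpoint()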