def _map_devices_to_accelerator(self, accelerator: str) -> bool: if self.devices is None: return False if accelerator == _AcceleratorType.TPU and _TPU_AVAILABLE: if self.devices == "auto": self.devices = TPUAccelerator.auto_device_count() self.tpu_cores = device_parser.parse_tpu_cores(self.devices) return True if accelerator == _AcceleratorType.IPU and _IPU_AVAILABLE: if self.devices == "auto": self.devices = IPUAccelerator.auto_device_count() self.ipus = self.devices return True if accelerator == _AcceleratorType.GPU and torch.cuda.is_available(): if self.devices == "auto": self.devices = GPUAccelerator.auto_device_count() self.gpus = self.devices self.parallel_device_ids = device_parser.parse_gpu_ids( self.devices) return True if accelerator == _AcceleratorType.CPU: if self.devices == "auto": self.devices = CPUAccelerator.auto_device_count() if not isinstance(self.devices, int): raise MisconfigurationException( "The flag `devices` must be an int with `accelerator='cpu'`," f" got `devices={self.devices}` instead.") self.num_processes = self.devices return True return False
def _parse_tpu_device_details(self, tpu_cores):
    """Parse the ``tpu_cores`` flag and push the resulting TPU state onto the trainer.

    Sets ``trainer.tpu_cores``, ``trainer.tpu_id`` and resets the TPU rank
    flags; switches the trainer's device type / backend to TPU when cores
    were requested.

    Raises:
        MisconfigurationException: if TPU cores were requested but no TPU
            is available in the environment.
    """
    trainer = self.trainer
    trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores)

    if trainer.tpu_cores is not None:
        if not _TPU_AVAILABLE:
            raise MisconfigurationException(
                f"You have requested {trainer.tpu_cores} TPU cores but none is available."
            )
        trainer._device_type = DeviceType.TPU
        trainer.distributed_backend = "tpu"

    # A list of cores means "pin to this single core"; anything else has no id.
    if isinstance(trainer.tpu_cores, list):
        trainer.tpu_id = trainer.tpu_cores[0]
    else:
        trainer.tpu_id = None

    # tpu state flags
    trainer.tpu_local_core_rank = None
    trainer.tpu_global_core_rank = None
def _map_devices_to_accelerator(self, accelerator: str) -> bool: if self.devices is None: return False if accelerator == DeviceType.TPU and _TPU_AVAILABLE: self.tpu_cores = device_parser.parse_tpu_cores(self.devices) return True if accelerator == DeviceType.IPU and _IPU_AVAILABLE: self.ipus = self.devices return True if accelerator == DeviceType.GPU and torch.cuda.is_available(): self.gpus = self.devices self.parallel_device_ids = device_parser.parse_gpu_ids( self.devices) return True if accelerator == DeviceType.CPU: if not isinstance(self.devices, int): raise MisconfigurationException( "The flag `devices` only supports integer for `accelerator='cpu'`," f" got `devices={self.devices}` instead.") self.num_processes = self.devices return True return False
def __init__(
    self,
    num_processes,
    tpu_cores,
    ipus,
    distributed_backend,
    auto_select_gpus,
    gpus,
    num_nodes,
    sync_batchnorm,
    benchmark,
    replace_sampler_ddp,
    deterministic,
    precision,
    amp_type,
    amp_level,
    plugins,
):
    """Store the Trainer's hardware/distributed flags and resolve them into
    an accelerator, plugins and global cudnn settings.

    Statement order matters below: ``set_distributed_mode`` /
    ``configure_slurm_ddp`` / ``handle_given_plugins`` must run before
    ``select_accelerator``, which consumes the state they set up.
    """
    # initialization
    self._device_type = DeviceType.CPU
    self._distrib_type = None
    self.num_processes = num_processes
    # Normalize the raw tpu_cores flag (int / str / list) immediately.
    self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores)
    self.ipus = ipus
    self.distributed_backend = distributed_backend
    self.auto_select_gpus = auto_select_gpus
    self.gpus = gpus
    self.num_nodes = num_nodes
    self.sync_batchnorm = sync_batchnorm
    self.benchmark = benchmark
    self.replace_sampler_ddp = replace_sampler_ddp
    self.deterministic = deterministic
    self.precision = precision
    # amp_type is lowercased when given as a string; any non-string becomes None.
    self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None
    self.amp_level = amp_level
    self.is_slurm_managing_tasks = False
    self._precision_plugin: Optional[PrecisionPlugin] = None
    self._training_type_plugin: Optional[TrainingTypePlugin] = None
    self._cluster_environment: Optional[ClusterEnvironment] = None
    plugins = plugins if plugins is not None else []
    # str is itself a Sequence, so the string case must be boxed first,
    # otherwise the Sequence check below would let a bare string through.
    if isinstance(plugins, str):
        plugins = [plugins]
    if not isinstance(plugins, Sequence):
        plugins = [plugins]
    self.plugins = plugins
    # for gpus allow int, string and gpu list
    if auto_select_gpus and isinstance(gpus, int):
        self.gpus = pick_multiple_gpus(gpus)
    self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
    self.set_distributed_mode()
    self.configure_slurm_ddp()
    self.handle_given_plugins()
    self._training_type_plugin_resolved = False
    self.accelerator = self.select_accelerator()
    # override dist backend when using tpus
    if self.on_tpu:
        self.distributed_backend = "tpu"
    # init flags for SLURM+DDP to work
    self.world_size = 1
    self.interactive_ddp_procs = []
    self.global_rank = 0
    # benchmarking
    # TODO: should this be moved to GPU accelerator?
    torch.backends.cudnn.benchmark = self.benchmark
    # determinism for cudnn
    # TODO: should this be moved to GPU accelerator?
    torch.backends.cudnn.deterministic = deterministic
    if deterministic:
        # fixing non-deterministic part of horovod
        # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
        os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
    # NOTE(review): replace_sampler_ddp was already assigned above — this
    # second assignment is redundant but harmless.
    self.replace_sampler_ddp = replace_sampler_ddp
def on_trainer_init(
    self,
    num_processes,
    tpu_cores,
    accelerator,
    distributed_backend,
    auto_select_gpus,
    gpus,
    num_nodes,
    log_gpu_memory,
    sync_batchnorm,
    benchmark,
    replace_sampler_ddp,
    deterministic,
):
    """Transfer the hardware/distributed Trainer flags onto ``self.trainer``
    and initialize distributed/SLURM/TPU/GPU state.

    Mutates ``self.trainer`` extensively; the call order (parse devices →
    set_distributed_mode → SLURM linkage → NVIDIA flags) is significant.
    """
    # temp until we remove all dist backend references
    distributed_backend = self._map_deprecated_dist_backend(
        accelerator, distributed_backend)
    self.trainer.deterministic = deterministic
    torch.backends.cudnn.deterministic = self.trainer.deterministic
    if self.trainer.deterministic:
        # fixing non-deterministic part of horovod
        # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
        os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
    # distributed backend choice
    self.trainer.distributed_backend = distributed_backend.lower(
    ) if distributed_backend else None
    # init the default rank if exists
    # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks
    # this way we only show it on rank 0
    if 'LOCAL_RANK' in os.environ:
        rank_zero_only.rank = int(os.environ['LOCAL_RANK'])
    # benchmarking
    self.trainer.benchmark = benchmark
    torch.backends.cudnn.benchmark = self.trainer.benchmark
    # Transfer params
    self.trainer.num_nodes = num_nodes
    self.trainer.log_gpu_memory = log_gpu_memory
    # sync-bn backend
    self.trainer.sync_batchnorm = sync_batchnorm
    self.trainer.tpu_cores = device_parser.parse_tpu_cores(tpu_cores)
    self.trainer.on_tpu = self.trainer.tpu_cores is not None
    # A list of cores means "pin to this single core"; otherwise no id.
    self.trainer.tpu_id = self.trainer.tpu_cores[0] if isinstance(
        self.trainer.tpu_cores, list) else None
    if num_processes != 1 and distributed_backend != "ddp_cpu":
        rank_zero_warn(
            "num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it."
        )
    self.trainer.num_processes = num_processes
    # override with environment flag
    gpus = os.environ.get('PL_TRAINER_GPUS', gpus)
    self.trainer.gpus = gpus
    # for gpus allow int, string and gpu list
    if auto_select_gpus and isinstance(gpus, int):
        self.trainer.gpus = self.trainer.tuner.pick_multiple_gpus(gpus)
    self.trainer.data_parallel_device_ids = device_parser.parse_gpu_ids(
        self.trainer.gpus)
    self.trainer.root_gpu = device_parser.determine_root_gpu_device(
        self.trainer.data_parallel_device_ids)
    # root_device starts on CPU; presumably updated later once the
    # accelerator is selected — not visible from this block.
    self.trainer.root_device = torch.device("cpu")
    self.trainer.on_gpu = True if (
        self.trainer.data_parallel_device_ids
        and torch.cuda.is_available()) else False
    # tpu state flags
    self.trainer.use_tpu = False
    self.trainer.tpu_local_core_rank = None
    self.trainer.tpu_global_core_rank = None
    # distributed backend choice
    self.set_distributed_mode()
    # override dist backend when using tpus
    if self.trainer.on_tpu:
        self.trainer.distributed_backend = "tpu"
        self.trainer.use_tpu = True
    # init flags for SLURM+DDP to work
    self.trainer.world_size = 1
    self.trainer.interactive_ddp_procs = []
    # link up SLURM
    # TODO: this should be taken out of here... but depends too much on DDP
    self.trainer.slurm_connector.on_trainer_init(self.trainer.num_nodes)
    self.trainer.node_rank = self.determine_ddp_node_rank()
    self.trainer.local_rank = self.determine_local_rank()
    self.trainer.global_rank = 0
    # NVIDIA setup
    self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks,
                          self.trainer.data_parallel_device_ids)
    self.trainer.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv(
        'KAGGLE_URL_BASE')
    self.trainer.replace_sampler_ddp = replace_sampler_ddp
def parse_devices(
        devices: Union[int, str, List[int]]) -> Optional[Union[int, List[int]]]:
    """Accelerator device parsing logic.

    Delegates to ``device_parser.parse_tpu_cores``, which accepts an int,
    a string or a list of ints and returns the normalized core spec
    (or ``None`` when nothing was requested).
    """
    parsed_cores = device_parser.parse_tpu_cores(devices)
    return parsed_cores
def __init__(
    self,
    num_processes,
    tpu_cores,
    distributed_backend,
    auto_select_gpus,
    gpus,
    num_nodes,
    sync_batchnorm,
    benchmark,
    replace_sampler_ddp,
    deterministic,
    precision,
    amp_type,
    amp_level,
    cluster_environment,
):
    """Store the Trainer's hardware/distributed flags, resolve them into an
    accelerator, and apply global cudnn settings.

    Statement order matters: GPU parsing → ``set_distributed_mode`` →
    ``configure_slurm_ddp`` → ``select_accelerator`` build on each other.
    """
    # initialization
    self._device_type = DeviceType.CPU
    self._distrib_type = None
    self.num_processes = num_processes
    # Normalize the raw tpu_cores flag (int / str / list) immediately.
    self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores)
    self.distributed_backend = distributed_backend
    self.auto_select_gpus = auto_select_gpus
    self.gpus = gpus
    self.num_nodes = num_nodes
    self.sync_batchnorm = sync_batchnorm
    self.benchmark = benchmark
    self.replace_sampler_ddp = replace_sampler_ddp
    self.deterministic = deterministic
    self.precision = precision
    # amp_type is lowercased when given as a string; any non-string becomes None.
    self.amp_type = amp_type.lower() if isinstance(amp_type, str) else None
    self.amp_level = amp_level
    self.cluster_environment = cluster_environment
    self.is_slurm_managing_tasks = False
    # init the default rank if exists
    # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks
    # this way we only show it on rank 0
    if "LOCAL_RANK" in os.environ:
        rank_zero_only.rank = int(os.environ["LOCAL_RANK"])
    # for gpus allow int, string and gpu list
    if auto_select_gpus and isinstance(gpus, int):
        self.gpus = pick_multiple_gpus(gpus)
    self.parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
    self.root_gpu = device_parser.determine_root_gpu_device(
        self.parallel_device_ids)
    self.set_distributed_mode()
    self.configure_slurm_ddp()
    self.accelerator = self.select_accelerator()
    # override dist backend when using tpus
    if self.on_tpu:
        self.distributed_backend = "tpu"
        self.use_tpu = True
    # init flags for SLURM+DDP to work
    self.world_size = 1
    self.interactive_ddp_procs = []
    self.global_rank = 0
    # NVIDIA setup
    # self.set_nvidia_flags(self.trainer.is_slurm_managing_tasks, self.trainer.data_parallel_device_ids)
    # benchmarking
    # TODO: should this be moved to GPU accelerator?
    torch.backends.cudnn.benchmark = self.benchmark
    # determinism for cudnn
    # TODO: should this be moved to GPU accelerator?
    torch.backends.cudnn.deterministic = deterministic
    if deterministic:
        # fixing non-deterministic part of horovod
        # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
        os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
    # TODO: move this to TPU accelerator/plugin
    self.on_colab_kaggle = os.getenv("COLAB_GPU") or os.getenv(
        "KAGGLE_URL_BASE")
    # NOTE(review): replace_sampler_ddp was already assigned above — this
    # second assignment is redundant but harmless.
    self.replace_sampler_ddp = replace_sampler_ddp
def __init__(
    self,
    logger: Union[LightningLoggerBase, Iterable[LightningLoggerBase],
                  bool] = True,
    checkpoint_callback: Union[ModelCheckpoint, bool] = True,
    early_stop_callback: Optional[Union[EarlyStopping, bool]] = False,
    callbacks: Optional[List[Callback]] = None,
    default_root_dir: Optional[str] = None,
    gradient_clip_val: float = 0,
    process_position: int = 0,
    num_nodes: int = 1,
    num_processes: int = 1,
    gpus: Optional[Union[List[int], str, int]] = None,
    auto_select_gpus: bool = False,
    tpu_cores: Optional[Union[List[int], str, int]] = None,
    log_gpu_memory: Optional[str] = None,
    progress_bar_refresh_rate: int = 1,
    overfit_batches: Union[int, float] = 0.0,
    track_grad_norm: Union[int, float, str] = -1,
    check_val_every_n_epoch: int = 1,
    fast_dev_run: bool = False,
    accumulate_grad_batches: Union[int, Dict[int, int], List[list]] = 1,
    max_epochs: int = 1000,
    min_epochs: int = 1,
    max_steps: Optional[int] = None,
    min_steps: Optional[int] = None,
    limit_train_batches: Union[int, float] = 1.0,
    limit_val_batches: Union[int, float] = 1.0,
    limit_test_batches: Union[int, float] = 1.0,
    val_check_interval: Union[int, float] = 1.0,
    log_save_interval: int = 100,
    row_log_interval: int = 50,
    distributed_backend: Optional[str] = None,
    sync_batchnorm: bool = False,
    precision: int = 32,
    weights_summary: Optional[str] = ModelSummary.MODE_DEFAULT,
    weights_save_path: Optional[str] = None,
    num_sanity_val_steps: int = 2,
    truncated_bptt_steps: Optional[int] = None,
    resume_from_checkpoint: Optional[str] = None,
    profiler: Optional[Union[BaseProfiler, bool]] = None,
    benchmark: bool = False,
    deterministic: bool = False,
    reload_dataloaders_every_epoch: bool = False,
    auto_lr_find: Union[bool, str] = False,
    replace_sampler_ddp: bool = True,
    terminate_on_nan: bool = False,
    auto_scale_batch_size: Union[str, bool] = False,
    prepare_data_per_node: bool = True,
    amp_backend: str = 'native',
    amp_level: str = 'O2',  # backward compatible, todo: remove in v1.0.0
    val_percent_check: float = None,  # backward compatible, todo: remove in v0.10.0
    test_percent_check: float = None,  # backward compatible, todo: remove in v0.10.0
    train_percent_check: float = None,  # backward compatible, todo: remove in v0.10.0
    overfit_pct: float = None,  # backward compatible, todo: remove in v1.0.0
):
    """Initialize the Trainer: connectors, loops, bookkeeping state,
    callbacks, device/distributed setup, batch limits and AMP.

    The initialization order below is deliberate (e.g. connectors before
    callbacks, device parsing before distributed-mode selection, deprecated
    ``*_percent_check`` remapping before ``_determine_batch_limits``);
    do not reorder without checking the downstream consumers.
    """
    super().__init__()
    self.deterministic = deterministic
    torch.backends.cudnn.deterministic = self.deterministic
    if self.deterministic:
        # fixing non-deterministic part of horovod
        # https://github.com/PyTorchLightning/pytorch-lightning/pull/1572/files#r420279383
        os.environ["HOROVOD_FUSION_THRESHOLD"] = str(0)
    # init the default rank if exists
    # we need to call this here or NVIDIA flags and other messaging in init will show on all ranks
    # this way we only show it on rank 0
    if 'LOCAL_RANK' in os.environ:
        rank_zero_only.rank = int(os.environ['LOCAL_RANK'])
    # tracks internal state for debugging
    self.dev_debugger = InternalDebugger(self)
    self.config_validator = ConfigValidator(self)
    self.data_connector = DataConnector(self)
    self.lr_scheduler_connector = LRSchedulerConnector(self)
    self.accelerator_connector = AcceleratorConnector(self)
    self.logger_connector = LoggerConnector(self)
    self.model_connector = ModelConnector(self)
    self.initializer = Initializer(self)
    self.tuner = Tuner(self)
    self.accelerator_backend = None
    # loops
    self.evaluation_loop = EvaluationLoop(self)
    self.train_loop = TrainLoop(self)
    # training bookeeping
    self.total_batch_idx = 0
    self.running_loss = TensorRunningAccum(window_length=20)
    self.batch_idx = 0
    self.num_training_batches = 0
    self.num_val_batches = []
    self.num_sanity_val_batches = []
    self.num_test_batches = []
    self.train_dataloader = None
    self.test_dataloaders = None
    self.val_dataloaders = None
    # when true, prints test results
    self.verbose_test = True
    # when .test() is called, it sets this
    self.tested_ckpt_path = None
    # training state
    self.model = None
    self.datamodule = None
    self.testing = False
    self.prepare_data_per_node = prepare_data_per_node
    self.lr_schedulers = []
    self.optimizers = None
    self.optimizer_frequencies = []
    self.global_step = 0
    self.current_epoch = 0
    self.interrupted = False
    self.should_stop = False
    self.running_sanity_check = False
    self._state = TrainerState.INITIALIZING
    # Fall back to the working directory / root dir when paths are not given.
    self._default_root_dir = default_root_dir or os.getcwd()
    self._weights_save_path = weights_save_path or self._default_root_dir
    # init callbacks
    self.callbacks = callbacks or []
    # configure early stop callback
    # creates a default one if none passed in
    early_stop_callback = self.configure_early_stopping(
        early_stop_callback)
    if early_stop_callback:
        self.callbacks.append(early_stop_callback)
    # configure checkpoint callback
    # it is important that this is the last callback to run
    # pass through the required args to figure out defaults
    checkpoint_callback = self.configure_checkpoint_callback(
        checkpoint_callback)
    if checkpoint_callback:
        self.callbacks.append(checkpoint_callback)
    # TODO refactor codebase (tests) to not directly reach into these callbacks
    self.checkpoint_callback = checkpoint_callback
    self.early_stop_callback = early_stop_callback
    self.on_init_start()
    # benchmarking
    self.benchmark = benchmark
    torch.backends.cudnn.benchmark = self.benchmark
    # Transfer params
    self.num_nodes = num_nodes
    self.log_gpu_memory = log_gpu_memory
    # sync-bn backend
    self.sync_batchnorm = sync_batchnorm
    self.gradient_clip_val = gradient_clip_val
    self.check_val_every_n_epoch = check_val_every_n_epoch
    if not isinstance(track_grad_norm,
                      (int, float)) and track_grad_norm != 'inf':
        raise MisconfigurationException(
            "track_grad_norm can be an int, a float or 'inf' (infinity norm)."
        )
    self.track_grad_norm = float(track_grad_norm)
    self.tpu_cores = device_parser.parse_tpu_cores(tpu_cores)
    self.on_tpu = self.tpu_cores is not None
    # A list of cores means "pin to this single core"; otherwise no id.
    self.tpu_id = self.tpu_cores[0] if isinstance(self.tpu_cores,
                                                  list) else None
    if num_processes != 1 and distributed_backend != "ddp_cpu":
        rank_zero_warn(
            "num_processes is only used for distributed_backend=\"ddp_cpu\". Ignoring it."
        )
    self.num_processes = num_processes
    self.weights_summary = weights_summary
    self.max_epochs = max_epochs
    self.min_epochs = min_epochs
    self.max_steps = max_steps
    self.min_steps = min_steps
    # -1 means "run sanity validation on the full val set".
    if num_sanity_val_steps == -1:
        self.num_sanity_val_steps = float('inf')
    else:
        self.num_sanity_val_steps = num_sanity_val_steps
    self.reload_dataloaders_every_epoch = reload_dataloaders_every_epoch
    self.auto_lr_find = auto_lr_find
    self.auto_scale_batch_size = auto_scale_batch_size
    self._is_data_prepared = False
    self.replace_sampler_ddp = replace_sampler_ddp
    self.truncated_bptt_steps = truncated_bptt_steps
    self.resume_from_checkpoint = resume_from_checkpoint
    self.terminate_on_nan = terminate_on_nan
    self.shown_warnings = set()
    self.fast_dev_run = fast_dev_run
    # fast_dev_run shrinks everything to a single batch / single epoch.
    if self.fast_dev_run:
        limit_train_batches = 1
        limit_val_batches = 1
        limit_test_batches = 1
        self.num_sanity_val_steps = 0
        self.max_epochs = 1
        rank_zero_info(
            'Running in fast_dev_run mode: will run a full train,'
            ' val and test loop using a single batch')
    # configure profiler
    if profiler is True:
        profiler = SimpleProfiler()
    self.profiler = profiler or PassThroughProfiler()
    # accumulated grads
    self.accumulate_grad_batches = accumulate_grad_batches
    self.configure_accumulated_gradients(accumulate_grad_batches)
    # override with environment flag
    gpus = os.environ.get('PL_TRAINER_GPUS', gpus)
    # for gpus allow int, string and gpu list
    if auto_select_gpus and isinstance(gpus, int):
        self.gpus = self.tuner.pick_multiple_gpus(gpus)
    else:
        self.gpus = gpus
    self.data_parallel_device_ids = device_parser.parse_gpu_ids(self.gpus)
    self.root_gpu = device_parser.determine_root_gpu_device(
        self.data_parallel_device_ids)
    # root_device starts on CPU; presumably updated later during setup —
    # not visible from this block.
    self.root_device = torch.device("cpu")
    self.on_gpu = True if (self.data_parallel_device_ids
                           and torch.cuda.is_available()) else False
    # tpu state flags
    self.use_tpu = False
    self.tpu_local_core_rank = None
    self.tpu_global_core_rank = None
    # distributed backend choice
    self.distributed_backend = distributed_backend
    self.set_distributed_mode(distributed_backend)
    # override dist backend when using tpus
    if self.on_tpu:
        self.distributed_backend = 'tpu'
        self.init_tpu()
    # init flags for SLURM+DDP to work
    self.world_size = 1
    self.interactive_ddp_procs = []
    self.configure_slurm_ddp(self.num_nodes)
    self.node_rank = self.determine_ddp_node_rank()
    self.local_rank = self.determine_local_rank()
    self.global_rank = 0
    # NVIDIA setup
    self.set_nvidia_flags(self.is_slurm_managing_tasks,
                          self.data_parallel_device_ids)
    self._progress_bar_callback = self.configure_progress_bar(
        progress_bar_refresh_rate, process_position)
    # logging
    self.configure_logger(logger)
    self.log_save_interval = log_save_interval
    self.row_log_interval = row_log_interval
    # how much of the data to use
    # TODO: remove in 0.10.0
    if overfit_pct is not None:
        rank_zero_warn(
            "Argument `overfit_pct` is now set by `overfit_batches` since v0.8.0"
            " and this argument will be removed in v0.10.0",
            DeprecationWarning,
        )
        overfit_batches = overfit_pct
    # TODO: remove in 0.10.0
    if val_percent_check is not None:
        rank_zero_warn(
            "Argument `val_percent_check` is now set by `limit_val_batches` since v0.8.0"
            " and this argument will be removed in v0.10.0",
            DeprecationWarning,
        )
        limit_val_batches = val_percent_check
    # TODO: remove in 0.10.0
    if test_percent_check is not None:
        rank_zero_warn(
            "Argument `test_percent_check` is now set by `limit_test_batches` since v0.8.0"
            " and this argument will be removed in v0.10.0",
            DeprecationWarning,
        )
        limit_test_batches = test_percent_check
    # TODO: remove in 0.10.0
    if train_percent_check is not None:
        rank_zero_warn(
            "Argument `train_percent_check` is now set by `limit_train_batches` since v0.8.0"
            " and this argument will be removed in v0.10.0",
            DeprecationWarning,
        )
        limit_train_batches = train_percent_check
    self.limit_train_batches = _determine_batch_limits(
        limit_train_batches, 'limit_train_batches')
    self.limit_val_batches = _determine_batch_limits(
        limit_val_batches, 'limit_val_batches')
    self.limit_test_batches = _determine_batch_limits(
        limit_test_batches, 'limit_test_batches')
    self.val_check_interval = _determine_batch_limits(
        val_check_interval, 'val_check_interval')
    self.overfit_batches = _determine_batch_limits(overfit_batches,
                                                   'overfit_batches')
    self.determine_data_use_amount(self.overfit_batches)
    # AMP init
    # These are the only lines needed after v0.8.0
    # we wrap the user's forward with autocast and give it back at the end of fit
    self.autocast_original_forward = None
    self.precision = precision
    self.scaler = None
    self.amp_level = amp_level
    self.initializer.init_amp(amp_backend)
    self.on_colab_kaggle = os.getenv('COLAB_GPU') or os.getenv(
        'KAGGLE_URL_BASE')
    # Callback system
    self.on_init_end()