Example #1
    def _init_apex_grad_scaler(self):
        # Check Apex availability
        if not is_apex_available():
            raise RuntimeError("Apex is not available. Can't use mixed precision")

        # "amp_args" are actually Apex Amp args
        self.amp_args = self.config.MODEL.AMP_PARAMS.AMP_ARGS
        logging.info(f"Setting AMP: using apex, args {self.amp_args}")
Example #2
    def _build_optimizer(self):
        """
        Build the optimizer using the optimizer settings specified by the user.
        For SGD, we support LARC as well. In order to use LARC, Apex must
        be installed.
        """
        optimizer_config = self.config["OPTIMIZER"]
        if optimizer_config.use_larc and optimizer_config.name != "sgd_fsdp":
            assert is_apex_available(), "Apex must be available to use LARC"
        optim = build_optimizer(optimizer_config)
        return optim
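For context, a rough sketch of what enabling LARC typically means when Apex is available (illustrative only; VISSL's build_optimizer handles this internally, and the trust_coefficient value here is just an example):

import torch
from apex.parallel.LARC import LARC

# Illustrative base optimizer; in VISSL this is built from the OPTIMIZER config.
params = [torch.nn.Parameter(torch.randn(10, 10))]
base_optimizer = torch.optim.SGD(params, lr=0.1, momentum=0.9, weight_decay=1e-4)

# LARC rescales each layer's learning rate by the ratio of parameter norm to
# gradient norm (the "trust ratio"); the implementation lives in Apex, hence
# the availability assertion above.
optimizer = LARC(optimizer=base_optimizer, trust_coefficient=0.001, clip=False)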
Example #3
    def set_amp_args(self):
        """
        Two automatic mixed precision implementations are available: Apex's and PyTorch's.

        - If Apex's AMP is enabled, amp_args is a dictionary containing arguments
        to be passed to amp.initialize. Set to None to disable amp.
        To enable mixed precision training, pass amp_args={"opt_level": "O1"} here.
        See https://nvidia.github.io/apex/amp.html for more info.

        - If PyTorch's AMP is enabled, no arguments are needed.
        """

        if self.config.MODEL.AMP_PARAMS.USE_AMP:
            assert (
                self.device.type == "cuda"
            ), "Mixed precision is only available on CUDA devices for now"

            # This will rightly fail if the setting is not correct
            self.amp_type = AmpType[self.config.MODEL.AMP_PARAMS.AMP_TYPE.upper()]

            # Check Apex availability
            if self.amp_type == AmpType.APEX:
                if not is_apex_available():
                    raise RuntimeError(
                        "Apex is not available. Can't use mixed precision"
                    )

                # "amp_args" are actually Apex Amp args
                self.amp_args = self.config.MODEL.AMP_PARAMS.AMP_ARGS
                logging.info(f"Setting AMP: using apex, args {self.amp_args}")

            elif self.amp_type == AmpType.PYTORCH:
                # if the optimizer is sharded or FSDP data parallel is used, then the GradScaler
                # needs to be shard-aware.
                if (
                    self.config["TRAINER"]["TASK_NAME"] == "self_supervision_fsdp_task"
                    or self.config["OPTIMIZER"]["name"] == "zero"
                ):
                    assert is_fairscale_sharded_available(), (
                        "To use ZeRO with PyTorch AMP, ShardedGradScaler() "
                        "from fairscale is needed. Please upgrade fairscale"
                    )
                    from fairscale.optim.grad_scaler import ShardedGradScaler

                    self.amp_grad_scaler = ShardedGradScaler()
                    logging.info("Setting AMP: using sharded grad scaler")
                else:
                    self.amp_grad_scaler = TorchGradScaler()
                    logging.info("Setting AMP: using pytorch grad scaler")
            logging.info(f"Setting AMP: {self.amp_type} - args: {self.amp_args}")

        else:
            self.amp_args, self.amp_type = None, None
            logging.info("Not using Automatic Mixed Precision")
Example #4
    def set_amp_args(self):
        """
        Two automatic mixed precision implementations are available: Apex's and PyTorch's.

        - If Apex's AMP is enabled, amp_args is a dictionary containing arguments
        to be passed to amp.initialize. Set to None to disable amp.
        To enable mixed precision training, pass amp_args={"opt_level": "O1"} here.
        See https://nvidia.github.io/apex/amp.html for more info.

        - If PyTorch's AMP is enabled, no arguments are needed.
        """

        if self.config.MODEL.AMP_PARAMS.USE_AMP:
            assert (
                self.device.type == "cuda"
            ), "Mixed precision is only available on CUDA devices for now"

            # This will rightly fail if the setting is not correct
            self.amp_type = AmpType[self.config.MODEL.AMP_PARAMS.AMP_TYPE.upper()]

            # Check Apex availability
            if self.amp_type == AmpType.APEX:
                if not is_apex_available():
                    raise RuntimeError(
                        "Apex is not available. Can't use mixed precision"
                    )

                # "amp_args" are actually Apex Amp args
                self.amp_args = self.config.MODEL.AMP_PARAMS.AMP_ARGS

            elif self.amp_type == AmpType.PYTORCH:
                # If the optimizer is sharded, then the GradScaler needs to be shard-aware
                self.amp_grad_scaler = (
                    ShardedGradScaler()
                    if self.config["OPTIMIZER"]["name"] == "zero"
                    else TorchGradScaler()
                )
            logging.info(f"Setting AMP: {self.amp_type} - args: {self.amp_args}")

        else:
            self.amp_args, self.amp_type = None, None
            logging.info("Not using Automatic Mixed Precision")
Example #5
    def _set_classy_state(self, state):
        """
        We load/set the model state here to resume correctly from the
        specified state. Usually called when resuming training from a previous
        model checkpoint.
        We set the model phase (train or eval), set the model weights,
        copy the model to the correct device, initialize meters, optimizers
        and amp state, set the loss state, set the train phase number and
        iteration, recreate the data iterators, etc.
        """
        logging.info("=======Updating classy state_dict from checkpoint=======")
        # here we load the state specific things only. The other extra variables
        # are init from the checkpoint in the trainer step.
        self.train = state["train"]
        self.base_model.set_classy_state(state["base_model"])
        # We need to set the model on the correct device here, unlike in the
        # case of training from scratch. The optimizer looks at model
        # parameters (momentum etc.) to get the device info. When training
        # from scratch those don't exist yet and the optimizer simply receives
        # CUDA inputs from the model, so it works. However, when we load from
        # a checkpoint, those parameters already exist and live on the CPU
        # (since the model isn't copied to the GPU yet), and copy_model_to_gpu()
        # doesn't change the device of the optimizer params. The optimizer would
        # therefore be constructed with CPU inputs while the running model
        # sends CUDA tensors.
        self.base_model.to(self.device)

        self._set_ema_model_state(state)

        for meter, meter_state in zip(self.meters, state["meters"]):
            meter.set_classy_state(meter_state)
        self.optimizer.set_classy_state(state["optimizer"])

        # restore amp state. It's called after amp.initialize is done.
        if "amp" in state:
            if self.amp_type == AmpType.APEX:
                if is_apex_available():
                    apex.amp.load_state_dict(state["amp"])
                else:
                    logging.warning(
                        "Loading a checkpoint which has amp state but apex isn't available now"
                    )
            else:
                self.amp_grad_scaler.load_state_dict(state["amp"])
        self.phase_idx = state["phase_idx"]
        self.train_phase_idx = state["train_phase_idx"]
        self.num_updates = state["num_updates"]
        self.losses = state["losses"]

        phase_type = "train" if self.train else "test"
        phase = self.phases[self.phase_idx]

        # Re-create the data iterator.
        # We are restoring from a checkpoint, which means we need to
        #   (1) set the right epoch
        #   (2) set the right start_iter
        # epoch number is `phase_idx + 1` since checkpoint's value is the epoch finished.
        # start_iter is computed in recreate_data_iterator based on iteration
        # number from the checkpoint state.
        self.recreate_data_iterator(
            phase_type,
            epoch=self.phase_idx + 1,
            compute_start_iter=True,
            train_phase_idx=self.train_phase_idx + 1,
        )

        # set the model to train or eval depending on what phase we are in
        self.base_model.train(phase["train"])

        if self.train and self.train_phase_idx >= 0:
            self.optimizer.on_epoch(self.where)
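The restore logic above expects an "amp" entry in the checkpoint. A hypothetical sketch of the matching save-side helper follows; the method name _get_amp_state is made up here for illustration, only the state_dict() calls are standard Apex/PyTorch API.

    def _get_amp_state(self):
        # Hypothetical helper: mirrors the load path above when saving a checkpoint.
        if self.amp_type == AmpType.APEX and is_apex_available():
            return apex.amp.state_dict()
        elif self.amp_type == AmpType.PYTORCH:
            return self.amp_grad_scaler.state_dict()
        return None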
Example #6
from classy_vision.tasks import ClassificationTask, register_task
from classy_vision.tasks.classification_task import AmpType, BroadcastBuffersMode
from fairscale.nn import FullyShardedDataParallel
from iopath.common.file_io import g_pathmgr
from torch.cuda.amp import GradScaler as TorchGradScaler
from vissl.config import AttrDict
from vissl.data import build_dataloader, build_dataset, print_sampler_config
from vissl.models import build_model, convert_sync_bn
from vissl.optimizers import get_optimizer_param_groups
from vissl.utils.activation_checkpointing import manual_gradient_reduction
from vissl.utils.checkpoint import CheckpointLoader
from vissl.utils.ema_model import ModelEmaV2
from vissl.utils.misc import is_apex_available, is_fairscale_sharded_available


if is_apex_available():
    import apex


@register_task("self_supervision_task")
class SelfSupervisionTask(ClassificationTask):
    """
    A task prepares and holds all the components of a training run, such as the
    optimizer, datasets, dataloaders, losses, meters, etc. The task also holds
    variables like the training iteration, epoch number, etc. that are updated
    during training.

    We prepare every single component according to the parameter settings the
    user wants, as specified in the yaml config file.

    The task also supports 2 additional things:
    1) converts the model BatchNorm layers to the synchronized batchnorm