def begin_fit(self):
    self.run.model = fp16.convert_network(self.model, dtype=torch.float16)
    self.model_pgs, self.master_pgs = get_master(self.opt, self.flat_master)
    # Changes the optimizer so that the optimization step is done in FP32.
    self.run.opt.param_groups = self.master_pgs  # Put those param groups inside our runner.
    if self.dynamic: self.count = 0
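
This callback only builds the FP32 master copies at the start of fit; every iteration then has to move gradients from the FP16 model parameters to the master parameters before the step, and copy the updated master weights back afterwards. A minimal sketch of those two companion steps, assuming model_pgs and master_pgs are parallel lists of parameter lists and flat_master=False (the method names after_backward and after_step are illustrative, not part of the snippet above):

def after_backward(self):
    # Copy the FP16 gradients computed on the model into the FP32 master copies,
    # so the optimizer step happens entirely in FP32.
    for model_pg, master_pg in zip(self.model_pgs, self.master_pgs):
        for model_p, master_p in zip(model_pg, master_pg):
            if model_p.grad is not None:
                if master_p.grad is None:
                    master_p.grad = torch.zeros_like(master_p)
                master_p.grad.copy_(model_p.grad)

def after_step(self):
    # Write the FP32 results of optimizer.step() back into the FP16 model
    # parameters used for the next forward pass.
    for model_pg, master_pg in zip(self.model_pgs, self.master_pgs):
        for model_p, master_p in zip(model_pg, master_pg):
            model_p.data.copy_(master_p.data)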
Example #2
def begin_fit(self):
    self.run.model = fp16.convert_network(self.model, dtype=torch.float16)
    self.model_pgs, self.master_pgs = get_master(self.opt, self.flat_master)
    # Changes the optimizer so that the optimization step is done in FP32.
    param_groups = self.opt.param_groups  # Load the old param groups to get the HP values
    for (pg, mp) in zip(param_groups, self.master_pgs): pg['params'] = mp  # Replace the parameters with the new ones
    self.run.opt.param_groups = param_groups  # Put those param groups inside our runner.
    if self.dynamic: self.count = 0
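
The self.count reset at the end is the counter used for dynamic loss scaling: after enough iterations without a gradient overflow the loss scale is increased, and on overflow it is decreased and the step is skipped. A rough sketch of that logic, with assumed attribute names (loss_scale, scale_wait, skip_step) that are not part of the snippet:

def grads_overflow(params):
    # True if any gradient contains an inf or a nan.
    return any(p.grad is not None and not torch.isfinite(p.grad).all()
               for p in params)

def after_backward(self):
    if not self.dynamic:
        return
    params = [p for pg in self.master_pgs for p in pg]
    if grads_overflow(params):
        # Overflow: shrink the scale, skip this optimizer step, restart counting.
        self.loss_scale /= 2
        self.count = 0
        self.skip_step = True
    else:
        self.count += 1
        if self.count == self.scale_wait:  # e.g. 500 overflow-free iterations
            self.count = 0
            self.loss_scale *= 2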
Example #3
def on_train_begin(self):
    """Convert network to float16"""
    self.learner._model = convert_network(self.learner._model, float16)
    self.model_param_groups, self.master_param_groups = get_param_groups(
        self.learner._optimizer)
    # self.learner._optimizer.param_groups = self.master_param_groups
    # self.learner._optimizer.zero_grad = self.learner._model.zero_grad
    copy_param_to_optimizer(self.learner._optimizer,
                            self.master_param_groups)
    if self.dynamic: self.count = 0
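
copy_param_to_optimizer is not defined in this excerpt; judging from the commented-out alternative above, it most likely points the optimizer's param groups at the FP32 master copies. A hypothetical sketch (the real helper may differ):

def copy_param_to_optimizer(optimizer, master_param_groups):
    # Replace the params of each optimizer group with the FP32 master copies,
    # so optimizer.step() updates full-precision weights.
    for param_group, master_params in zip(optimizer.param_groups, master_param_groups):
        param_group['params'] = master_params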
Example #4
    def begin_fit(self):
        # Helper 1: Convert model (except for any batchnorm layers) to FP16:
        self.run.model = fp16.convert_network(self.model, dtype=torch.float16)

        # Helper 2: Creating a FP32 master copy of parameter weights
        self.model_param_groups, self.master_param_groups = get_master(
            self.opt, self.flat_master)
        # To place those FP32 master copy param groups inside the runner:
        self.run.opt.param_groups = self.master_param_groups

        # To count number of iterations without gradient overflow occurring.
        if self.dynamic: self.count = 0
Example #5
def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
    from apex.parallel import DistributedDataParallel as apex_DDP
    from .amp import init as amp_init

    optimizers_was_list = False
    if isinstance(optimizers, torch.optim.Optimizer):
        optimizers = [optimizers]
    elif optimizers is None:
        optimizers = []
    elif isinstance(optimizers, list):
        optimizers_was_list = True
        check_optimizers(optimizers)
    else:
        check_optimizers([optimizers])
        raise TypeError("optimizers must be either a single optimizer or a list of optimizers.")

    if isinstance(models, torch.nn.Module):
        models_was_list = False
        models = [models]
    elif isinstance(models, list):
        models_was_list = True
    else:
        raise TypeError("models must be either a single model or a list of models.")

    check_models(models)

    if not _amp_state.allow_incoming_model_not_fp32:
        check_params_fp32(models)


    # In the future, when FP16_Optimizer can be deprecated and master weights can
    # become an attribute, remember to stash master weights before casting the model.

    if properties.cast_model_type:
        if properties.keep_batchnorm_fp32:
            for model in models:
                convert_network(model, properties.cast_model_type)
        else:
            for model in models:
                model.to(properties.cast_model_type)

        input_caster = functools.partial(to_type, properties.cast_model_type)
        if cast_model_outputs is not None:
            output_caster = functools.partial(to_type, cast_model_outputs)
        else:
            output_caster = functools.partial(to_type, torch.float32)

        for model in models:
            # Patch the forward method to cast incoming data to the correct type, and
            # outgoing data to float32, so "the user never needs to call .half()."
            # I like writing things explicitly more than decorators.
            def patch_forward(old_fwd):
                def new_fwd(*args, **kwargs):
                    output = old_fwd(*applier(args, input_caster),
                                     **applier(kwargs, input_caster))
                    return applier(output, output_caster)
                return new_fwd

            model.forward = patch_forward(model.forward)

        # State dict trick to recast any preexisting per-param state tensors 
        for optimizer in optimizers:
            optimizer.load_state_dict(optimizer.state_dict())
    elif cast_model_outputs is not None:
        output_caster = functools.partial(to_type, cast_model_outputs)

        for model in models:
            def patch_forward(old_fwd):
                def new_fwd(*args, **kwargs):
                    output = old_fwd(*args, **kwargs)
                    return applier(output, output_caster)
                return new_fwd

            model.forward = patch_forward(model.forward)

    for i, optimizer in enumerate(optimizers):
        # Still need to special case this for the first pass
        if isinstance(optimizer, FusedAdam):
            optimizers[i] = wrap_fused_adam(optimizer, properties)
        else:
            optimizers[i] = _process_optimizer(optimizer, properties)

    _amp_state.loss_scalers = []
    for _ in range(num_losses):
        _amp_state.loss_scalers.append(LossScaler(properties.loss_scale))

    if properties.patch_torch_functions:
        # handle is unused here. It's accessible later through a global value anyway.
        handle = amp_init(loss_scale=properties.loss_scale, verbose=(_amp_state.verbosity == 2))
        for optimizer in optimizers:
            # Disable Amp casting for the optimizer step, because it should only be
            # applied to FP32 master params anyway.
            def patch_step(old_step):
                def new_step(*args, **kwargs):
                    with disable_casts():
                        output = old_step(*args, **kwargs)
                    return output
                return new_step

            optimizer.step = patch_step(optimizer.step)

    if optimizers_was_list:
        if models_was_list:
            return models, optimizers
        else:
            return models[0], optimizers
    else:
        if models_was_list:
            if len(optimizers) == 0:
                return models
            else:
                return models, optimizers[0]
        else:
            if len(optimizers) == 0:
                return models[0]
            else:
                return models[0], optimizers[0]
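
_initialize is not called directly in user code; it is reached through the public apex.amp.initialize entry point, which builds the properties object from an opt_level string. A typical call path and training loop look roughly like this (opt_level="O2" and loss_scale="dynamic" are just one common choice):

import torch
import torch.nn.functional as F
from apex import amp

model = torch.nn.Linear(10, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

# Casts the model to FP16 (keeping batchnorm in FP32), builds FP32 master
# weights and sets up dynamic loss scaling, via the _initialize shown above.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")

x = torch.randn(64, 10, device="cuda")
y = torch.randint(0, 2, (64,), device="cuda")

for _ in range(10):
    optimizer.zero_grad()
    loss = F.cross_entropy(model(x), y)
    # Scale the loss so small FP16 gradients do not underflow; the scale is
    # removed again before the optimizer step.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()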
Example #6
def _initialize(models, optimizers, properties):
    from apex.parallel import DistributedDataParallel as apex_DDP
    from .amp import init as amp_init

    if isinstance(optimizers, torch.optim.Optimizer):
        optimizers_was_list = False
        optimizers = [optimizers]
    elif isinstance(optimizers, list):
        optimizers_was_list = True
    else:
        raise TypeError(
            "optimizers must be either a single optimizer or a list of optimizers."
        )

    if isinstance(models, torch.nn.Module):
        models_was_list = False
        models = [models]
    elif isinstance(models, list):
        models_was_list = True
    else:
        raise TypeError(
            "models must be either a single model or a list of models.")

    check_models(models)

    check_params_fp32(models)

    check_optimizers(optimizers)

    # In the future, when FP16_Optimizer can be deprecated and master weights can
    # become an attribute, remember to stash master weights before casting the model.

    if properties.cast_model_type:
        if properties.keep_batchnorm_fp32:
            for model in models:
                convert_network(model, properties.cast_model_type)
        else:
            for model in models:
                model.to(properties.cast_model_type)

        caster = functools.partial(to_type, properties.cast_model_type)

        # Patch the forward method to cast incoming data to the correct type.
        # I like writing things explicitly more than decorators.
        def patch_forward(old_fwd):
            def new_fwd(*args, **kwargs):
                return old_fwd(*applier(args, caster),
                               **applier(kwargs, caster))

            return new_fwd

        for model in models:
            model.forward = patch_forward(model.forward)

        # State dict trick to recast any preexisting per-param state tensors
        for optimizer in optimizers:
            optimizer.load_state_dict(optimizer.state_dict())

    if properties.master_weights:
        for i, optimizer in enumerate(optimizers):
            if isinstance(optimizer, FusedAdam):
                optimizers[i] = wrap_fused_adam(optimizer, properties)
            elif properties.loss_scale == "dynamic":
                optimizers[i] = FP16_Optimizer_general(optimizer,
                                                       dynamic_loss_scale=True,
                                                       verbose=False)
            else:
                optimizers[i] = FP16_Optimizer_general(
                    optimizer,
                    static_loss_scale=properties.loss_scale,
                    verbose=False)
    else:
        for optimizer in optimizers:
            optimizer.loss_scaler = LossScaler(properties.loss_scale)

    if properties.patch_torch_functions:
        # handle is unused here. It's accessible later through a global value anyway.
        handle = amp_init(loss_scale=properties.loss_scale)
        for optimizer in optimizers:
            # Disable Amp casting for the optimizer step, because it should only be
            # applied to FP32 master params anyway.
            def patch_step(old_step):
                def new_step(*args, **kwargs):
                    with disable_casts():
                        output = old_step(*args, **kwargs)
                    return output

                return new_step

            optimizer.step = patch_step(optimizer.step)

    if optimizers_was_list:
        if models_was_list:
            return models, optimizers
        else:
            return models[0], optimizers
    else:
        if models_was_list:
            return models, optimizers[0]
        else:
            return models[0], optimizers[0]
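
Both versions wrap optimizer.step (and model.forward) through a small factory function such as patch_step instead of a bare closure inside the loop; the factory binds old_step at patch time, so each optimizer keeps its own original step. A standalone illustration of why that matters in Python, unrelated to apex itself:

def wrap_all_wrong(fns):
    # Late binding: every wrapper sees the *last* value of f after the loop ends.
    wrapped = []
    for f in fns:
        wrapped.append(lambda x: 2 * f(x))
    return wrapped

def wrap_all_right(fns):
    # A factory (like patch_step above) captures f as an argument at patch time.
    def patch(f):
        return lambda x: 2 * f(x)
    return [patch(f) for f in fns]

fns = [lambda x: x + 1, lambda x: x + 10]
print([g(0) for g in wrap_all_wrong(fns)])  # [20, 20] -- both use the last fn
print([g(0) for g in wrap_all_right(fns)])  # [2, 20]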
Example #7
model = model_to_half(model)


def check_weights(model):
    for i, t in enumerate([torch.float16, torch.float32, torch.float16]):
        assert model[i].weight.dtype == t
        assert model[i].bias.dtype == t


check_weights(model)
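
# `model_to_half` is not shown in this excerpt; a minimal sketch that matches the
# dtypes checked above (everything in FP16 except the batchnorm layer, which is
# kept in FP32 for stable statistics) could look like this -- the real helper may differ:


def model_to_half(model):
    # Convert every parameter and buffer to FP16...
    model = model.half()
    # ...then put any batchnorm layers back in FP32.
    for module in model.modules():
        if isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
            module.float()
    return model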

# In Apex, the function that does this for us is `convert_network`. We can use it to put the model in FP16 or back to FP32.

model = nn.Sequential(nn.Linear(10, 30), nn.BatchNorm1d(30),
                      nn.Linear(30, 2)).cuda()
model = fp16.convert_network(model, torch.float16)
check_weights(model)

# ### Creating the master copy of the parameters

# From our model parameters (mostly in FP16), we'll want to create a copy in FP32 (the master parameters) that we will use for the step in the optimizer. Optionally, we concatenate all the parameters into one big flat tensor, which can make that step a little bit faster.

from torch.nn.utils import parameters_to_vector


def get_master(model, flat_master=False):
    model_params = [
        param for param in model.parameters() if param.requires_grad
    ]
    if flat_master:
        master_param = parameters_to_vector(