def _make_params(self):
    """Swap the wrapped attribute for ``<name>_u``, ``<name>_v`` and ``<name>_bar``.

    The attribute ``self.name`` of ``self.module`` is read, two random vectors
    sized after its first dimension (``u``) and its flattened remaining
    dimensions (``v``) are created, and all three tensors are registered on
    the wrapped module as parameters. The original entry is removed from the
    wrapped module's parameter dict.
    """
    module, attr = self.module, self.name
    weight = getattr(module, attr)

    n_rows = weight.data.shape[0]
    n_cols = weight.view(n_rows, -1).data.shape[1]

    # Standard-normal vectors, kept out of autograd.
    left_vec = Parameter(weight.data.new(n_rows).normal_(0, 1), requires_grad=False)
    right_vec = Parameter(weight.data.new(n_cols).normal_(0, 1), requires_grad=False)

    # Rescale both vectors with the file's l2normalize helper.
    left_vec.data = l2normalize(left_vec.data)
    right_vec.data = l2normalize(right_vec.data)

    # Trainable parameter built from the original weight's data.
    weight_bar = Parameter(weight.data)

    # From here on the attribute is tracked through the "_bar" parameter.
    del module._parameters[attr]

    for suffix, param in (("_u", left_vec), ("_v", right_vec), ("_bar", weight_bar)):
        module.register_parameter(attr + suffix, param)
def _make_params(self):
    """Replace the wrapped attribute with ``<name>_u``, ``<name>_v``, ``<name>_bar``.

    ``u`` has one entry per row of the weight (its first dimension) and ``v``
    one entry per column of the weight flattened to 2-D. Both are drawn from
    a standard normal and passed through ``l2normalize``.
    """
    host = self.module
    key = self.name
    w = getattr(host, key)

    rows = w.data.shape[0]
    cols = w.view(rows, -1).data.shape[1]

    # NOTE(review): u and v are created with requires_grad=True here, so they
    # are visible to autograd -- confirm this is intended.
    u, v = (
        Parameter(w.data.new(length).normal_(0, 1), requires_grad=True)
        for length in (rows, cols)
    )
    for vec in (u, v):
        vec.data = l2normalize(vec.data)

    w_bar = Parameter(w.data)

    # Drop the original entry, then register the three replacements.
    host._parameters.pop(key)
    host.register_parameter(f"{key}_u", u)
    host.register_parameter(f"{key}_v", v)
    host.register_parameter(f"{key}_bar", w_bar)
def __call__(self, mu: Parameter, rho: Parameter) -> TwoParameters:
    """Fill ``mu`` and ``rho`` in place with uniformly distributed values.

    Arguments:
        mu (nn.Parameter): parameter whose data is re-sampled using the
            bounds unpacked from ``self.mu_range``
        rho (nn.Parameter): parameter whose data is re-sampled using the
            bounds unpacked from ``self.rho_range``

    Returns:
        (nn.Parameter): the same ``mu`` object, initialized
        (nn.Parameter): the same ``rho`` object, initialized
    """
    sampled_mu = mu.data.uniform_(*self.mu_range)
    mu.data = sampled_mu

    sampled_rho = rho.data.uniform_(*self.rho_range)
    rho.data = sampled_rho

    return mu, rho
def fixup(p: Parameter, is_sharded: bool, size: torch.Size) -> Parameter:
    """Give ``p`` its own copy of its data and tag it with sharding metadata.

    Args:
        p: parameter to update in place.
        is_sharded: value stored on ``p._is_sharded``.
        size: value stored on ``p._orig_size``.

    Returns:
        The same ``p`` object.
    """
    assert isinstance(p, Parameter)

    # Cloning moves the tensor out of shared memory.
    private_copy = p.data.clone()
    p.data = private_copy

    p._is_sharded, p._orig_size = is_sharded, size
    return p
def _make_params(self):
    """Create ``<name>_u``, ``<name>_v`` and ``<name>_bar`` on the wrapped module.

    The original ``self.name`` entry is removed from the wrapped module's
    parameter dict; its data lives on in the ``_bar`` parameter.
    """
    weight = getattr(self.module, self.name)

    out_dim = weight.data.shape[0]
    in_dim = weight.view(out_dim, -1).data.shape[1]

    # Allocate through the weight's own data tensor.
    fresh = weight.data.new
    u = Parameter(fresh(out_dim).normal_(0, 1), requires_grad=False)
    v = Parameter(fresh(in_dim).normal_(0, 1), requires_grad=False)
    u.data, v.data = l2normalize(u.data), l2normalize(v.data)

    replacements = {"_u": u, "_v": v, "_bar": Parameter(weight.data)}

    del self.module._parameters[self.name]
    for suffix, tensor in replacements.items():
        self.module.register_parameter(self.name + suffix, tensor)
def _make_params(self):
    """
        Build the auxiliary parameters from scratch.

        Reads ``self.name`` from ``self.module`` and registers ``<name>_u``,
        ``<name>_v`` and ``<name>_bar`` on that module in its place.
    """
    layer = self.module
    weight = getattr(layer, self.name)

    # Vector lengths: first dimension, and everything else flattened.
    n_out = weight.data.shape[0]
    n_in = weight.view(n_out, -1).data.shape[1]

    # Draw both vectors first, then rescale each with self.l2normalize.
    u = Parameter(weight.data.new(n_out).normal_(0, 1), requires_grad=False)
    v = Parameter(weight.data.new(n_in).normal_(0, 1), requires_grad=False)
    for vector in (u, v):
        vector.data = self.l2normalize(vector.data)
    w_bar = Parameter(weight.data)

    # Register the parameters on the module in place of the original entry.
    del layer._parameters[self.name]
    layer.register_parameter(self.name + "_u", u)
    layer.register_parameter(self.name + "_v", v)
    layer.register_parameter(self.name + "_bar", w_bar)
def _make_params(self):
    """Register ``<name>_u``, ``<name>_v`` and ``<name>_bar`` on the wrapped module.

    Example carried over from the original comments: for the weight of
    ``nn.ConvTranspose2d(z_dim, conv_dim * mult, 4)`` with shape
    100x512x4x4, ``height`` is 100 and ``width`` is 8192.
    """
    name = self.name
    module = self.module
    w = getattr(module, name)  # fetch the weight first

    # The first axis is the matrix height; every other axis is flattened
    # into the width.
    height = w.data.shape[0]
    width = w.view(height, -1).data.shape[1]

    def _random_vector(length):
        # Random standard-normal vector, excluded from autograd.
        return Parameter(w.data.new(length).normal_(0, 1), requires_grad=False)

    u = _random_vector(height)
    v = _random_vector(width)

    # Normalize both vectors.
    u.data = l2normalize(u.data)
    v.data = l2normalize(v.data)

    w_bar = Parameter(w.data)

    del module._parameters[name]
    module.register_parameter(name + "_u", u)
    module.register_parameter(name + "_v", v)
    module.register_parameter(name + "_bar", w_bar)
def _make_params(self):
    """Replace the wrapped weight with ``<name>_u``, ``<name>_v``, ``<name>_matrices``.

    ``u`` and ``v`` are random vectors, normalized with ``self.l2normalize``
    and excluded from autograd. They are created with the same dtype and
    device as the weight's data, so that they can be combined with the
    weight regardless of where the wrapped module lives. The original
    ``self.name`` entry is removed from the wrapped module's parameter dict.
    """
    w = getattr(self.module, self.name)

    # height = dout, width = din * w * h
    height = w.data.shape[0]
    width = w.view(height, -1).data.shape[1]

    # Sample with t.randn as before, then match the weight's dtype/device.
    # For a default-dtype CPU weight this is a no-op.
    u = Parameter(t.randn(height).to(w.data), requires_grad=False)
    v = Parameter(t.randn(width).to(w.data), requires_grad=False)
    u.data = self.l2normalize(u.data)
    v.data = self.l2normalize(v.data)

    # w is a Parameter while w.data is a plain Tensor; wrapping it in a new
    # Parameter makes it require gradients by default.
    w_real = Parameter(w.data)

    # Deleting the dict entry only removes one reference to the original
    # parameter; the local `w` still refers to it.
    del self.module._parameters[self.name]

    # Add the new parameters to the module.
    self.module.register_parameter(self.name + '_u', u)
    self.module.register_parameter(self.name + '_v', v)
    self.module.register_parameter(self.name + '_matrices', w_real)
def _make_params(self):
    """Replace the wrapped 2-D weight with ``<name>_u``, ``<name>_v``, ``<name>_bar``.

    ``u`` (one entry per row) and ``v`` (one entry per column) are random
    standard-normal vectors passed through ``l2normalize`` and excluded from
    autograd. ``<name>_bar`` is a trainable parameter built from the original
    weight's data. The original ``self.name`` entry is removed from the
    wrapped module's parameter dict.

    Raises:
        ValueError: if the weight is not 2-D (the size unpacking fails).
    """
    weight = getattr(self.module, self.name)
    h, w = weight.size()

    u = Parameter(weight.data.new(h).normal_(0, 1), requires_grad=False)
    v = Parameter(weight.data.new(w).normal_(0, 1), requires_grad=False)
    u.data = l2normalize(u.data)
    v.data = l2normalize(v.data)

    # `w` is the integer width at this point, so the tensor has to be taken
    # from `weight` (the previous `w.data` raised AttributeError).
    w_bar = Parameter(weight.data)

    del self.module._parameters[self.name]
    self.module.register_parameter(self.name + "_u", u)
    self.module.register_parameter(self.name + "_v", v)
    self.module.register_parameter(self.name + "_bar", w_bar)
def _make_params(self, w_init):
    """Optionally re-initialize the weight, then register ``_u``/``_v``/``_bar``.

    Args:
        w_init: when truthy, the weight data is re-initialized before the
            auxiliary parameters are created.
    """
    target = self.module
    weight = getattr(target, self.name)

    # tbd initialization here, He initialization for relu
    # NOTE(review): both statements are assumed to sit under `if w_init:`;
    # the normal_ fill is immediately overwritten by kaiming_normal_ -- confirm.
    if w_init:
        weight.data.normal_(0.0, 0.1)
        weight.data = init.kaiming_normal_(weight.data)

    n_rows = weight.data.shape[0]
    n_cols = weight.view(n_rows, -1).data.shape[1]

    u = Parameter(weight.data.new(n_rows).normal_(0, 1), requires_grad=False)
    v = Parameter(weight.data.new(n_cols).normal_(0, 1), requires_grad=False)
    for vec in (u, v):
        vec.data = l2normalize(vec.data)

    w_bar = Parameter(weight.data)

    del target._parameters[self.name]
    for suffix, param in zip(("_u", "_v", "_bar"), (u, v, w_bar)):
        target.register_parameter(self.name + suffix, param)
def _make_params(self):
    """Create the ``_u``, ``_v`` and ``_bar`` parameters for the wrapped layer."""
    layer = self.module
    # Weight taken from the wrapped layer.
    w = getattr(layer, self.name)

    # Flatten the weight into a matrix, keeping only the first axis.
    height = w.data.shape[0]
    width = w.view(height, -1).data.shape[1]

    # Random vectors drawn from an isotropic (standard normal) distribution.
    vectors = [
        Parameter(w.data.new(length).normal_(0, 1), requires_grad=False)
        for length in (height, width)
    ]
    for vec in vectors:
        vec.data = tl.l2normalize(vec.data)
    u, v = vectors

    w_bar = Parameter(w.data)

    # Remove the original weight entry.
    del layer._parameters[self.name]

    # Store the vectors and the weight copy on the layer as parameters.
    layer.register_parameter(self.name + "_u", u)
    layer.register_parameter(self.name + "_v", v)
    layer.register_parameter(self.name + "_bar", w_bar)
def _make_params(self):
    """
    Initialize the parameters (no need to change).

    u: random vector sampled from an isotropic distribution, one entry per
       row of the weight.
    v: random vector sampled from an isotropic distribution, one entry per
       column of the weight flattened to 2-D.
    w: weight of the current layer, re-registered under ``<name>_bar``.
    """
    attr = self.name
    w = getattr(self.module, attr)

    n_rows = w.data.shape[0]
    n_cols = w.view(n_rows, -1).data.shape[1]

    u = Parameter(w.data.new(n_rows).normal_(0, 1), requires_grad=False)
    v = Parameter(w.data.new(n_cols).normal_(0, 1), requires_grad=False)

    unit_u = l2normalize(u.data)
    u.data = unit_u
    unit_v = l2normalize(v.data)
    v.data = unit_v

    w_bar = Parameter(w.data)

    self.module._parameters.pop(attr)
    self.module.register_parameter(f"{attr}_u", u)
    self.module.register_parameter(f"{attr}_v", v)
    self.module.register_parameter(f"{attr}_bar", w_bar)
def orthonormal_init(param: nn.Parameter, n_blocks: int):
    """Fill a 2-D parameter with ``n_blocks`` stacked random orthonormal blocks.

    The first dimension is split into ``n_blocks`` blocks of
    ``param.size(0) // n_blocks`` rows. Each block is the product of two
    random orthogonal matrices (obtained by QR decomposition of Gaussian
    matrices), truncated to the smaller of the block's two dimensions.

    The new values are created with the parameter's dtype and on the
    parameter's device, so assigning them does not move the parameter.

    Args:
        param: 2-D parameter whose ``data`` is replaced.
        n_blocks: number of blocks stacked along dimension 0.

    NOTE: when ``param.size(0)`` is not divisible by ``n_blocks`` the new
    data has ``n_blocks * (param.size(0) // n_blocks)`` rows.
    """
    size0, size1 = param.size()
    size0 //= n_blocks
    size_min = min(size0, size1)

    # torch.qr is deprecated in favour of torch.linalg.qr; keep a fallback
    # for torch versions that predate torch.linalg.
    linalg = getattr(torch, "linalg", None)
    qr = linalg.qr if hasattr(linalg, "qr") else torch.qr

    init_values = []
    for _ in range(n_blocks):
        m1 = torch.randn(size0, size0, dtype=param.dtype, device=param.device)
        m2 = torch.randn(size1, size1, dtype=param.dtype, device=param.device)
        q1, r1 = qr(m1)
        q2, r2 = qr(m2)
        # Flip column signs by the sign of R's diagonal, which fixes the sign
        # ambiguity of the QR factorization.
        q1 *= torch.sign(torch.diag(r1))
        q2 *= torch.sign(torch.diag(r2))
        init_values.append(torch.mm(q1[:, :size_min], q2[:size_min, :]))

    param.data = torch.cat(init_values, dim=0)
def assign(param: nn.Parameter,
           weight: Union[str, torch.Tensor],
           trans_fn: Optional[TransFn] = None,
           allow_fail: bool = False):
    """Assign ``weight`` to ``param.data`` and mark ``param`` as handled.

    ``to_params`` and ``get_weight`` are resolved from the surrounding scope.

    Args:
        param: target parameter; it must be one of the values in ``to_params``.
        weight: either a tensor, or a name that is looked up with ``get_weight``.
        trans_fn: optional transform applied to the tensor before the size
            check; its result is made contiguous.
        allow_fail: when True, a name that ``get_weight`` rejects with
            ``KeyError`` is reported with ``print`` and the assignment is
            skipped instead of raising.

    Raises:
        KeyError: the name lookup failed and ``allow_fail`` is False.
        ValueError: the tensor's size differs from ``param``'s size.
    """
    matching_keys = (key for key, candidate in to_params.items() if candidate is param)
    param_key = next(matching_keys)
    # Delete regardless of whether the weight exists.
    del to_params[param_key]

    if isinstance(weight, str):
        try:
            weight = get_weight(weight)
        except KeyError:
            if not allow_fail:
                raise
            print(f"Weight {weight} not found in checkpoint")
            return

    if trans_fn is not None:
        weight = trans_fn(weight).contiguous()

    expected, actual = param.size(), weight.size()
    if expected != actual:
        raise ValueError(f"Expected size {expected}, "
                         f"actual size {actual}")

    param.data = weight
def split_it(self, A, n_to_fix, n_to_learn, zero_fixed_part=False):
    """Split a weight matrix into a fixed part and a learnable part.

    The first ``n_to_fix`` rows become the fixed Parameter and the rest the
    learnable Parameter. When either count is 0 the whole matrix goes to the
    other part.

    Args:
        A: tensor to split along its first dimension.
        n_to_fix: number of leading rows that are not trained.
        n_to_learn: number of trained rows; only compared against 0.
        zero_fixed_part: when True and ``n_to_fix > 0``, the fixed part's
            data is multiplied by 0.

    Returns:
        A tuple ``(parts, fixed, learn)``: ``parts`` lists the Parameters
        that exist (fixed first), and ``fixed`` / ``learn`` are the
        individual Parameters or None.
    """
    fixed_part = None
    learn_part = None

    if n_to_fix == 0:
        # Everything is learnable.
        learn_part = Parameter(A)
    elif n_to_learn == 0:
        # Everything is fixed.
        fixed_part = Parameter(A)
    else:
        fixed_part = Parameter(A[:n_to_fix])
        learn_part = Parameter(A[n_to_fix:])

    if learn_part is not None:
        learn_part.requires_grad = True

    if fixed_part is not None:
        if zero_fixed_part and n_to_fix > 0:
            fixed_part.data = 0 * fixed_part.data
        fixed_part.requires_grad = False

    pieces = [part for part in (fixed_part, learn_part) if part is not None]
    return pieces, fixed_part, learn_part
def _init_param_attributes(self, p: Parameter) -> None:
    """
    Attach the per-parameter bookkeeping attributes used by this module.

    We manage several attributes on each Parameter instance. The first two
    are set by :func:`_shard_parameters_`:

        ``_is_sharded``: ``True`` if the Parameter is sharded or ``False``
            if the Parameter is intentionally not sharded (in which case we
            will all-reduce grads for this param).
        ``_orig_size``: the size of the original Parameter (before sharding)

    The remaining attributes are set here:

        ``_fp32_shard``: a single shard of the parameters in full precision
            (typically FP32, but this is dependent on the dtype of the model
            as it's passed in by the user). This can be on CPU or GPU
            depending on the value of *``cpu_offload``*.
        ``_fp16_shard``: if *``mixed_precision``* is ``True``, this will be
            a single shard of the parameters in FP16, used for all-gather.
            Otherwise it is ``None``.
        ``_full_param_padded``: the full weight (padded to be evenly
            divisible by ``world_size``), used for computation in the
            forward and backward pass. This will be resized in place and
            only materialized (via all-gather) as needed. Only set when
            ``_is_sharded`` is true.
        ``_cpu_grad``: a pinned CPU tensor shaped like the parameter's data,
            only set when *``move_grads_to_cpu``* is enabled.

    Calling this again on a parameter that already has ``_fp32_shard`` is a
    no-op.

    Args:
        p: the parameter to annotate; it must already carry ``_is_sharded``
            and ``_orig_size``.
    """
    assert hasattr(p, "_is_sharded") and hasattr(p, "_orig_size")
    # Already initialized: nothing to do.
    if hasattr(p, "_fp32_shard"):
        return

    # Compute device defaults to CUDA when *cpu_offload* is enabled, or the
    # param's current device otherwise (could be CPU).
    compute_device = torch.device("cuda") if self.cpu_offload else p.device

    # A single shard of the parameters in full precision.
    p._fp32_shard = p.data

    # NOTE(review): the nesting below was reconstructed from a collapsed
    # source; the ``else`` is paired with ``if self.mixed_precision`` and the
    # cpu_offload handling sits inside that branch -- confirm.
    if self.mixed_precision:
        assert p._fp32_shard.dtype == torch.float32

        if self.cpu_offload:
            assert p._fp32_shard.device == torch.device("cpu")
            # If we plan to keep the FP32 parameters on CPU, then pinning
            # memory allows us to later use non-blocking transfers when moving
            # the FP32 param shard to compute_device.
            p._fp32_shard = p._fp32_shard.pin_memory()
            # Point the parameter at the pinned tensor.
            p.data = p._fp32_shard

        # In mixed precision mode, we maintain a reduced precision
        # (typically FP16) parameter shard on compute_device for performing
        # the computation in the forward/backward pass. We resize the
        # storage to size 0 at init (here) and re-materialize (by copying
        # from _fp32_shard) as needed.
        p._fp16_shard = torch.zeros_like(p._fp32_shard, device=compute_device, dtype=self.compute_dtype)
        free_storage_(p._fp16_shard)
    else:
        p._fp16_shard = None  # use _fp32_shard

    # We also maintain a full-sized parameter of type self.compute_dtype
    # (FP16 for mixed_precision or FP32 otherwise). We resize the
    # storage to size 0 at init (here) and only materialize as needed. The
    # storage may contain padding elements so that it is evenly divisible by
    # world_size, although these padding elements will be removed before the
    # relevant computation.
    if p._is_sharded:
        p._full_param_padded = torch.zeros(p.data.numel() * self.world_size, device=compute_device, dtype=self.compute_dtype)
        free_storage_(p._full_param_padded)

    if self.move_grads_to_cpu:
        # We can optionally move the grad shard to CPU during the backward
        # pass. In this case, it's important to pre-allocate the CPU grad
        # shard in pinned memory so that we can do a non-blocking transfer.
        p._cpu_grad = torch.zeros_like(p.data, device="cpu").pin_memory()
def attack(model, criterion, img, label, eps, attack_type, iters, clean_clean_img=None):
    """Craft an adversarial version of ``img`` by signed-gradient ascent.

    ``model`` must be in eval mode. The module-level names ``aug_test``,
    ``aug_test_lambda`` and ``normalize`` are read from the surrounding file.

    Args:
        model: network under attack.
        criterion: loss whose gradient with respect to the input is followed.
        img: clean input batch; it is cloned, not modified.
        label: targets passed to ``criterion``.
        eps: perturbation budget. It sets the step size unless
            ``attack_type`` is ``'pgd'``, and bounds the per-element
            deviation from ``img`` for ``'pgd'``.
        attack_type: ``'fgsm'`` runs a single iteration; ``'pgd'`` uses a
            step of 2/255 and projects back into the eps-box around ``img``;
            ``'mim'`` accumulates mean-normalized gradients across
            iterations; any other value takes plain signed-gradient steps of
            size ``eps / iters``.
        iters: number of iterations (ignored for ``'fgsm'``).
        clean_clean_img: batch mixed into the input when ``aug_test`` is set;
            only indexed in that case.

    Returns:
        The adversarial batch, detached, clamped to [0, 1].
    """
    assert not model.training

    adv = Parameter(img.clone().detach(), requires_grad=True)

    iterations = 1 if attack_type == 'fgsm' else iters
    step = 2 / 255 if attack_type == 'pgd' else eps / iterations

    noise = 0
    for _step in range(iterations):
        if aug_test is None:
            out_adv = model(normalize(adv.clone()))
        else:
            # Average the model output over `aug_test` forward passes, each
            # on an input mixed with a random permutation of the clean batch.
            outputs = None
            adv_aux = adv * (1.0 - aug_test_lambda)
            # TODO Check why this uses so much memory
            for _draw in range(aug_test):
                shuffled = clean_clean_img[torch.randperm(label.size(0))]
                adv_aux = adv_aux + aug_test_lambda * shuffled
                out = model(normalize(adv_aux))
                if outputs is None:
                    outputs = out
                else:
                    outputs += out
            out_adv = outputs / aug_test

        loss = criterion(out_adv, label)
        loss.backward()

        if attack_type == 'mim':
            # Normalize the gradient by its mean absolute value, reduced
            # over dims 1, 2 and 3 in turn, then accumulate it.
            grad_scale = adv.grad
            for axis in (1, 2, 3):
                grad_scale = torch.mean(torch.abs(grad_scale), dim=axis, keepdim=True)
            adv.grad = adv.grad / grad_scale
            noise = noise + adv.grad
        else:
            assert adv.grad is not None
            noise = adv.grad

        # Optimization step
        adv.data = adv.data + step * noise.sign()

        if attack_type == 'pgd':
            # Project back into [img - eps, img + eps].
            upper = img.data + eps
            adv.data = torch.where(adv.data > upper, upper, adv.data)
            lower = img.data - eps
            adv.data = torch.where(adv.data < lower, lower, adv.data)

        adv.data.clamp_(0.0, 1.0)
        adv.grad.data.zero_()

    return adv.detach()
def _compress_module_param_dim(
    param: Parameter,
    target_dim: int,
    idxs_to_keep: Tensor,
    module: Optional[Module] = None,
    optimizer: Optional[Optimizer] = None,
):
    """Shrink ``param`` along one dimension, keeping only selected entries.

    The parameter's data, its gradient (if present) and its optimizer
    ``momentum_buffer`` (if present) are indexed with ``idxs_to_keep``. When
    ``module`` is given, its size attributes (``num_features``,
    ``out_features`` / ``in_features``, ``out_channels`` / ``in_channels``,
    ``groups``) and ``running_mean`` / ``running_var`` are updated to match.

    The parameter is left untouched when:
      * the target dimension has size 1 while more than one index is given
        (depth-wise conv), or
      * the target dimension's size is not a multiple of
        ``idxs_to_keep.size(0)``; a debug message is logged.

    Args:
        param: parameter to compress in place.
        target_dim: dimension to compress (0 or 1); forced to 0 for 1-D
            parameters.
        idxs_to_keep: index tensor applied along ``target_dim``. When the
            dimension is an integer multiple of its length, each entry is
            repeated that many times first.
        module: optional module owning ``param``.
        optimizer: optional optimizer whose state for ``param`` is updated.
    """
    if param.dim() == 1:
        target_dim = 0

    if param.size(target_dim) == 1 and idxs_to_keep.numel() > 1:  # DW Conv
        return

    if param.size(target_dim) % idxs_to_keep.size(0) != 0:
        _LOGGER.debug(
            "skipping compression of parameter due to shape incompatibility")
        # Actually skip: falling through would index with mismatched shapes.
        return

    stride = param.data.size(target_dim) // idxs_to_keep.size(0)
    if stride > 1:
        idxs_to_keep = idxs_to_keep.reshape(-1, 1).expand(-1, stride).reshape(-1)

    def _keep(tensor):
        # Apply the selection along the target dimension.
        if target_dim == 0:
            return tensor[idxs_to_keep, ...]
        return tensor[:, idxs_to_keep, ...]

    param.data = _keep(param.data)

    if param.grad is not None:
        param.grad = _keep(param.grad)

    if (optimizer is not None and param in optimizer.state
            and ("momentum_buffer" in optimizer.state[param])):
        optimizer.state[param]["momentum_buffer"] = _keep(
            optimizer.state[param]["momentum_buffer"])

    # update module attrs
    if module is not None:
        # Batch Norm
        if param.dim() == 1:
            if hasattr(module, "num_features"):
                module.num_features = param.size(0)
            # BN running mean and var are not stored as Parameters so we must
            # update them here
            if hasattr(module, "running_mean") and (module.running_mean.size(0)
                                                    == idxs_to_keep.size(0)):
                module.running_mean = module.running_mean[idxs_to_keep]
            if hasattr(module, "running_var") and (module.running_var.size(0)
                                                   == idxs_to_keep.size(0)):
                module.running_var = module.running_var[idxs_to_keep]
        # Linear
        elif target_dim == 0 and hasattr(module, "out_features"):
            module.out_features = param.size(0)
        elif target_dim == 1 and hasattr(module, "in_features"):
            module.in_features = param.size(1)
        # Conv
        elif target_dim == 0 and hasattr(module, "out_channels"):
            module.out_channels = param.size(0)
        elif target_dim == 1 and hasattr(module, "in_channels"):
            module.in_channels = param.size(1)

        if (hasattr(module, "groups") and module.groups > 1
                and (hasattr(module, "out_channels")
                     and hasattr(module, "in_channels"))):
            module.groups = param.size(0) // param.size(1)
def clamp_min_parameter(parameter: nn.Parameter, min_value: float) -> None:
    """Raise every element of ``parameter`` below ``min_value`` up to ``min_value``.

    The parameter's ``data`` is replaced with the clamped tensor; nothing is
    returned.
    """
    clamped = parameter.data.clamp(min=min_value)
    parameter.data = clamped