def setUp_clipped_model(self, clip_value=0.003, run_clipper_step=True):
    # Deep copy
    self.clipped_model = SampleConvNet()  # create the structure
    self.clipped_model.load_state_dict(self.original_model.state_dict())  # fill it

    # Intentionally clipping to a very small value
    norm_clipper = (
        ConstantFlatClipper(clip_value)
        if not isinstance(clip_value, list)
        else ConstantPerLayerClipper(clip_value)
    )
    self.clipper = PerSampleGradientClipper(self.clipped_model, norm_clipper)

    for x, y in self.dl:
        logits = self.clipped_model(x)
        loss = self.criterion(logits, y)
        loss.backward()  # puts grad in self.clipped_model.parameters()
        if run_clipper_step:
            self.clipper.clip_and_accumulate()
            self.clipper.pre_step()

    self.clipped_grads_norms = torch.stack(
        [p.grad.norm() for p in self.clipped_model.parameters() if p.requires_grad],
        dim=-1,
    )
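# Hedged, standalone illustration (torch-only; this is not the PerSampleGradientClipper
# used above): the setup intentionally clips to a tiny value so later assertions can
# observe the effect of clipping. The sketch below hand-rolls the same idea for a single
# nn.Linear layer by clipping each per-sample weight gradient to norm <= clip_value
# before averaging. All names here (manually_clipped_grad, toy_layer) are hypothetical.
import torch
import torch.nn as nn


def manually_clipped_grad(toy_layer: nn.Linear, xs: torch.Tensor, clip_value: float) -> torch.Tensor:
    """Mean of per-sample weight gradients, each clipped to norm <= clip_value."""
    clipped = []
    for x in xs:  # simulate batch_size = 1
        toy_layer.zero_grad()
        (toy_layer(x.unsqueeze(0)) ** 2).sum().backward()
        g = toy_layer.weight.grad.detach().clone()
        factor = torch.clamp(clip_value / (g.norm() + 1e-6), max=1.0)
        clipped.append(g * factor)
    return torch.stack(clipped).mean(dim=0)


# With clip_value = 0.003 every per-sample contribution has norm <= 0.003,
# so by the triangle inequality the aggregated gradient norm is bounded as well.
toy_layer = nn.Linear(4, 2)
agg = manually_clipped_grad(toy_layer, torch.randn(8, 4), clip_value=0.003)
assert agg.norm() <= 0.003 + 1e-4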
def _check_one_layer(self, layer, input):
    if hasattr(layer, "autograd_grad_sample_hooks"):
        raise ValueError(
            "Input layer already has hooks attached. "
            "Please provide a freshly constructed layer"
        )

    nn.init.uniform_(layer.weight)
    nn.init.uniform_(layer.bias)

    # run without DP
    output = layer(input)
    output.norm().backward()
    vanilla_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # run with DP
    clipper = PerSampleGradientClipper(layer, 999)
    output = layer(input)
    output.norm().backward()
    clipper.step()
    private_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # compare
    for vanilla_grad, private_grad in zip(vanilla_run_grads, private_run_grads):
        self.assertTrue(
            torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3)
        )
def _check_one_layer(self, layer, *args, **kwargs):
    if hasattr(layer, "autograd_grad_sample_hooks"):
        raise ValueError(
            "Input layer already has hooks attached. "
            "Please provide a freshly constructed layer"
        )

    self.validator.validate(layer)
    if hasattr(layer, "weight"):
        nn.init.uniform_(layer.weight)
    if hasattr(layer, "bias"):
        nn.init.uniform_(layer.bias)

    # run without DP
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output.norm().backward()
    vanilla_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # run with DP
    clipper = PerSampleGradientClipper(layer, 999, batch_dim=kwargs.get("batch_dim", 0))
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output.norm().backward()

    for param_name, param in layer.named_parameters():
        if param.requires_grad:
            self.assertTrue(
                hasattr(param, "grad_sample"),
                f"Per-sample gradients haven't been computed for {param_name}",
            )

    clipper.step()
    private_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # compare
    for vanilla_grad, private_grad in zip(vanilla_run_grads, private_run_grads):
        self.assertTrue(
            torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3)
        )
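# Hedged reference check (standalone, torch-only; it does not use the hooks or the
# clipper exercised above): for a layer that treats samples independently and a loss
# that sums over samples, the gradients obtained from batch_size = 1 runs add up to
# the full-batch gradient. This is the aggregation property the comparison above
# relies on. The layer and shapes below are arbitrary placeholders.
import torch
import torch.nn as nn

torch.manual_seed(0)
layer = nn.Linear(6, 3)
x = torch.randn(5, 6)

# full-batch gradient
layer.zero_grad()
(layer(x) ** 2).sum().backward()
full_grad = layer.weight.grad.detach().clone()

# hand-rolled per-sample gradients via batch_size = 1 runs
per_sample = []
for xi in x:
    layer.zero_grad()
    (layer(xi.unsqueeze(0)) ** 2).sum().backward()
    per_sample.append(layer.weight.grad.detach().clone())

assert torch.allclose(torch.stack(per_sample).sum(dim=0), full_grad, atol=1e-5)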
def _check_one_layer_with_criterion(self, layer, criterion, *args, **kwargs):
    self.validator.validate(layer)
    for name, param in layer.named_parameters():
        if ("weight" in name) or ("bias" in name):
            nn.init.uniform_(param, -1.0, 1.0)

    # run without DP
    self._run_once(layer, criterion, *args)
    vanilla_run_grads = [
        (name, p.grad.detach())
        for (name, p) in layer.named_parameters()
        if p.requires_grad
    ]

    # run with DP
    clipper = PerSampleGradientClipper(
        layer,
        ConstantFlatClipper(1e9),
        batch_first=kwargs.get("batch_first", True),
        loss_reduction=criterion.reduction,
    )
    self._run_once(layer, criterion, *args)

    for param_name, param in layer.named_parameters():
        if param.requires_grad:
            self.assertTrue(
                hasattr(param, "grad_sample"),
                f"Per-sample gradients haven't been computed for {param_name}",
            )

    clipper.clip_and_accumulate()
    clipper.pre_step()

    private_run_grads = [
        (name, p.grad.detach())
        for (name, p) in layer.named_parameters()
        if p.requires_grad
    ]

    # compare
    for (vanilla_name, vanilla_grad), (private_name, private_grad) in zip(
        vanilla_run_grads, private_run_grads
    ):
        assert vanilla_name == private_name
        self.assertTrue(
            torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3),
            f"Gradient mismatch. Parameter: {layer}.{vanilla_name}, loss: {criterion.reduction}",
        )

    clipper.close()
def setUp_clipped_model(self, clip_value=0.003, run_clipper_step=True):
    # Deep copy
    self.clipped_model = SampleConvNet()  # create the structure
    self.clipped_model.load_state_dict(self.original_model.state_dict())  # fill it

    # Intentionally clipping to a very small value
    self.clipper = PerSampleGradientClipper(self.clipped_model, clip_value)

    for x, y in self.dl:
        logits = self.clipped_model(x)
        loss = self.criterion(logits, y)
        loss.backward()  # puts grad in self.clipped_model.parameters()
        if run_clipper_step:
            self.clipper.step()

    self.clipped_grads_norms = torch.stack(
        [p.grad.norm() for p in self.clipped_model.parameters()], dim=-1
    )
def _check_one_layer_with_criterion(self, layer, criterion, data, batch_first=True):
    clipper = PerSampleGradientClipper(
        layer,
        ConstantFlatClipper(1e9),
        batch_first=batch_first,
        loss_reduction=criterion.reduction,
    )
    self._run_once(layer, criterion, data)

    computed_sample_grads = {}
    for (param_name, param) in layer.named_parameters():
        computed_sample_grads[param_name] = param.grad_sample.detach()

    clipper.clip_and_accumulate()
    clipper.pre_step()
    clipper.close()

    batch_dim = 0 if batch_first else 1
    data = data.transpose(0, batch_dim)
    for i, sample in enumerate(data):
        # simulate batch_size = 1
        sample_data = sample.unsqueeze(batch_dim)
        self._run_once(layer, criterion, sample_data)

        for (param_name, param) in layer.named_parameters():
            # grad we just computed with batch_size = 1
            vanilla_per_sample_grad = param.grad
            # i-th row of the grad_sample computed before
            computed_per_sample_grad = computed_sample_grads[param_name][i]
            self.assertTrue(
                torch.allclose(
                    vanilla_per_sample_grad,
                    computed_per_sample_grad,
                    atol=10e-5,
                    rtol=10e-3,
                ),
                f"Gradient mismatch. Parameter: {layer}.{param_name}, loss: {criterion.reduction}",
            )
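# Hedged shape note on the transpose/unsqueeze dance above: for batch_first=False
# inputs (e.g. sequence tensors shaped (T, B, F)) the batch dimension is dim 1, so
# the data is transposed to put samples on dim 0 for iteration, and each sample gets
# its batch dimension of size 1 restored with unsqueeze(batch_dim) before the
# single-sample run. The tensors below are illustrative placeholders only.
import torch

seq = torch.randn(7, 4, 16)            # (T=7, B=4, F=16), batch_first=False
batch_dim = 1
samples = seq.transpose(0, batch_dim)  # (B=4, T=7, F=16): iterate over dim 0
one = samples[0].unsqueeze(batch_dim)  # (T=7, 1, F=16): batch_size = 1 again
assert one.shape == (7, 1, 16)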
def _check_one_layer(self, layer, *args, **kwargs):
    if hasattr(layer, "autograd_grad_sample_hooks"):
        raise ValueError(
            "Input layer already has hooks attached. "
            "Please provide a freshly constructed layer"
        )

    self.validator.validate(layer)
    for name, param in layer.named_parameters():
        if ("weight" in name) or ("bias" in name):
            nn.init.uniform_(param, -1.0, 1.0)

    # run without DP
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output.norm().backward()
    vanilla_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # run with DP
    clipper = PerSampleGradientClipper(
        layer, ConstantFlatClipper(1999), kwargs.get("batch_first", True)
    )
    # The test outcome is sensitive to the threshold here. This test verifies that
    # our backward hooks populate per-sample gradients correctly and that, when
    # aggregated, they agree with the aggregated gradients from vanilla PyTorch.
    # The clipper currently does both clipping and aggregation, whereas we only
    # want the latter. As a workaround, we set a very high clipping threshold so
    # that clipping effectively becomes a no-op.
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output.norm().backward()

    for param_name, param in layer.named_parameters():
        if param.requires_grad:
            self.assertTrue(
                hasattr(param, "grad_sample"),
                f"Per-sample gradients haven't been computed for {param_name}",
            )

    clipper.clip_and_accumulate()
    clipper.pre_step()
    private_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # compare
    for vanilla_grad, private_grad in zip(vanilla_run_grads, private_run_grads):
        self.assertTrue(
            torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3)
        )
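# Hedged numeric aside on the threshold comment above, using the standard DP-SGD
# clipping rule min(1, C / ||g_i||) as an assumption (the clipper used in the test
# may differ in detail): with C = 1999 and per-sample gradient norms of order 1 the
# scaling factor is exactly 1, so clipping reduces to plain per-sample aggregation,
# which is the behaviour this comparison actually wants to exercise.
import torch

grad_norms = torch.tensor([0.5, 1.3, 7.2])            # typical per-sample norms
factors = torch.clamp(1999.0 / grad_norms, max=1.0)   # min(1, C / ||g_i||)
assert torch.all(factors == 1.0)                       # clipping is a no-op here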