def setUp_clipped_model(self, clip_value=0.003, run_clipper_step=True):
    # Deep copy
    self.clipped_model = SampleConvNet()  # create the structure
    self.clipped_model.load_state_dict(
        self.original_model.state_dict()
    )  # fill it

    # Intentionally clipping to a very small value
    norm_clipper = (
        ConstantFlatClipper(clip_value)
        if not isinstance(clip_value, list)
        else ConstantPerLayerClipper(clip_value)
    )
    self.clipper = PerSampleGradientClipper(
        self.clipped_model,
        norm_clipper,
    )

    for x, y in self.dl:
        logits = self.clipped_model(x)
        loss = self.criterion(logits, y)
        loss.backward()  # puts grad in self.clipped_model.parameters()
        if run_clipper_step:
            self.clipper.clip_and_accumulate()
            self.clipper.pre_step()
    self.clipped_grads_norms = torch.stack(
        [
            p.grad.norm()
            for p in self.clipped_model.parameters()
            if p.requires_grad
        ],
        dim=-1,
    )
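# For reference, the clip_value dispatch above means this one setUp helper covers both
# clipping modes exercised by the test class further down (values mirror those tests):
#
#   self.setUp_clipped_model(clip_value=0.003)        # -> ConstantFlatClipper(0.003)
#   self.setUp_clipped_model(clip_value=[0.001] * 8)  # -> ConstantPerLayerClipper([0.001] * 8)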
def _check_one_layer(self, layer, input):
    if hasattr(layer, "autograd_grad_sample_hooks"):
        raise ValueError(
            "Input layer already has hooks attached. "
            "Please provide a freshly constructed layer."
        )

    nn.init.uniform_(layer.weight)
    nn.init.uniform_(layer.bias)

    output = layer(input)
    output.norm().backward()
    vanilla_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    clipper = PerSampleGradientClipper(layer, 999)
    output = layer(input)
    output.norm().backward()
    clipper.step()
    private_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    for vanilla_grad, private_grad in zip(vanilla_run_grads, private_run_grads):
        self.assertTrue(
            torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3)
        )
def _check_one_layer(self, layer, *args, **kwargs):
    if hasattr(layer, "autograd_grad_sample_hooks"):
        raise ValueError(
            "Input layer already has hooks attached. "
            "Please provide a freshly constructed layer."
        )

    self.validator.validate(layer)
    if hasattr(layer, "weight"):
        nn.init.uniform_(layer.weight)
    if hasattr(layer, "bias"):
        nn.init.uniform_(layer.bias)

    # run without DP
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output.norm().backward()
    vanilla_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # run with DP
    clipper = PerSampleGradientClipper(
        layer, 999, batch_dim=kwargs.get("batch_dim", 0)
    )
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output.norm().backward()

    for param_name, param in layer.named_parameters():
        if param.requires_grad:
            self.assertTrue(
                hasattr(param, "grad_sample"),
                f"Per-sample gradients haven't been computed for {param_name}",
            )

    clipper.step()
    private_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # compare
    for vanilla_grad, private_grad in zip(vanilla_run_grads, private_run_grads):
        self.assertTrue(
            torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3)
        )
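# Both _check_one_layer variants above call self._reset_seeds() before each forward pass
# so that the vanilla and DP runs see identical randomness (dropout masks, etc.). Its
# definition is not part of this section; a minimal sketch of such a helper (an
# assumption, not the actual test code) would be:
def _reset_seeds(self):
    torch.manual_seed(1337)
    torch.cuda.manual_seed(1337)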
def setUp_clipped_model(self, clip_value=0.003, run_clipper_step=True):
    # Deep copy
    self.clipped_model = SampleConvNet()  # create the structure
    self.clipped_model.load_state_dict(
        self.original_model.state_dict()
    )  # fill it

    # Intentionally clipping to a very small value
    self.clipper = PerSampleGradientClipper(self.clipped_model, clip_value)

    for x, y in self.dl:
        logits = self.clipped_model(x)
        loss = self.criterion(logits, y)
        loss.backward()  # puts grad in self.clipped_model.parameters()
        if run_clipper_step:
            self.clipper.step()
    self.clipped_grads_norms = torch.stack(
        [p.grad.norm() for p in self.clipped_model.parameters()], dim=-1
    )
def _check_one_layer_with_criterion(self, layer, criterion, *args, **kwargs):
    self.validator.validate(layer)
    for name, param in layer.named_parameters():
        if ("weight" in name) or ("bias" in name):
            nn.init.uniform_(param, -1.0, 1.0)

    # run without DP
    self._run_once(layer, criterion, *args)
    vanilla_run_grads = [
        (name, p.grad.detach())
        for (name, p) in layer.named_parameters()
        if p.requires_grad
    ]

    # run with DP
    clipper = PerSampleGradientClipper(
        layer,
        ConstantFlatClipper(1e9),
        batch_first=kwargs.get("batch_first", True),
        loss_reduction=criterion.reduction,
    )
    self._run_once(layer, criterion, *args)

    for param_name, param in layer.named_parameters():
        if param.requires_grad:
            self.assertTrue(
                hasattr(param, "grad_sample"),
                f"Per-sample gradients haven't been computed for {param_name}",
            )

    clipper.clip_and_accumulate()
    clipper.pre_step()

    private_run_grads = [
        (name, p.grad.detach())
        for (name, p) in layer.named_parameters()
        if p.requires_grad
    ]

    # compare
    for (vanilla_name, vanilla_grad), (private_name, private_grad) in zip(
        vanilla_run_grads, private_run_grads
    ):
        assert vanilla_name == private_name
        self.assertTrue(
            torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3),
            f"Gradient mismatch. Parameter: {layer}.{vanilla_name}, loss: {criterion.reduction}",
        )

    clipper.close()
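# The criterion-based checks rely on self._run_once(), which is not shown in this
# section. A minimal sketch of what such a helper could do (an assumption about its
# behaviour, not the actual test code): reseed, zero gradients, run one forward pass and
# backpropagate the criterion against a fixed dummy target. This assumes a regression-style
# criterion (e.g. nn.L1Loss) whose target has the same shape as the output.
def _run_once(self, layer, criterion, *args):
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output = output.squeeze()
    y = torch.zeros_like(output)  # fixed target so vanilla and DP runs share the same loss
    loss = criterion(output, y)
    loss.backward()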
def _check_one_layer_with_criterion(self, layer, criterion, data, batch_first=True):
    clipper = PerSampleGradientClipper(
        layer,
        ConstantFlatClipper(1e9),
        batch_first=batch_first,
        loss_reduction=criterion.reduction,
    )
    self._run_once(layer, criterion, data)

    computed_sample_grads = {}
    for param_name, param in layer.named_parameters():
        computed_sample_grads[param_name] = param.grad_sample.detach()

    clipper.clip_and_accumulate()
    clipper.pre_step()
    clipper.close()

    batch_dim = 0 if batch_first else 1
    data = data.transpose(0, batch_dim)
    for i, sample in enumerate(data):
        # simulate batch_size = 1
        sample_data = sample.unsqueeze(batch_dim)
        self._run_once(layer, criterion, sample_data)

        for param_name, param in layer.named_parameters():
            # grad we just computed with batch_size = 1
            vanilla_per_sample_grad = param.grad
            # i-th row of grad_sample computed above
            computed_per_sample_grad = computed_sample_grads[param_name][i]
            self.assertTrue(
                torch.allclose(
                    vanilla_per_sample_grad,
                    computed_per_sample_grad,
                    atol=10e-5,
                    rtol=10e-3,
                ),
                f"Gradient mismatch. Parameter: {layer}.{param_name}, loss: {criterion.reduction}",
            )
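# A hypothetical call site for the per-sample check above (layer, shapes and loss here are
# illustrative assumptions, not taken from this file):
def test_linear_per_sample_grads(self):
    for reduction in ("mean", "sum"):
        self._check_one_layer_with_criterion(
            nn.Linear(8, 4),                 # freshly constructed layer
            nn.L1Loss(reduction=reduction),  # reduction is forwarded to the clipper
            torch.randn(16, 8),              # batch of 16 samples, batch_first=True
        )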
def _check_one_layer(self, layer, *args, **kwargs):
    if hasattr(layer, "autograd_grad_sample_hooks"):
        raise ValueError(
            "Input layer already has hooks attached. "
            "Please provide a freshly constructed layer."
        )

    self.validator.validate(layer)
    for name, param in layer.named_parameters():
        if ("weight" in name) or ("bias" in name):
            nn.init.uniform_(param, -1.0, 1.0)

    # run without DP
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output.norm().backward()
    vanilla_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # run with DP
    clipper = PerSampleGradientClipper(
        layer, ConstantFlatClipper(1999), kwargs.get("batch_first", True)
    )
    # The test outcome is sensitive to the threshold here. This test verifies that our
    # backward hooks populate per-sample gradients correctly and that, when aggregated,
    # they agree with the aggregated gradients from vanilla PyTorch. The clipper
    # currently does both clipping and aggregation, whereas we only want the latter.
    # As a workaround, we simply set a very high clipping threshold so that clipping
    # effectively becomes a no-op.
    self._reset_seeds()
    layer.zero_grad()
    output = layer(*args)
    if isinstance(output, tuple):
        output = output[0]
    output.norm().backward()

    for param_name, param in layer.named_parameters():
        if param.requires_grad:
            self.assertTrue(
                hasattr(param, "grad_sample"),
                f"Per-sample gradients haven't been computed for {param_name}",
            )

    clipper.clip_and_accumulate()
    clipper.pre_step()
    private_run_grads = [
        p.grad.detach().clone() for p in layer.parameters() if p.requires_grad
    ]

    # compare
    for vanilla_grad, private_grad in zip(vanilla_run_grads, private_run_grads):
        self.assertTrue(
            torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3)
        )
class PerSampleGradientClipper_test(unittest.TestCase):
    def setUp(self):
        self.DATA_SIZE = 64
        self.criterion = nn.CrossEntropyLoss()

        self.setUp_data()
        self.setUp_original_model()
        self.setUp_clipped_model(clip_value=0.003, run_clipper_step=True)

    def setUp_data(self):
        self.ds = FakeData(
            size=self.DATA_SIZE,
            image_size=(1, 35, 35),
            num_classes=10,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        )
        self.dl = DataLoader(self.ds, batch_size=self.DATA_SIZE)

    def setUp_original_model(self):
        self.original_model = SampleConvNet()
        for x, y in self.dl:
            logits = self.original_model(x)
            loss = self.criterion(logits, y)
            loss.backward()  # puts grad in self.original_model.parameters()
        self.original_grads_norms = torch.stack(
            [
                p.grad.norm()
                for p in self.original_model.parameters()
                if p.requires_grad
            ],
            dim=-1,
        )

    def setUp_clipped_model(self, clip_value=0.003, run_clipper_step=True):
        # Deep copy
        self.clipped_model = SampleConvNet()  # create the structure
        self.clipped_model.load_state_dict(
            self.original_model.state_dict()
        )  # fill it

        # Intentionally clipping to a very small value
        norm_clipper = (
            ConstantFlatClipper(clip_value)
            if not isinstance(clip_value, list)
            else ConstantPerLayerClipper(clip_value)
        )
        self.clipper = PerSampleGradientClipper(self.clipped_model, norm_clipper)

        for x, y in self.dl:
            logits = self.clipped_model(x)
            loss = self.criterion(logits, y)
            loss.backward()  # puts grad in self.clipped_model.parameters()
            if run_clipper_step:
                self.clipper.clip_and_accumulate()
                self.clipper.pre_step()
        self.clipped_grads_norms = torch.stack(
            [
                p.grad.norm()
                for p in self.clipped_model.parameters()
                if p.requires_grad
            ],
            dim=-1,
        )

    def test_clipped_grad_norm_is_smaller(self):
        """
        Test that grads are clipped and their values change
        """
        for original_layer_norm, clipped_layer_norm in zip(
            self.original_grads_norms, self.clipped_grads_norms
        ):
            self.assertLess(float(clipped_layer_norm), float(original_layer_norm))

    def test_clipped_grad_norm_is_smaller_perlayer(self):
        """
        Test that grads are clipped per layer and their values change
        """
        # there are 8 parameter sets: [bias, weight] * [conv1, conv2, fc1, fc2]
        self.setUp_clipped_model(clip_value=[0.001] * 8)
        for original_layer_norm, clipped_layer_norm in zip(
            self.original_grads_norms, self.clipped_grads_norms
        ):
            self.assertLess(float(clipped_layer_norm), float(original_layer_norm))

    def test_clipped_grad_norms_not_zero(self):
        """
        Test that grads aren't killed by clipping
        """
        allzeros = torch.zeros_like(self.clipped_grads_norms)
        self.assertFalse(torch.allclose(self.clipped_grads_norms, allzeros))

    def test_clipped_grad_norms_not_zero_per_layer(self):
        """
        Test that grads aren't killed by per-layer clipping
        """
        # there are 8 parameter sets: [bias, weight] * [conv1, conv2, fc1, fc2]
        self.setUp_clipped_model(clip_value=[0.001] * 8)
        allzeros = torch.zeros_like(self.clipped_grads_norms)
        self.assertFalse(torch.allclose(self.clipped_grads_norms, allzeros))

    def test_clipping_to_high_value_does_nothing(self):
        self.setUp_clipped_model(
            clip_value=9999, run_clipper_step=True
        )  # should be a no-op
        self.assertTrue(
            torch.allclose(self.original_grads_norms, self.clipped_grads_norms)
        )

    def test_clipping_to_high_value_does_nothing_per_layer(self):
        self.setUp_clipped_model(
            clip_value=[9999] * 8, run_clipper_step=True
        )  # should be a no-op
        self.assertTrue(
            torch.allclose(self.original_grads_norms, self.clipped_grads_norms)
        )

    def test_grad_norms_untouched_without_clip_step(self):
        """
        Test that grads are not clipped until the clipper's clip_and_accumulate()
        and pre_step() are called
        """
        self.setUp_clipped_model(clip_value=0.003, run_clipper_step=False)
        self.assertTrue(
            torch.allclose(self.original_grads_norms, self.clipped_grads_norms)
        )
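# The tests above exercise the clipper API in isolation. A rough sketch of how the same
# calls would sit inside a single training step (illustrative only; the function name and
# optimizer wiring are assumptions, not taken from this test file):
def _train_one_batch(model, clipper, optimizer, criterion, x, y):
    optimizer.zero_grad()
    loss = criterion(model(x), y)
    loss.backward()                # backward hooks populate p.grad_sample per parameter
    clipper.clip_and_accumulate()  # clip each per-sample gradient and accumulate them
    clipper.pre_step()             # write the aggregated, clipped gradients into p.grad
    optimizer.step()               # ordinary optimizer update on the clipped gradients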