def setUp_clipped_model(self, clip_value=0.003, run_clipper_step=True):
        # Deep copy
        self.clipped_model = SampleConvNet()  # create the structure
        self.clipped_model.load_state_dict(
            self.original_model.state_dict())  # fill it

        # Intentionally clipping to a very small value
        norm_clipper = (ConstantFlatClipper(clip_value)
                        if not isinstance(clip_value, list) else
                        ConstantPerLayerClipper(clip_value))
        self.clipper = PerSampleGradientClipper(
            self.clipped_model,
            norm_clipper,
        )

        for x, y in self.dl:
            logits = self.clipped_model(x)
            loss = self.criterion(logits, y)
            loss.backward()  # puts grad in self.clipped_model.parameters()
            if run_clipper_step:
                self.clipper.clip_and_accumulate()
                self.clipper.pre_step()
        self.clipped_grads_norms = torch.stack(
            [
                p.grad.norm()
                for p in self.clipped_model.parameters() if p.requires_grad
            ],
            dim=-1,
        )
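For orientation, here is a minimal self-contained sketch of the same clip-and-accumulate workflow outside a test fixture. The import paths, the nn.Linear stand-in for SampleConvNet, the SGD optimizer, and the synthetic batch are illustrative assumptions and may differ from the actual test setup or library version:

import torch
import torch.nn as nn
# import paths are assumptions and may vary across library versions
from opacus.per_sample_gradient_clip import PerSampleGradientClipper
from opacus.utils.clipping import ConstantFlatClipper

model = nn.Linear(8, 2)                       # toy stand-in for SampleConvNet
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
clipper = PerSampleGradientClipper(model, ConstantFlatClipper(1.0))

x, y = torch.randn(16, 8), torch.randint(0, 2, (16,))
loss = criterion(model(x), y)
loss.backward()                               # hooks populate p.grad_sample per example
clipper.clip_and_accumulate()                 # clip each per-sample gradient to the flat bound
clipper.pre_step()                            # reduce the clipped per-sample grads into p.grad
optimizer.step()
clipper.close()                               # detach hooks when done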
Example #2
    def _check_one_layer(self, layer, input):
        if hasattr(layer, "autograd_grad_sample_hooks"):
            raise ValueError("Input layer already has hooks attached. "
                             "Please provide a freshly constructed layer.")

        nn.init.uniform_(layer.weight)
        nn.init.uniform_(layer.bias)

        output = layer(input)
        output.norm().backward()
        vanilla_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # very high clipping threshold so that clipping is effectively a no-op
        clipper = PerSampleGradientClipper(layer, 999)
        output = layer(input)
        output.norm().backward()
        clipper.step()
        private_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        for vanilla_grad, private_grad in zip(vanilla_run_grads,
                                              private_run_grads):
            self.assertTrue(
                torch.allclose(vanilla_grad,
                               private_grad,
                               atol=10e-5,
                               rtol=10e-3))
Example #3
    def _check_one_layer(self, layer, *args, **kwargs):
        if hasattr(layer, "autograd_grad_sample_hooks"):
            raise ValueError("Input layer already has hooks attached. "
                             "Please provide a freshly constructed layer.")

        self.validator.validate(layer)
        if hasattr(layer, "weight"):
            nn.init.uniform_(layer.weight)
        if hasattr(layer, "bias"):
            nn.init.uniform_(layer.bias)

        # run without DP
        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output.norm().backward()
        vanilla_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # run with DP
        # very high threshold (999) so that clipping is effectively a no-op
        clipper = PerSampleGradientClipper(layer,
                                           999,
                                           batch_dim=kwargs.get(
                                               "batch_dim", 0))
        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output.norm().backward()

        for param_name, param in layer.named_parameters():
            if param.requires_grad:
                self.assertTrue(
                    hasattr(param, "grad_sample"),
                    f"Per-sample gradients hasn't been computed for {param_name}",
                )

        clipper.step()

        private_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # compare
        for vanilla_grad, private_grad in zip(vanilla_run_grads,
                                              private_run_grads):
            self.assertTrue(
                torch.allclose(vanilla_grad,
                               private_grad,
                               atol=10e-5,
                               rtol=10e-3))
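The hasattr(param, "grad_sample") assertion above relies on the hooks storing one gradient slice per sample, with the batch dimension in front. A hedged illustration of that layout follows; the toy layer, tensor sizes, and import path are assumptions:

import torch
import torch.nn as nn
# import path is an assumption and may differ across library versions
from opacus.per_sample_gradient_clip import PerSampleGradientClipper

layer = nn.Linear(4, 3)
# same constructor as above: threshold 999 keeps clipping a no-op,
# batch_dim=0 says the batch is the leading dimension of the input
clipper = PerSampleGradientClipper(layer, 999, batch_dim=0)

layer(torch.randn(5, 4)).norm().backward()   # batch of 5 samples

# every trainable parameter now carries grad_sample: one gradient slice
# per sample, stacked along a leading batch dimension
assert layer.weight.grad_sample.shape[0] == 5
assert layer.bias.grad_sample.shape[0] == 5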
Example #4
    def _check_one_layer_with_criterion(self, layer, criterion, *args, **kwargs):
        self.validator.validate(layer)
        for name, param in layer.named_parameters():
            if ("weight" in name) or ("bias" in name):
                nn.init.uniform_(param, -1.0, 1.0)

        # run without DP
        self._run_once(layer, criterion, *args)
        vanilla_run_grads = [
            (name, p.grad.detach())
            for (name, p) in layer.named_parameters()
            if p.requires_grad
        ]

        # run with DP
        # effectively-infinite bound: we only want aggregation here, not clipping
        clipper = PerSampleGradientClipper(
            layer,
            ConstantFlatClipper(1e9),
            batch_first=kwargs.get("batch_first", True),
            loss_reduction=criterion.reduction,
        )
        self._run_once(layer, criterion, *args)

        for param_name, param in layer.named_parameters():
            if param.requires_grad:
                self.assertTrue(
                    hasattr(param, "grad_sample"),
                    f"Per-sample gradients haven't been computed for {param_name}",
                )

        clipper.clip_and_accumulate()
        clipper.pre_step()

        private_run_grads = [
            (name, p.grad.detach())
            for (name, p) in layer.named_parameters()
            if p.requires_grad
        ]

        # compare
        for (vanilla_name, vanilla_grad), (private_name, private_grad) in zip(
            vanilla_run_grads, private_run_grads
        ):
            assert vanilla_name == private_name

            self.assertTrue(
                torch.allclose(vanilla_grad, private_grad, atol=10e-5, rtol=10e-3),
                f"Gradient mismatch. Parameter: {layer}.{vanilla_name}, loss: {criterion.reduction}",
            )

        clipper.close()
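The `_run_once` helper used above is not included in these snippets. A plausible sketch is given below, assuming it only zeroes gradients, runs a forward pass, and backpropagates the criterion against a shape-matched zero target (which presumes a regression-style criterion such as nn.MSELoss and the module-level torch import the other snippets rely on):

    def _run_once(self, layer, criterion, *args):
        # Hypothetical helper, not shown in these snippets: clear stale grads,
        # run a forward pass, and backpropagate the criterion.
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        # a zero target of matching shape, assuming a regression-style
        # criterion such as nn.MSELoss / nn.L1Loss
        loss = criterion(output, torch.zeros_like(output))
        loss.backward()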
Example #5
    def setUp_clipped_model(self, clip_value=0.003, run_clipper_step=True):
        # Deep copy
        self.clipped_model = SampleConvNet()  # create the structure
        self.clipped_model.load_state_dict(
            self.original_model.state_dict())  # fill it

        # Intentionally clipping to a very small value
        self.clipper = PerSampleGradientClipper(self.clipped_model, clip_value)
        for x, y in self.dl:
            logits = self.clipped_model(x)
            loss = self.criterion(logits, y)
            loss.backward()  # puts grad in self.clipped_model.parameters()
            if run_clipper_step:
                self.clipper.step()
        self.clipped_grads_norms = torch.stack(
            [p.grad.norm() for p in self.clipped_model.parameters()], dim=-1)
Example #6
    def _check_one_layer_with_criterion(self,
                                        layer,
                                        criterion,
                                        data,
                                        batch_first=True):
        # effectively-infinite bound: clipping is a no-op, the clipper is only
        # attached here to populate grad_sample on each parameter
        clipper = PerSampleGradientClipper(layer,
                                           ConstantFlatClipper(1e9),
                                           batch_first=batch_first,
                                           loss_reduction=criterion.reduction)
        self._run_once(layer, criterion, data)

        computed_sample_grads = {}
        for (param_name, param) in layer.named_parameters():
            computed_sample_grads[param_name] = param.grad_sample.detach()

        clipper.clip_and_accumulate()
        clipper.pre_step()
        clipper.close()

        batch_dim = 0 if batch_first else 1
        data = data.transpose(0, batch_dim)
        for i, sample in enumerate(data):
            # simulate batch_size = 1
            sample_data = sample.unsqueeze(batch_dim)
            self._run_once(layer, criterion, sample_data)

            for (param_name, param) in layer.named_parameters():
                # grad we just computed with batch_size = 1
                vanilla_per_sample_grad = param.grad

                # i-th line in grad_sample computed before
                computed_per_sample_grad = computed_sample_grads[param_name][i]

                self.assertTrue(
                    torch.allclose(
                        vanilla_per_sample_grad,
                        computed_per_sample_grad,
                        atol=10e-5,
                        rtol=10e-3,
                    ),
                    f"Gradient mismatch. Parameter: {layer}.{param_name}, loss: {criterion.reduction}",
                )
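A complementary, hedged sanity check for grad_sample: with reduction="sum", the per-sample gradients should add up to the aggregated p.grad left behind by pre_step. The toy layer, synthetic data, and import paths below are assumptions:

import torch
import torch.nn as nn
# import paths are assumptions and may vary across library versions
from opacus.per_sample_gradient_clip import PerSampleGradientClipper
from opacus.utils.clipping import ConstantFlatClipper

layer = nn.Linear(4, 3)
clipper = PerSampleGradientClipper(layer,
                                   ConstantFlatClipper(1e9),  # huge bound: no clipping
                                   batch_first=True,
                                   loss_reduction="sum")
criterion = nn.MSELoss(reduction="sum")

data = torch.randn(8, 4)
criterion(layer(data), torch.zeros(8, 3)).backward()

# snapshot the per-sample gradients before the clipper consumes them
summed_sample_grads = {
    name: p.grad_sample.detach().sum(dim=0)
    for name, p in layer.named_parameters()
}

clipper.clip_and_accumulate()
clipper.pre_step()

for name, p in layer.named_parameters():
    # with reduction="sum" the per-sample gradients add up to the aggregated grad
    assert torch.allclose(summed_sample_grads[name], p.grad, atol=1e-5, rtol=1e-3)
clipper.close()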
Example #7
    def _check_one_layer(self, layer, *args, **kwargs):
        if hasattr(layer, "autograd_grad_sample_hooks"):
            raise ValueError("Input layer already has hooks attached. "
                             "Please provide a freshly constructed layer.")

        self.validator.validate(layer)
        for name, param in layer.named_parameters():
            if ("weight" in name) or ("bias" in name):
                nn.init.uniform_(param, -1.0, 1.0)

        # run without DP
        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output.norm().backward()
        vanilla_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # run with DP
        clipper = PerSampleGradientClipper(layer, ConstantFlatClipper(1999),
                                           kwargs.get('batch_first', True))
        # The test outcome is sensitive to the threshold here. This test verifies that our backward
        # hooks populate sample-specific gradients correctly and that, when aggregated, they agree
        # with the aggregated gradients from vanilla PyTorch. The clipper currently does both clipping
        # and aggregation, whereas we only want the latter. As a workaround, we simply set a very
        # high threshold for clipping so that it effectively becomes a no-op.

        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output.norm().backward()

        for param_name, param in layer.named_parameters():
            if param.requires_grad:
                self.assertTrue(
                    hasattr(param, "grad_sample"),
                    f"Per-sample gradients hasn't been computed for {param_name}",
                )

        clipper.clip_and_accumulate()
        clipper.pre_step()

        private_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # compare
        for vanilla_grad, private_grad in zip(vanilla_run_grads,
                                              private_run_grads):
            self.assertTrue(
                torch.allclose(vanilla_grad,
                               private_grad,
                               atol=10e-5,
                               rtol=10e-3))