Example #1
    def test_dp_model_inspector_example(self):
        # IMPORTANT: When changing this code you also need to update
        # the docstring for torchdp.dp_model_inspector.DPModelInspector.validate()

        inspector = DPModelInspector()
        valid_model = nn.Linear(16, 32)
        is_valid = inspector.validate(valid_model)
        self.assertTrue(is_valid)

        invalid_model = nn.BatchNorm1d(2)
        with self.assertRaises(IncompatibleModuleException):
            is_valid = inspector.validate(invalid_model)
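
These snippets come from the torchdp test suite and omit their import header. A minimal set of imports that would make them runnable looks roughly like the following; the exact module paths are an assumption based on the torchdp package layout (only torchdp.dp_model_inspector is confirmed by the comment in Example #1) and may differ between library versions.

# Assumed imports for the examples in this section. The torchdp module paths
# below are a best guess and may need adjusting for your version of the library.
import unittest

import torch
import torch.nn as nn

from torchdp.dp_model_inspector import DPModelInspector, IncompatibleModuleException
from torchdp.per_sample_gradient_clip import PerSampleGradientClipper
from torchdp.utils.clipping import ConstantFlatClipper
from torchdp.layers import DPMultiheadAttention, SequenceBias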
Example #2
    def setUp(self):
        self.validator = DPModelInspector()
Example #3
class LayersGradTest(unittest.TestCase):
    def setUp(self):
        self.validator = DPModelInspector()

    def _reset_seeds(self):
        torch.manual_seed(1337)
        torch.cuda.manual_seed(1337)

    def _check_one_layer(self, layer, *args, **kwargs):
        if hasattr(layer, "autograd_grad_sample_hooks"):
            raise ValueError(f"Input layer already has hooks attached."
                             f"Please provide freshly constructed layer")

        self.validator.validate(layer)
        for name, param in layer.named_parameters():
            if ("weight" in name) or ("bias" in name):
                nn.init.uniform_(param, -1.0, 1.0)

        # run without DP
        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output.norm().backward()
        vanilla_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # run with DP
        clipper = PerSampleGradientClipper(layer, ConstantFlatClipper(1999),
                                           kwargs.get("batch_first", True))
        # The test outcome is sensitive to the threshold above. This test verifies that our
        # backward hooks populate per-sample gradients correctly and that, when aggregated,
        # they agree with the aggregated gradients from vanilla PyTorch. The clipper
        # currently does both clipping and aggregation, whereas we only want the latter.
        # As a workaround, we simply set a very high threshold so that clipping
        # effectively becomes a no-op.

        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output.norm().backward()

        for param_name, param in layer.named_parameters():
            if param.requires_grad:
                self.assertTrue(
                    hasattr(param, "grad_sample"),
                    f"Per-sample gradients hasn't been computed for {param_name}",
                )

        clipper.clip_and_accumulate()
        clipper.pre_step()

        private_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # compare
        for vanilla_grad, private_grad in zip(vanilla_run_grads,
                                              private_run_grads):
            self.assertTrue(
                torch.allclose(vanilla_grad,
                               private_grad,
                               atol=10e-5,
                               rtol=10e-3))

    def test_conv1d(self):
        x = torch.randn(64, 16, 24)
        layer = nn.Conv1d(16, 32, 3, 1)

        self._check_one_layer(layer, x)

    def test_conv2d(self):
        x = torch.randn(64, 16, 24, 24)
        layer = nn.Conv2d(16, 32, 3, 1)

        self._check_one_layer(layer, x)

    def test_linear(self):
        self._check_one_layer(nn.Linear(8, 4), torch.randn(16, 8))
        self._check_one_layer(nn.Linear(8, 4), torch.randn(16, 8, 8))

    def test_layernorm(self):
        x = torch.randn(64, 16, 24, 24)

        self._check_one_layer(nn.LayerNorm(24), x)
        self._check_one_layer(nn.LayerNorm((24, 24)), x)
        self._check_one_layer(nn.LayerNorm((16, 24, 24)), x)

    def test_groupnorm(self):
        self._check_one_layer(nn.GroupNorm(4, 16), torch.randn(64, 16, 10))
        self._check_one_layer(nn.GroupNorm(4, 16), torch.randn(64, 16, 10, 9))
        self._check_one_layer(nn.GroupNorm(4, 16),
                              torch.randn(64, 16, 10, 9, 8))

    def test_instancenorm(self):
        self._check_one_layer(nn.InstanceNorm1d(16, affine=True),
                              torch.randn(64, 16, 10))
        self._check_one_layer(nn.InstanceNorm2d(16, affine=True),
                              torch.randn(64, 16, 10, 9))
        self._check_one_layer(nn.InstanceNorm3d(16, affine=True),
                              torch.randn(64, 16, 10, 9, 8))

    def test_sequence_bias(self):
        x = torch.randn(4, 3, 2)
        layer = SequenceBias(2)

        self._check_one_layer(layer, x, batch_first=False)

    def test_multihead_attention(self):
        x = torch.randn(16, 24, 32)

        layer = DPMultiheadAttention(32, 1)
        self._check_one_layer(layer, x, x, x, batch_first=False)

        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     dropout=0.05)
        self._check_one_layer(layer, x, x, x, batch_first=False)

        layer = DPMultiheadAttention(32, 1, bias=True, add_bias_kv=True)
        self._check_one_layer(layer, x, x, x, batch_first=False)

        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     add_zero_attn=True)
        self._check_one_layer(layer, x, x, x, batch_first=False)

        q = torch.randn(16, 24, 32)
        k = torch.randn(20, 24, 28)
        v = torch.randn(20, 24, 28)
        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     add_zero_attn=True,
                                     kdim=28,
                                     vdim=28)
        self._check_one_layer(layer, q, k, v, batch_first=False)

    def test_embedding(self):
        layer = nn.Embedding(256, 100)
        x1 = torch.randint(0, 255, (128, 42)).long()
        x2 = torch.randint(0, 255, (64, )).long()
        self._check_one_layer(layer, x1)
        self._check_one_layer(layer, x2)
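
The assertions in Example #3 only check that each trainable parameter gains a grad_sample attribute after the backward pass. A quick way to see what the hooks actually record, sketched here under the same clipper API assumptions as in the example above, is to print the per-sample gradient shapes: the leading dimension is expected to match the batch size.

# Standalone sketch (not part of the original tests): constructing the clipper
# attaches the backward hooks, and after a backward pass each parameter carries
# a grad_sample tensor with one gradient per sample in the batch.
layer = nn.Linear(8, 4)
clipper = PerSampleGradientClipper(layer, ConstantFlatClipper(1e9), True)

x = torch.randn(16, 8)                  # batch of 16 samples
layer(x).norm().backward()

for name, param in layer.named_parameters():
    # expected: weight -> (16, 4, 8), bias -> (16, 4)
    print(name, tuple(param.grad_sample.shape))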
Example #4
class LayersGradTest(unittest.TestCase):
    def setUp(self):
        self.validator = DPModelInspector()

    def _reset_seeds(self):
        torch.manual_seed(1337)
        torch.cuda.manual_seed(1337)

    def _check_one_layer(self, layer, *args, **kwargs):
        if hasattr(layer, "autograd_grad_sample_hooks"):
            raise ValueError(f"Input layer already has hooks attached."
                             f"Please provide freshly constructed layer")

        self.validator.validate(layer)
        if hasattr(layer, "weight"):
            nn.init.uniform_(layer.weight)
        if hasattr(layer, "bias"):
            nn.init.uniform_(layer.bias)

        # run without DP
        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output.norm().backward()
        vanilla_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # run with DP
        clipper = PerSampleGradientClipper(layer,
                                           999,
                                           batch_dim=kwargs.get(
                                               "batch_dim", 0))
        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output.norm().backward()

        for param_name, param in layer.named_parameters():
            if param.requires_grad:
                self.assertTrue(
                    hasattr(param, "grad_sample"),
                    f"Per-sample gradients hasn't been computed for {param_name}",
                )

        clipper.step()

        private_run_grads = [
            p.grad.detach().clone() for p in layer.parameters()
            if p.requires_grad
        ]

        # compare
        for vanilla_grad, private_grad in zip(vanilla_run_grads,
                                              private_run_grads):
            self.assertTrue(
                torch.allclose(vanilla_grad,
                               private_grad,
                               atol=10e-5,
                               rtol=10e-3))

    def test_conv1d(self):
        x = torch.randn(64, 16, 24)
        layer = nn.Conv1d(16, 32, 3, 1)

        self._check_one_layer(layer, x)

    def test_conv2d(self):
        x = torch.randn(64, 16, 24, 24)
        layer = nn.Conv2d(16, 32, 3, 1)

        self._check_one_layer(layer, x)

    def test_linear(self):
        self._check_one_layer(nn.Linear(8, 4), torch.randn(16, 8))
        self._check_one_layer(nn.Linear(8, 4), torch.randn(16, 8, 8))

    def test_layernorm(self):
        x = torch.randn(64, 16, 24, 24)

        self._check_one_layer(nn.LayerNorm(24), x)
        self._check_one_layer(nn.LayerNorm((24, 24)), x)
        self._check_one_layer(nn.LayerNorm((16, 24, 24)), x)

    def test_groupnorm(self):
        self._check_one_layer(nn.GroupNorm(4, 16), torch.randn(64, 16, 10))
        self._check_one_layer(nn.GroupNorm(4, 16), torch.randn(64, 16, 10, 9))
        self._check_one_layer(nn.GroupNorm(4, 16),
                              torch.randn(64, 16, 10, 9, 8))

    def test_instancenorm(self):
        self._check_one_layer(nn.InstanceNorm1d(16, affine=True),
                              torch.randn(64, 16, 10))
        self._check_one_layer(nn.InstanceNorm2d(16, affine=True),
                              torch.randn(64, 16, 10, 9))
        self._check_one_layer(nn.InstanceNorm3d(16, affine=True),
                              torch.randn(64, 16, 10, 9, 8))

    def test_sequence_bias(self):
        x = torch.randn(4, 3, 2)
        layer = SequenceBias(2)

        self._check_one_layer(layer, x, batch_dim=1)

    def test_multihead_attention(self):
        x = torch.randn(16, 24, 32)

        layer = DPMultiheadAttention(32, 1)
        self._check_one_layer(layer, x, x, x, batch_dim=1)

        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     dropout=0.05)
        self._check_one_layer(layer, x, x, x, batch_dim=1)

        layer = DPMultiheadAttention(32, 1, bias=True, add_bias_kv=True)
        self._check_one_layer(layer, x, x, x, batch_dim=1)

        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     add_zero_attn=True)
        self._check_one_layer(layer, x, x, x, batch_dim=1)

        q = torch.randn(16, 24, 32)
        k = torch.randn(20, 24, 28)
        v = torch.randn(20, 24, 28)
        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     add_zero_attn=True,
                                     kdim=28,
                                     vdim=28)
        self._check_one_layer(layer, q, k, v, batch_dim=1)

    def test_embedding(self):
        layer = nn.Embedding(256, 100)
        x1 = torch.randint(0, 255, (128, 42)).long()
        x2 = torch.randint(0, 255, (64, )).long()
        self._check_one_layer(layer, x1)
        self._check_one_layer(layer, x2)
Example #5
class LayersGradTest(unittest.TestCase):
    def setUp(self):
        self.validator = DPModelInspector()

    def _reset_seeds(self):
        torch.manual_seed(1337)
        torch.cuda.manual_seed(1337)

    def _run_once(self, layer, criterion, *args):
        self._reset_seeds()
        layer.zero_grad()
        output = layer(*args)
        if isinstance(output, tuple):
            output = output[0]
        output = output.squeeze()

        y = torch.zeros_like(output)
        loss = criterion(output, y)
        loss.backward()

    def _check_one_layer(self, layer, *args, **kwargs):
        self._check_one_layer_with_criterion(layer,
                                             nn.L1Loss(reduction="mean"),
                                             *args, **kwargs)
        self._check_one_layer_with_criterion(layer, nn.L1Loss(reduction="sum"),
                                             *args, **kwargs)

    def _check_one_layer_with_criterion(self, layer, criterion, *args,
                                        **kwargs):
        self.validator.validate(layer)
        for name, param in layer.named_parameters():
            if ("weight" in name) or ("bias" in name):
                nn.init.uniform_(param, -1.0, 1.0)

        # run without DP
        self._run_once(layer, criterion, *args)
        vanilla_run_grads = [(name, p.grad.detach().clone())
                             for (name, p) in layer.named_parameters()
                             if p.requires_grad]

        # run with DP
        clipper = PerSampleGradientClipper(layer,
                                           ConstantFlatClipper(1e9),
                                           batch_first=kwargs.get(
                                               "batch_first", True),
                                           loss_reduction=criterion.reduction)
        self._run_once(layer, criterion, *args)

        for param_name, param in layer.named_parameters():
            if param.requires_grad:
                self.assertTrue(
                    hasattr(param, "grad_sample"),
                    f"Per-sample gradients haven't been computed for {param_name}",
                )

        clipper.clip_and_accumulate()
        clipper.pre_step()

        private_run_grads = [(name, p.grad.detach().clone())
                             for (name, p) in layer.named_parameters()
                             if p.requires_grad]

        # compare
        for (vanilla_name, vanilla_grad), (private_name, private_grad) in zip(
                vanilla_run_grads, private_run_grads):
            assert vanilla_name == private_name

            self.assertTrue(
                torch.allclose(vanilla_grad,
                               private_grad,
                               atol=10e-5,
                               rtol=10e-3),
                f"Gradient mismatch. Parameter: {layer}.{vanilla_name}, loss: {criterion.reduction}",
            )

        clipper.close()

    def test_conv1d(self):
        x = torch.randn(64, 16, 24)
        layer = nn.Conv1d(16, 32, 3, 1)

        self._check_one_layer(layer, x)

    def test_conv2d(self):
        x = torch.randn(64, 16, 24, 24)
        layer = nn.Conv2d(16, 32, 3, 1)

        self._check_one_layer(layer, x)

    def test_linear(self):
        self._check_one_layer(nn.Linear(8, 4), torch.randn(16, 8))
        self._check_one_layer(nn.Linear(8, 4), torch.randn(16, 8, 8))

    def test_layernorm(self):
        x = torch.randn(64, 16, 24, 24)

        self._check_one_layer(nn.LayerNorm(24), x)
        self._check_one_layer(nn.LayerNorm((24, 24)), x)
        self._check_one_layer(nn.LayerNorm((16, 24, 24)), x)

    def test_groupnorm(self):
        self._check_one_layer(nn.GroupNorm(4, 16), torch.randn(64, 16, 10))
        self._check_one_layer(nn.GroupNorm(4, 16), torch.randn(64, 16, 10, 9))
        self._check_one_layer(nn.GroupNorm(4, 16),
                              torch.randn(64, 16, 10, 9, 8))

    def test_instancenorm(self):
        self._check_one_layer(nn.InstanceNorm1d(16, affine=True),
                              torch.randn(64, 16, 10))
        self._check_one_layer(nn.InstanceNorm2d(16, affine=True),
                              torch.randn(64, 16, 10, 9))
        self._check_one_layer(nn.InstanceNorm3d(16, affine=True),
                              torch.randn(64, 16, 10, 9, 8))

    def test_sequence_bias(self):
        x = torch.randn(4, 3, 2)
        layer = SequenceBias(2)

        self._check_one_layer(layer, x, batch_first=False)

    def test_multihead_attention(self):
        x = torch.randn(16, 24, 32)

        layer = DPMultiheadAttention(32, 1)
        self._check_one_layer(layer, x, x, x, batch_first=False)

        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     dropout=0.05)
        self._check_one_layer(layer, x, x, x, batch_first=False)

        layer = DPMultiheadAttention(32, 1, bias=True, add_bias_kv=True)
        self._check_one_layer(layer, x, x, x, batch_first=False)

        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     add_zero_attn=True)
        self._check_one_layer(layer, x, x, x, batch_first=False)

        q = torch.randn(16, 24, 32)
        k = torch.randn(20, 24, 28)
        v = torch.randn(20, 24, 28)
        layer = DPMultiheadAttention(32,
                                     1,
                                     bias=True,
                                     add_bias_kv=True,
                                     add_zero_attn=True,
                                     kdim=28,
                                     vdim=28)
        self._check_one_layer(layer, q, k, v, batch_first=False)

    def test_embedding(self):
        layer = nn.Embedding(256, 100)
        x1 = torch.randint(0, 255, (128, 42)).long()
        x2 = torch.randint(0, 255, (64, )).long()
        self._check_one_layer(layer, x1)
        self._check_one_layer(layer, x2)
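
Example #5 is the most complete variant: it checks both loss reductions, passes loss_reduction to the clipper, and detaches the hooks with clipper.close() when it is done. Condensed into a single helper, and assuming the same PerSampleGradientClipper / ConstantFlatClipper API as in that example, the recipe looks roughly like this: with an effectively infinite clipping threshold, the aggregated per-sample gradients should reproduce the gradients of a plain PyTorch backward pass.

# Sketch of the verification recipe from Example #5 (same API assumptions).
def grads_match(layer, x, atol=1e-4, rtol=1e-2):
    criterion = nn.L1Loss(reduction="mean")

    # plain PyTorch run
    layer.zero_grad()
    out = layer(x)
    criterion(out, torch.zeros_like(out)).backward()
    vanilla = [p.grad.detach().clone() for p in layer.parameters()]

    # DP run: the hooks record per-sample gradients, and the clipper
    # aggregates them back into p.grad during pre_step()
    clipper = PerSampleGradientClipper(layer, ConstantFlatClipper(1e9),
                                       batch_first=True,
                                       loss_reduction=criterion.reduction)
    layer.zero_grad()
    out = layer(x)
    criterion(out, torch.zeros_like(out)).backward()
    clipper.clip_and_accumulate()
    clipper.pre_step()
    private = [p.grad.detach().clone() for p in layer.parameters()]
    clipper.close()

    return all(torch.allclose(v, p, atol=atol, rtol=rtol)
               for v, p in zip(vanilla, private))

# e.g. grads_match(nn.Linear(8, 4), torch.randn(16, 8)) is expected to return True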