def test_attack_targeted(self) -> None:
    """Targeted PGD on BasicModel matches the expected perturbed values."""
    net = BasicModel()
    x = torch.tensor([[9.0, 10.0, -6.0, -1.0]], requires_grad=True)
    attack = PGD(net)
    perturbed = attack.perturb(x, 0.2, 0.1, 3, 3, targeted=True)
    assertArraysAlmostEqual(
        torch.flatten(perturbed).tolist(),
        [9.0, 10.0, -6.0, -1.2],
        delta=0.01,
    )
def _assert_attribution(self, attribution: Tensor, expected: Tensor) -> None:
    """Assert the attribution matches |expected| elementwise within 0.5."""
    abs_expected = torch.abs(expected)
    assertArraysAlmostEqual(
        abs_expected.detach().numpy().flatten().tolist(),
        attribution.detach().numpy().flatten().tolist(),
        delta=0.5,
    )
def test_gradient_basic_2(self) -> None:
    """Computing gradients must not alter a pre-existing .grad on the input."""
    net = BasicModel()
    x = torch.tensor([[-3.0]], requires_grad=True)
    x.grad = torch.tensor([[14.0]])
    grads = compute_gradients(net, x)[0]
    assertArraysAlmostEqual(grads.squeeze(0).tolist(), [1.0], delta=0.01)
    # Verify grad attribute is not altered
    assertArraysAlmostEqual(x.grad.squeeze(0).tolist(), [14.0], delta=0.0)
def test_gradient_multiinput(self) -> None:
    """Gradients are returned for every tensor of a multi-input model."""
    net = BasicModel6_MultiTensor()
    x1 = torch.tensor([[-3.0, -5.0]], requires_grad=True)
    x2 = torch.tensor([[-5.0, 2.0]], requires_grad=True)
    grads = compute_gradients(net, (x1, x2))
    # Both inputs are expected to receive the same gradient values here.
    for grad in grads:
        assertArraysAlmostEqual(grad.squeeze(0).tolist(), [0.0, 1.0], delta=0.01)
def test_attack_nontargeted(self) -> None:
    """Non-targeted PGD on BasicModel matches the expected perturbed values."""
    net = BasicModel()
    x = torch.tensor([[2.0, -9.0, 9.0, 1.0, -3.0]])
    attack = PGD(net)
    perturbed = attack.perturb(x, 0.25, 0.1, 2, 4)
    assertArraysAlmostEqual(
        torch.flatten(perturbed).tolist(),
        [2.0, -9.0, 9.0, 1.0, -2.8],
        delta=0.01,
    )
def test_gradient_additional_args_2(self) -> None:
    """Gradient computation forwards additional args to the model."""
    net = BasicModel5_MultiArgs()
    x1 = torch.tensor([[-10.0]], requires_grad=True)
    x2 = torch.tensor([[6.0]], requires_grad=True)
    grads = compute_gradients(
        net, (x1, x2), additional_forward_args=([3, -4],)
    )
    assertArraysAlmostEqual(grads[0].squeeze(0).tolist(), [0.0], delta=0.01)
    assertArraysAlmostEqual(grads[1].squeeze(0).tolist(), [4.0], delta=0.01)
def test_gradient_target_list(self) -> None:
    """Per-sample targets supplied as a list produce per-sample gradients."""
    net = BasicModel2()
    x1 = torch.tensor([[4.0, -1.0], [3.0, 10.0]], requires_grad=True)
    x2 = torch.tensor([[2.0, -5.0], [-2.0, 1.0]], requires_grad=True)
    grads = compute_gradients(net, (x1, x2), target_ind=[0, 1])
    expected = ([1.0, 0.0, 0.0, 1.0], [-1.0, 0.0, 0.0, -1.0])
    for grad, exp in zip(grads, expected):
        assertArraysAlmostEqual(torch.flatten(grad).tolist(), exp, delta=0.01)
def test_gradient_target_tuple(self) -> None:
    """A tuple target selects one element of a 3-d input for the gradient."""
    net = BasicModel()
    inp = torch.tensor(
        [[[4.0, 2.0], [-1.0, -2.0]], [[3.0, -4.0], [10.0, 5.0]]],
        requires_grad=True,
    )
    grads = compute_gradients(net, inp, target_ind=(0, 1))[0]
    assertArraysAlmostEqual(
        torch.flatten(grads).tolist(),
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
        delta=0.01,
    )
def test_layer_gradient_output(self) -> None:
    """Layer gradients and evaluations for linear2 match expected values."""
    net = BasicModel_MultiLayer()
    x = torch.tensor([[5.0, 2.0, 1.0]], requires_grad=True)
    # Renamed from `eval` to avoid shadowing the builtin.
    grads, layer_eval = compute_layer_gradients_and_eval(
        net, net.linear2, x, target_ind=1
    )
    assertArraysAlmostEqual(grads[0].squeeze(0).tolist(), [0.0, 1.0], delta=0.01)
    assertArraysAlmostEqual(
        layer_eval[0].squeeze(0).tolist(), [26.0, 28.0], delta=0.01
    )
def test_attack_3dimensional_input(self) -> None:
    """PGD with a tuple target on a 3-d input perturbs a single element."""
    net = BasicModel()
    inp = torch.tensor(
        [[[4.0, 2.0], [-1.0, -2.0]], [[3.0, -4.0], [10.0, 5.0]]],
        requires_grad=True,
    )
    attack = PGD(net)
    perturbed = attack.perturb(inp, 0.25, 0.1, 3, (0, 1))
    assertArraysAlmostEqual(
        torch.flatten(perturbed).tolist(),
        [4.0, 2.0, -1.0, -2.0, 3.0, -3.75, 10.0, 5.0],
        delta=0.01,
    )
def _compute_attribution_batch_helper_evaluate(
    self,
    model: Module,
    inputs: TensorOrTupleOfTensorsGeneric,
    baselines: Union[None, Tensor, Tuple[Tensor, ...]] = None,
    target: Union[None, int] = None,
    additional_forward_args: Any = None,
    approximation_method: str = "gausslegendre",
) -> None:
    """Check that batched IG attribution equals per-sample attribution.

    For several internal batch sizes, attributes the whole batch at once
    and then each sample individually, asserting the per-sample rows and
    the summed convergence deltas agree.
    """
    ig = IntegratedGradients(model)
    # Normalize inputs/baselines into tuples; default baselines to zeros.
    if not isinstance(inputs, tuple):
        inputs = (inputs, )  # type: ignore
    inputs: Tuple[Tensor, ...]
    if baselines is not None and not isinstance(baselines, tuple):
        baselines = (baselines, )
    if baselines is None:
        baselines = _tensorize_baseline(inputs, _zeros(inputs))
    for internal_batch_size in [None, 10, 20]:
        # Full-batch attribution.
        attributions, delta = ig.attribute(
            inputs,
            baselines,
            additional_forward_args=additional_forward_args,
            method=approximation_method,
            n_steps=100,
            target=target,
            internal_batch_size=internal_batch_size,
            return_convergence_delta=True,
        )
        total_delta = 0.0
        # Attribute one sample at a time and compare row-by-row.
        for i in range(inputs[0].shape[0]):
            attributions_indiv, delta_indiv = ig.attribute(
                tuple(input[i:i + 1] for input in inputs),
                tuple(baseline[i:i + 1] for baseline in baselines),
                additional_forward_args=additional_forward_args,
                method=approximation_method,
                n_steps=100,
                target=target,
                internal_batch_size=internal_batch_size,
                return_convergence_delta=True,
            )
            total_delta += abs(delta_indiv).sum().item()
            for j in range(len(attributions)):
                assertArraysAlmostEqual(
                    attributions[j][i:i + 1].squeeze(0).tolist(),
                    attributions_indiv[j].squeeze(0).tolist(),
                )
        # Summed per-sample deltas should match the batched delta.
        self.assertAlmostEqual(
            abs(delta).sum().item(), total_delta, delta=0.005
        )
def test_lin_maxpool_lin_classification(self) -> None:
    """DeepLift on Linear->MaxPool->Linear matches the known attributions."""
    inputs = torch.ones(2, 4)
    baselines = torch.tensor([[1, 2, 3, 9], [4, 8, 6, 7]]).float()
    net = LinearMaxPoolLinearModel()
    dl = DeepLift(net)
    attrs, delta = dl.attribute(
        inputs, baselines, target=0, return_convergence_delta=True
    )
    assertArraysAlmostEqual(
        attrs.detach().numpy(),
        [[0.0, 0.0, 0.0, -8.0], [0.0, -7.0, 0.0, 0.0]],
    )
    assertArraysAlmostEqual(delta.detach().numpy(), [0.0, 0.0])
def test_attack_loss_defined(self) -> None:
    """FGSM with an explicit CrossEntropyLoss leaves this input unchanged."""
    net = BasicModel_MultiLayer()
    extra_input = torch.tensor([[-1.0, 2.0, 2.0]])
    x = torch.tensor([[1.0, 6.0, -3.0]])
    labels = torch.tensor([0])
    attack = FGSM(net, CrossEntropyLoss(reduction="none"))
    perturbed = attack.perturb(
        x, 0.2, labels, additional_forward_args=(extra_input,)
    )
    assertArraysAlmostEqual(
        perturbed.squeeze(0).tolist(), [1.0, 6.0, -3.0], delta=0.01
    )
def test_layer_gradient_relu_input_inplace(self) -> None:
    """Layer gradients w.r.t. the relu layer's input work with inplace ops."""
    net = BasicModel_MultiLayer(inplace=True)
    x = torch.tensor([[5.0, 2.0, 1.0]], requires_grad=True)
    # Renamed from `eval` to avoid shadowing the builtin.
    grads, layer_eval = compute_layer_gradients_and_eval(
        net, net.relu, x, target_ind=1, attribute_to_layer_input=True
    )
    assertArraysAlmostEqual(
        grads[0].squeeze(0).tolist(), [0.0, 1.0, 1.0, 1.0], delta=0.01
    )
    assertArraysAlmostEqual(
        layer_eval[0].squeeze(0).tolist(), [-2.0, 9.0, 9.0, 9.0], delta=0.01
    )
def _assert_multi_variable(
    self,
    type: str,
    approximation_method: str = "gausslegendre",
    multiply_by_inputs: bool = True,
) -> None:
    """Check IG attributions on two two-input models and their invariance.

    Computes attributions for BasicModel2 and BasicModel3 with the same
    inputs/baselines; for the "vanilla" attribution type the individual
    values are checked, and the summed attributions of both models are
    required to be equal (implementation invariance).
    """
    model = BasicModel2()
    input1 = torch.tensor([3.0])
    input2 = torch.tensor([1.0], requires_grad=True)
    baseline1 = torch.tensor([0.0])
    baseline2 = torch.tensor([0.0])
    attributions1 = self._compute_attribution_and_evaluate(
        model,
        (input1, input2),
        (baseline1, baseline2),
        type=type,
        approximation_method=approximation_method,
        multiply_by_inputs=multiply_by_inputs,
    )
    if type == "vanilla":
        assertArraysAlmostEqual(
            attributions1[0].tolist(),
            [1.5] if multiply_by_inputs else [0.5],
            delta=0.05,
        )
        # NOTE(review): both branches of this conditional are [-0.5];
        # possibly one branch was meant to differ — confirm the intended
        # expectation for the multiply_by_inputs=False case.
        assertArraysAlmostEqual(
            attributions1[1].tolist(),
            [-0.5] if multiply_by_inputs else [-0.5],
            delta=0.05,
        )
    model = BasicModel3()
    attributions2 = self._compute_attribution_and_evaluate(
        model,
        (input1, input2),
        (baseline1, baseline2),
        type=type,
        approximation_method=approximation_method,
        multiply_by_inputs=multiply_by_inputs,
    )
    if type == "vanilla":
        assertArraysAlmostEqual(
            attributions2[0].tolist(),
            [1.5] if multiply_by_inputs else [0.5],
            delta=0.05,
        )
        # NOTE(review): same identical-branch conditional as above.
        assertArraysAlmostEqual(
            attributions2[1].tolist(),
            [-0.5] if multiply_by_inputs else [-0.5],
            delta=0.05,
        )
    # Verifies implementation invariance
    self.assertEqual(
        sum(attribution for attribution in attributions1),
        sum(attribution for attribution in attributions2),
    )
def test_attack_multiinput(self) -> None:
    """PGD with norm="L2" over a tuple of inputs matches expected values."""
    net = BasicModel2()
    x1 = torch.tensor([[4.0, -1.0], [3.0, 10.0]], requires_grad=True)
    x2 = torch.tensor([[2.0, -5.0], [-2.0, 1.0]], requires_grad=True)
    attack = PGD(net)
    perturbed = attack.perturb((x1, x2), 0.25, 0.1, 3, 0, norm="L2")
    expected = ([3.75, -1.0, 2.75, 10.0], [2.25, -5.0, -2.0, 1.0])
    for actual, exp in zip(perturbed, expected):
        assertArraysAlmostEqual(torch.flatten(actual).tolist(), exp, delta=0.01)
def _conductance_input_test_assert(
    self,
    model: Module,
    target_layer: Module,
    test_input: TensorOrTupleOfTensorsGeneric,
    test_neuron: Union[int, Tuple[int, ...]],
    expected_input_conductance: Union[List[float], Tuple[List[List[float]], ...]],
    additional_input: Any = None,
    multiply_by_inputs: bool = True,
) -> None:
    """Assert NeuronConductance input attributions match expected values.

    Runs the attribution with several internal batch sizes to verify that
    batching does not change the result. ``expected_input_conductance`` is
    either a flat list (single-input case) or a tuple of per-input row
    lists (multi-input case).
    """
    for internal_batch_size in (None, 5, 20):
        cond = NeuronConductance(
            model,
            target_layer,
            multiply_by_inputs=multiply_by_inputs,
        )
        # Fixed: assertEquals is a deprecated alias (removed in
        # Python 3.12); use assertEqual.
        self.assertEqual(cond.multiplies_by_inputs, multiply_by_inputs)
        attributions = cond.attribute(
            test_input,
            test_neuron,
            target=0,
            n_steps=500,
            method="gausslegendre",
            additional_forward_args=additional_input,
            internal_batch_size=internal_batch_size,
        )
        if isinstance(expected_input_conductance, tuple):
            # Multi-input case: compare each input tensor row by row.
            for i in range(len(expected_input_conductance)):
                for j in range(len(expected_input_conductance[i])):
                    assertArraysAlmostEqual(
                        attributions[i][j:j + 1].squeeze(0).tolist(),
                        expected_input_conductance[i][j],
                        delta=0.1,
                    )
        else:
            if isinstance(attributions, Tensor):
                assertArraysAlmostEqual(
                    attributions.squeeze(0).tolist(),
                    expected_input_conductance,
                    delta=0.1,
                )
            else:
                raise AssertionError(
                    "Attributions not returning a Tensor when expected.")
def test_basic_infidelity_multiple_with_batching(self) -> None:
    """Infidelity is the same regardless of the perturbation batch size."""
    inp1 = torch.tensor([3.0] * 20)
    inp2 = torch.tensor([1.0] * 20)
    expected = torch.zeros(20)
    infids = [
        self.basic_model_assert(
            BasicModel2(),
            (inp1, inp2),
            expected,
            n_perturb_samples=5,
            max_batch_size=batch_size,
        )
        for batch_size in (21, 60)
    ]
    assertArraysAlmostEqual(infids[0], infids[1], 0.01)
def test_single_input(self) -> None:
    """Permuting a constant feature yields zero attribution for it."""
    batch_size = 2
    input_size = (6, )
    constant_value = 10000

    def forward_func(x: Tensor) -> Tensor:
        return x.sum(dim=-1)

    feature_importance = FeaturePermutation(forward_func=forward_func)
    inp = torch.randn((batch_size, ) + input_size)
    # Feature 0 is identical across the batch, so permuting it changes nothing.
    inp[:, 0] = constant_value
    zeros = torch.zeros_like(inp[:, 0])
    attribs = feature_importance.attribute(inp)
    self.assertTrue(
        attribs.squeeze(0).size() == (batch_size, ) + input_size)
    assertArraysAlmostEqual(attribs[:, 0], zeros)
    # The remaining (random) features should receive nonzero attribution.
    self.assertTrue((attribs[:, 1:input_size[0]].abs() > 0).all())
def test_gradient_target_int(self) -> None:
    """Integer targets select the corresponding output's gradients."""
    net = BasicModel2()
    x1 = torch.tensor([[4.0, -1.0]], requires_grad=True)
    x2 = torch.tensor([[2.0, 5.0]], requires_grad=True)
    grads0 = compute_gradients(net, (x1, x2), target_ind=0)
    grads1 = compute_gradients(net, (x1, x2), target_ind=1)
    cases = (
        (grads0, ([1.0, 0.0], [-1.0, 0.0])),
        (grads1, ([0.0, 0.0], [0.0, 0.0])),
    )
    for grads, expected in cases:
        for grad, exp in zip(grads, expected):
            assertArraysAlmostEqual(grad.squeeze(0).tolist(), exp, delta=0.01)
def test_custom_module(self) -> None:
    """Verify interpretable-embedding patching of a custom embedding layer.

    Wraps ``embedding2`` (a TextModule) with InterpretableEmbeddingBase,
    checks that feeding precomputed embeddings reproduces the model output,
    then removes the wrapper and checks the original layer is restored.
    """
    input1 = torch.tensor([[3, 2, 0], [1, 2, 4]])
    input2 = torch.tensor([[0, 1, 0], [1, 2, 3]])
    model = BasicEmbeddingModel()
    output = model(input1, input2)
    expected = model.embedding2(input=input2)
    # in this case we make interpretable the custom embedding layer - TextModule
    interpretable_embedding = configure_interpretable_embedding_layer(
        model, "embedding2")
    actual = interpretable_embedding.indices_to_embeddings(input=input2)
    # Feeding the precomputed embeddings must reproduce the original output.
    output_interpretable_models = model(input1, actual)
    assertArraysAlmostEqual(output, output_interpretable_models)
    # using assertArraysAlmostEqual instead of assertTensorAlmostEqual because
    # it is important and necessary that each element in comparing tensors
    # match exactly.
    assertArraysAlmostEqual(expected, actual, 0.0)
    self.assertTrue(
        model.embedding2.__class__ is InterpretableEmbeddingBase)
    # Removing the wrapper must restore the original layer class.
    remove_interpretable_embedding_layer(model, interpretable_embedding)
    self.assertTrue(model.embedding2.__class__ is TextModule)
    self._assert_embeddings_equal(input2, output, interpretable_embedding)
def _ig_matching_test_assert(
    self,
    model: Module,
    output_layer: Module,
    test_input: Tensor,
    baseline: Union[None, Tensor] = None,
) -> None:
    """Neuron IG on the output layer must match plain IG per output index."""
    out = model(test_input)
    ig = IntegratedGradients(model)
    neuron_ig = NeuronIntegratedGradients(model, output_layer)
    for idx in range(out.shape[1]):
        ig_vals = ig.attribute(test_input, target=idx, baselines=baseline)
        neuron_vals = neuron_ig.attribute(test_input, (idx, ), baselines=baseline)
        assertArraysAlmostEqual(
            ig_vals.reshape(-1).tolist(),
            neuron_vals.reshape(-1).tolist(),
            delta=0.001,
        )
        self.assertEqual(neuron_vals.shape, test_input.shape)
def _assert_compare_with_emb_patching(
    self,
    input: Tensor,
    baseline: Tensor,
    additional_args: Tuple[Tensor, ...],
    multiply_by_inputs: bool = True,
) -> None:
    """LayerIntegratedGradients on embedding1 must match IG on patched embeddings.

    Computes layer IG directly, then monkey-patches the embedding layer to
    run plain IG on precomputed embeddings, and asserts both attributions
    (and, when multiplying by inputs, the deltas) agree.
    """
    model = BasicEmbeddingModel(nested_second_embedding=True)
    lig = LayerIntegratedGradients(model, model.embedding1,
                                   multiply_by_inputs=multiply_by_inputs)
    attributions, delta = lig.attribute(
        input,
        baselines=baseline,
        additional_forward_args=additional_args,
        return_convergence_delta=True,
    )
    # now let's interpret with standard integrated gradients and
    # the embeddings for monkey patching
    interpretable_embedding = configure_interpretable_embedding_layer(
        model, "embedding1")
    input_emb = interpretable_embedding.indices_to_embeddings(input)
    baseline_emb = interpretable_embedding.indices_to_embeddings(baseline)
    ig = IntegratedGradients(model, multiply_by_inputs=multiply_by_inputs)
    attributions_with_ig, delta_with_ig = ig.attribute(
        input_emb,
        baselines=baseline_emb,
        additional_forward_args=additional_args,
        target=0,
        return_convergence_delta=True,
    )
    # Restore the original embedding layer before asserting.
    remove_interpretable_embedding_layer(model, interpretable_embedding)
    assertArraysAlmostEqual(attributions, attributions_with_ig)
    if multiply_by_inputs:
        assertArraysAlmostEqual(delta, delta_with_ig)
def _assert_compare_with_layer_conductance(
    self, model: Module, input: Tensor, attribute_to_layer_input: bool = False
):
    """Layer IG should be close to layer conductance for many steps."""
    target_layer = cast(Module, model.linear2)
    # For large number of steps layer conductance and layer integrated
    # gradients become very close.
    common_kwargs = dict(
        target=0,
        n_steps=1500,
        return_convergence_delta=True,
        attribute_to_layer_input=attribute_to_layer_input,
    )
    lc = LayerConductance(model, target_layer)
    lc_attr, lc_delta = lc.attribute(input, **common_kwargs)
    lig = LayerIntegratedGradients(model, target_layer)
    lig_attr, lig_delta = lig.attribute(input, **common_kwargs)
    assertArraysAlmostEqual(lc_attr, lig_attr, 0.01)
    assertArraysAlmostEqual(lc_delta, lig_delta, 0.5)
def test_attack_random_start(self) -> None:
    """With random_start, the perturbation stays within the given radius."""
    net = BasicModel()
    x = torch.tensor([[2.0, -9.0, 9.0, 1.0, -3.0]])
    attack = PGD(net)
    perturbed = attack.perturb(x, 0.25, 0.1, 0, 4, random_start=True)
    assertArraysAlmostEqual(
        torch.flatten(perturbed).tolist(),
        [2.0, -9.0, 9.0, 1.0, -3.0],
        delta=0.25,
    )
    perturbed = attack.perturb(
        x, 0.25, 0.1, 0, 4, norm="L2", random_start=True
    )
    # The L2 distance from the original input must not exceed the radius.
    l2_norm = torch.norm((perturbed - x).squeeze()).numpy()
    self.assertLessEqual(l2_norm, 0.25)
def _assert_steps_and_alphas(
    self,
    n,
    expected_step_sizes,
    expected_step_sizes_trapezoid,
    expected_left,
    expected_right,
    expected_middle,
    expected_trapezoid,
):
    """Check step sizes and alphas of every Riemann rule for a given n."""
    # (rule, expected step sizes, expected alphas)
    cases = (
        (Riemann.left, expected_step_sizes, expected_left),
        (Riemann.right, expected_step_sizes, expected_right),
        (Riemann.middle, expected_step_sizes, expected_middle),
        (Riemann.trapezoid, expected_step_sizes_trapezoid, expected_trapezoid),
    )
    for rule, exp_steps, exp_alphas in cases:
        step_sizes_fn, alphas_fn = riemann_builders(rule)
        assertArraysAlmostEqual(exp_steps, step_sizes_fn(n))
        assertArraysAlmostEqual(exp_alphas, alphas_fn(n))
def _compute_attribution_and_evaluate(
    self,
    model: Module,
    inputs: TensorOrTupleOfTensorsGeneric,
    baselines: BaselineType = None,
    target: Union[None, int] = None,
    additional_forward_args: Any = None,
    type: str = "vanilla",
    approximation_method: str = "gausslegendre",
    multiply_by_inputs=True,
) -> Tuple[Tensor, ...]:
    r"""Compute IG (optionally via NoiseTunnel) attributions and validate them.

    type: 'vanilla', 'smoothgrad', 'smoothgrad_sq', 'vargrad'

    Returns the computed attributions after checking delta shapes,
    convergence, and agreement between runs with and without
    `return_convergence_delta`.
    """
    ig = IntegratedGradients(model, multiply_by_inputs=multiply_by_inputs)
    # Fixed: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual.
    self.assertEqual(ig.multiplies_by_inputs, multiply_by_inputs)
    # Normalize inputs/baselines into tuples; default baselines to zeros.
    if not isinstance(inputs, tuple):
        inputs = (inputs,)  # type: ignore
    inputs: Tuple[Tensor, ...]
    if baselines is not None and not isinstance(baselines, tuple):
        baselines = (baselines,)
    if baselines is None:
        baselines = _tensorize_baseline(inputs, _zeros(inputs))

    if type == "vanilla":
        attributions, delta = ig.attribute(
            inputs,
            baselines,
            additional_forward_args=additional_forward_args,
            method=approximation_method,
            n_steps=500,
            target=target,
            return_convergence_delta=True,
        )
        model.zero_grad()
        attributions_without_delta, delta = ig.attribute(
            inputs,
            baselines,
            additional_forward_args=additional_forward_args,
            method=approximation_method,
            n_steps=500,
            target=target,
            return_convergence_delta=True,
        )
        model.zero_grad()
        self.assertEqual([inputs[0].shape[0]], list(delta.shape))
        # Externally computed delta must match the one returned by attribute.
        delta_external = ig.compute_convergence_delta(
            attributions,
            baselines,
            inputs,
            target=target,
            additional_forward_args=additional_forward_args,
        )
        assertArraysAlmostEqual(delta, delta_external, 0.0)
    else:
        nt = NoiseTunnel(ig)
        n_samples = 5
        attributions, delta = nt.attribute(
            inputs,
            nt_type=type,
            nt_samples=n_samples,
            stdevs=0.00000002,
            baselines=baselines,
            target=target,
            additional_forward_args=additional_forward_args,
            method=approximation_method,
            n_steps=500,
            return_convergence_delta=True,
        )
        # The legacy `n_samples` kwarg should raise a DeprecationWarning.
        with self.assertWarns(DeprecationWarning):
            attributions_without_delta = nt.attribute(
                inputs,
                nt_type=type,
                n_samples=n_samples,
                stdevs=0.00000002,
                baselines=baselines,
                target=target,
                additional_forward_args=additional_forward_args,
                method=approximation_method,
                n_steps=500,
            )
        # Fixed: assertEquals -> assertEqual (deprecated alias).
        self.assertEqual(nt.multiplies_by_inputs, multiply_by_inputs)
        self.assertEqual([inputs[0].shape[0] * n_samples], list(delta.shape))

    for input, attribution in zip(inputs, attributions):
        self.assertEqual(attribution.shape, input.shape)
    if multiply_by_inputs:
        self.assertTrue(all(abs(delta.numpy().flatten()) < 0.07))

    # compare attributions retrieved with and without
    # `return_convergence_delta` flag
    for attribution, attribution_without_delta in zip(
        attributions, attributions_without_delta
    ):
        assertTensorAlmostEqual(
            self, attribution, attribution_without_delta, delta=0.05
        )

    return cast(Tuple[Tensor, ...], attributions)
def _assert_shap_ig_comparision(
    self,
    attributions1: Tuple[Tensor, ...],
    attributions2: Tuple[Tensor, ...],
) -> None:
    """Assert two attribution tuples agree row-by-row within 0.005."""
    # NOTE(review): "comparision" is a typo for "comparison"; the name is
    # kept unchanged because callers reference it.
    for attr_a, attr_b in zip(attributions1, attributions2):
        rows_a = attr_a.detach().numpy()
        rows_b = attr_b.detach().numpy()
        for row_a, row_b in zip(rows_a, rows_b):
            assertArraysAlmostEqual(row_a, row_b, delta=0.005)
def test_gradient_inplace(self) -> None:
    """Gradient computation works for a model using inplace operations."""
    net = BasicModel_MultiLayer(inplace=True)
    x = torch.tensor([[1.0, 6.0, -3.0]], requires_grad=True)
    grads = compute_gradients(net, x, target_ind=0)[0]
    assertArraysAlmostEqual(
        grads.squeeze(0).tolist(), [3.0, 3.0, 3.0], delta=0.01
    )
def _deeplift_assert(
    self,
    model: Module,
    attr_method: Union[DeepLift, DeepLiftShap],
    inputs: Tuple[Tensor, ...],
    baselines,
    # Annotation fixed: the default is None, so the type must admit it.
    custom_attr_func: Union[None, Callable[..., Tuple[Tensor, ...]]] = None,
) -> None:
    """Run DeepLift/DeepLiftShap repeatedly and validate its attributions.

    Checks that results are stable across runs, that delta has the right
    shape for the method, that deltas are near zero, and (when baselines
    are broadcastable to the inputs) that results agree with IG.
    """
    input_bsz = len(inputs[0])
    # Baselines may be given as a callable, optionally taking the inputs.
    if callable(baselines):
        baseline_parameters = signature(baselines).parameters
        if len(baseline_parameters) > 0:
            baselines = baselines(inputs)
        else:
            baselines = baselines()
    baseline_bsz = (
        len(baselines[0]) if isinstance(baselines[0], torch.Tensor) else 1
    )
    # Run attribution multiple times to make sure that it is
    # working as expected
    for _ in range(5):
        model.zero_grad()
        attributions, delta = attr_method.attribute(
            inputs,
            baselines,
            return_convergence_delta=True,
            custom_attribution_func=custom_attr_func,
        )
        attributions_without_delta = attr_method.attribute(
            inputs, baselines, custom_attribution_func=custom_attr_func
        )

        # Attributions must be identical with and without delta computation.
        for attribution, attribution_without_delta in zip(
            attributions, attributions_without_delta
        ):
            self.assertTrue(
                torch.all(torch.eq(attribution, attribution_without_delta))
            )

        if isinstance(attr_method, DeepLiftShap):
            # DeepLiftShap produces one delta per (input, baseline) pair.
            self.assertEqual([input_bsz * baseline_bsz], list(delta.shape))
        else:
            self.assertEqual([input_bsz], list(delta.shape))
            delta_external = attr_method.compute_convergence_delta(
                attributions, baselines, inputs
            )
            assertArraysAlmostEqual(delta, delta_external, 0.0)

        delta_condition = all(abs(delta.numpy().flatten()) < 0.00001)
        self.assertTrue(
            delta_condition,
            "The sum of attribution values {} is not "
            "nearly equal to the difference between the endpoint for "
            "some samples".format(delta),
        )
        for input, attribution in zip(inputs, attributions):
            self.assertEqual(input.shape, attribution.shape)
        if (
            isinstance(baselines[0], (int, float))
            or inputs[0].shape == baselines[0].shape
        ):
            # Compare with Integrated Gradients
            ig = IntegratedGradients(model)
            attributions_ig = ig.attribute(inputs, baselines)
            assertAttributionComparision(self, attributions, attributions_ig)