Example #1
    def influence(  # type: ignore[override]
        self,
        inputs: Union[Tensor, Tuple[Tensor, ...]],
        top_k: int = 1,
        additional_forward_args: Optional[Any] = None,
        load_src_from_disk: bool = True,
        **kwargs: Any,
    ) -> Dict:
        r"""
        Args:
            inputs (tensor or tuple of tensors): Batch of examples for which influential
                    instances are computed. They are passed to the forward_func. The
                    first dimension in `inputs` tensor or tuple of tensors corresponds
                    to the batch size. A tuple of tensors is only passed in if this
                    is the input form that `module` accepts.
            top_k (int): The number of top-matching activations to return for each
                    example in `inputs`.
            additional_forward_args (optional):  Additional arguments that will be
                    passed to forward_func after inputs.
            load_src_from_disk (bool): Loads activations for `influence_src_dataset`
                    where possible. Setting to False would force regeneration of
                    activations.
            load_input_from_disk (bool): By default, activations for `inputs` are
                    regenerated and any previous `inputs` activations flagged with
                    `inputs_id` are removed. Setting this to True loads previously
                    stored activations that match `inputs_id` instead. Note that
                    this can lead to unexpected behavior if `inputs_id` is not
                    configured properly and activations are loaded for a different,
                    earlier `inputs`.
            inputs_id (str): Used to identify inputs for loading activations.

            **kwargs: Additional key-value arguments that are necessary for specific
                    implementation of `DataInfluence` abstract class.

        Returns:

            influences (dict): The influential instances retrieved from
            `influence_src_dataset` for each test example in `inputs` (a tensor or
            a tuple of tensors). The returned dict has one key per layer name
            passed in `layers`. Each value is a tuple containing the indices and
            values of the top k similarities from `influence_src_dataset` under
            the chosen metric: the first element holds the indices of the top k
            most similar examples, and the second holds the corresponding
            similarity scores. The batch dimension matches that of `inputs`: if
            inputs.shape[0] == 5, then dict[`layer_name`][0].shape[0] == 5. Both
            tensors have shape (inputs.shape[0], top_k).
        """
        inputs_batch_size = (inputs[0].shape[0]
                             if isinstance(inputs, tuple) else inputs.shape[0])

        influences: Dict[str, Any] = {}

        layer_AVDatasets = AV.generate_dataset_activations(
            self.activation_dir,
            self.module,
            self.model_id,
            self.layers,
            DataLoader(self.influence_src_dataset,
                       self.batch_size,
                       shuffle=False),
            identifier="src",
            load_from_disk=load_src_from_disk,
            return_activations=True,
        )
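        # `layer_AVDatasets` holds one activation dataset per requested layer;
        # the activations are cached on disk under `self.activation_dir`.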

        assert layer_AVDatasets is not None and not isinstance(
            layer_AVDatasets, AV.AVDataset)

        layer_modules = [
            common._get_module_from_name(self.module, layer)
            for layer in self.layers
        ]
        test_activations = LayerActivation(self.module,
                                           layer_modules).attribute(
                                               inputs, additional_forward_args)

        minmax = self.similarity_direction == "max"
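        # When similarity_direction is "max", torch.topk below selects the largest
        # similarity scores; otherwise it selects the smallest.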

        # av_inputs shape: (inputs_batch_size, *) e.g. (inputs_batch_size, N, C, H, W)
        # av_src shape: (self.batch_size, *) e.g. (self.batch_size, N, C, H, W)
        test_activations = (test_activations
                            if len(self.layers) > 1 else [test_activations])
        for i, (layer, layer_AVDataset) in enumerate(
                zip(self.layers, layer_AVDatasets)):
            topk_val, topk_idx = torch.Tensor(), torch.Tensor().long()
            zero_acts = torch.Tensor().long()

            av_inputs = test_activations[i]
            src_loader = DataLoader(layer_AVDataset)
            for j, av_src in enumerate(src_loader):
                av_src = av_src.squeeze(0)

                similarity = self.similarity_metric(av_inputs, av_src)
                msg = (
                    "Output of custom similarity does not meet required dimensions. "
                    f"Your output has shape {similarity.shape}.\nPlease ensure the "
                    "output shape matches (inputs_batch_size, src_dataset_batch_size), "
                    f"which should be {(inputs_batch_size, self.batch_size)}.")
                assert similarity.shape == (inputs_batch_size,
                                            av_src.shape[0]), msg
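                # Record (input_idx, src_idx) pairs whose similarity equals the NaN
                # replacement value, i.e. pairs involving zero-vector activations.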
                if hasattr(self, "replace_nan"):
                    idx = (similarity == self.replace_nan).nonzero()
                    zero_acts = torch.cat((zero_acts, idx))
                r"""
                TODO: For models that can have tuples as activations, we should
                allow similarity metrics to accept tuples, support topk selection.
                """

                topk_batch = min(top_k, self.batch_size)
                values, indices = torch.topk(similarity,
                                             topk_batch,
                                             dim=1,
                                             largest=minmax)
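                # Shift batch-local indices by the batch offset so they index into
                # the full influence_src_dataset.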
                indices += int(j * self.batch_size)

                topk_val = torch.cat((topk_val, values), dim=1)
                topk_idx = torch.cat((topk_idx, indices), dim=1)

                # Keep only the running top_k after merging this batch; sorting
                # could be done less frequently for a minor efficiency gain.
                sort_idx = torch.argsort(topk_val, dim=1, descending=minmax)
                topk_val = torch.gather(topk_val, 1, sort_idx[:, :top_k])
                topk_idx = torch.gather(topk_idx, 1, sort_idx[:, :top_k])

            influences[layer] = (topk_idx, topk_val)

            if torch.numel(zero_acts) != 0:
                zero_warning = (
                    f"Layer {layer} has zero-vector activations for some inputs. This "
                    "may cause undefined behavior for cosine similarity. The indices "
                    "for the offending inputs will be included under the key "
                    f"'zero_acts-{layer}' in the output dictionary. Indices are "
                    "returned as a tensor with [inputs_idx, src_dataset_idx] pairs "
                    "which may have corrupted similarity scores.")
                warnings.warn(zero_warning, RuntimeWarning)
                key = "-".join(["zero_acts", layer])
                influences[key] = zero_acts

        return influences
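
A minimal usage sketch of this method follows, assuming it is the `influence` method of Captum's `SimilarityInfluence` class and that the constructor arguments below match recent Captum releases; the model, dataset, directory, and layer names are placeholders.

# Hypothetical example; constructor arguments are assumed from the method above
# and may differ across Captum versions.
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from captum.influence import SimilarityInfluence

class RandomDataset(Dataset):
    # Placeholder source dataset whose items are plain input tensors.
    def __init__(self, n: int, num_features: int) -> None:
        self.samples = torch.randn(n, num_features)

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.samples[idx]

    def __len__(self) -> int:
        return self.samples.shape[0]

net = nn.Sequential(nn.Linear(10, 8), nn.ReLU(), nn.Linear(8, 2))
src_dataset = RandomDataset(32, 10)

sim = SimilarityInfluence(
    module=net,
    layers=["0", "2"],                 # submodule names from net.named_modules()
    influence_src_dataset=src_dataset,
    activation_dir="/tmp/av_cache",    # placeholder cache directory
    model_id="demo_model",
    batch_size=8,
)

test_inputs = torch.randn(5, 10)
influences = sim.influence(test_inputs, top_k=3)
top_idx, top_scores = influences["2"]  # each of shape (5, 3): (inputs batch, top_k)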
Example #2
File: test_av.py  Project: pytorch/captum
    def test_generate_dataset_activations(self) -> None:
        with tempfile.TemporaryDirectory() as tmpdir:
            num_features = 4
            low, high = 0, 16
            batch_size = high // 2
            mymodel = BasicLinearReLULinear(num_features)
            mydata = RangeDataset(low, high, num_features)
            layers: List[str] = [
                value[0] for value in mymodel.named_modules() if value[0]
            ]

            # First AV generation on last 2 layers
            layer_AVDatasets = AV.generate_dataset_activations(
                tmpdir,
                mymodel,
                "model_id1",
                layers[1:],
                DataLoader(mydata, batch_size, shuffle=False),
                "src",
                return_activations=True,
            )

            av_src = AV._construct_file_search(tmpdir,
                                               model_id="model_id1",
                                               identifier="src")
            av_src = glob.glob(av_src)
            self.assertEqual(len(av_src), high / batch_size * len(layers[1:]))

            self.assertTrue(isinstance(layer_AVDatasets, list))
            layer_AVDatasets = cast(list, layer_AVDatasets)
            self.assertEqual(len(layer_AVDatasets), len(layers[1:]))
            for layer_AVDataset in layer_AVDatasets:
                self.assertEqual(len(layer_AVDataset), high / batch_size)

            # Second AV generation on first 2 layers.
            # Second layer overlaps with existing activations, should be loaded.
            layer_AVDatasets = AV.generate_dataset_activations(
                tmpdir,
                mymodel,
                "model_id1",
                layers[:2],
                DataLoader(mydata, batch_size, shuffle=False),
                "src",
                return_activations=True,
            )

            av_src = AV._construct_file_search(tmpdir,
                                               model_id="model_id1",
                                               identifier="src")
            av_src = glob.glob(av_src)
            self.assertEqual(len(av_src), high / batch_size * len(layers))

            self.assertTrue(isinstance(layer_AVDatasets, list))
            layer_AVDatasets = cast(list, layer_AVDatasets)
            self.assertEqual(len(layer_AVDatasets), len(layers[:2]))
            for layer_AVDataset in layer_AVDatasets:
                self.assertEqual(len(layer_AVDataset), high / batch_size)

            # check that if return_activations is False, None is returned
            self.assertIsNone(
                AV.generate_dataset_activations(
                    tmpdir,
                    mymodel,
                    "model_id1",
                    layers[:2],
                    DataLoader(mydata, batch_size, shuffle=False),
                    "src",
                    return_activations=False,
                ))
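
Example #2 depends on two helpers from Captum's test suite, `BasicLinearReLULinear` and `RangeDataset`, which are not shown here. Approximate stand-ins, inferred only from how the test uses them (the actual Captum definitions may differ in detail):

# Approximate, hypothetical stand-ins; not the actual Captum test helpers.
import torch
import torch.nn as nn
from torch.utils.data import Dataset

class BasicLinearReLULinear(nn.Module):
    # Linear -> ReLU -> Linear, so named_modules() yields three named submodules.
    def __init__(self, in_features: int, hidden: int = 5) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden, bias=False)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden, 1, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(self.relu1(self.fc1(x)))

class RangeDataset(Dataset):
    # Item i is a num_features-dimensional vector filled with the value i,
    # for i in [low, high).
    def __init__(self, low: int, high: int, num_features: int) -> None:
        self.samples = (
            torch.arange(low, high, dtype=torch.float)
            .unsqueeze(1)
            .repeat(1, num_features)
        )

    def __getitem__(self, idx: int) -> torch.Tensor:
        return self.samples[idx]

    def __len__(self) -> int:
        return self.samples.shape[0]

With these stand-ins, `layers` in the test resolves to three named layers, so the expected activation-file counts (`high / batch_size * len(layers[1:])` and `high / batch_size * len(layers)`) come out to 4 and 6 respectively.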