def influence(  # type: ignore[override]
    self,
    inputs: Union[Tensor, Tuple[Tensor, ...]],
    top_k: int = 1,
    additional_forward_args: Optional[Any] = None,
    load_src_from_disk: bool = True,
    **kwargs: Any,
) -> Dict:
    r"""
    Args:
        inputs (tensor or tuple of tensors): Batch of examples for which
                influential instances are computed. They are passed to the
                forward_func. The first dimension in the `inputs` tensor or
                tuple of tensors corresponds to the batch size. A tuple of
                tensors is only passed in if this is the input form that
                `module` accepts.
        top_k (int): The number of top-matching activations to return.
        additional_forward_args (optional): Additional arguments that will be
                passed to forward_func after inputs.
        load_src_from_disk (bool): Loads activations for
                `influence_src_dataset` where possible. Setting to False
                forces regeneration of activations.
        load_input_from_disk (bool): Regenerates activations for `inputs` by
                default and removes previous `inputs` activations that are
                flagged with `inputs_id`. Setting to True will load prior
                matching `inputs` activations. Note that this could lead to
                unexpected behavior if `inputs_id` is not configured properly
                and activations are loaded for a different, prior `inputs`.
        inputs_id (str): Used to identify inputs for loading activations.
        **kwargs: Additional key-value arguments that are necessary for a
                specific implementation of the `DataInfluence` abstract class.

    Returns:
        influences (dict): Returns the influential instances retrieved from
                `influence_src_dataset` for each test example represented
                through a tensor or a tuple of tensors in `inputs`.
                Returned influential examples are represented as a dict, with
                keys corresponding to the layer names passed in `layers`.
                Each value in the dict is a tuple containing the indices and
                values for the top k similarities from `influence_src_dataset`
                by the chosen metric. The first value in the tuple holds the
                indices of the top k most similar examples, and the second
                value holds the similarity scores. The batch dimension
                corresponds to the batch dimension of `inputs`: if
                inputs.shape[0] == 5, then dict[`layer_name`][0].shape[0] == 5.
                These tensors will be of shape (inputs.shape[0], top_k).
    """
    inputs_batch_size = (
        inputs[0].shape[0] if isinstance(inputs, tuple) else inputs.shape[0]
    )

    influences: Dict[str, Any] = {}

    layer_AVDatasets = AV.generate_dataset_activations(
        self.activation_dir,
        self.module,
        self.model_id,
        self.layers,
        DataLoader(self.influence_src_dataset, self.batch_size, shuffle=False),
        identifier="src",
        load_from_disk=load_src_from_disk,
        return_activations=True,
    )

    assert layer_AVDatasets is not None and not isinstance(
        layer_AVDatasets, AV.AVDataset
    )

    layer_modules = [
        common._get_module_from_name(self.module, layer) for layer in self.layers
    ]
    test_activations = LayerActivation(self.module, layer_modules).attribute(
        inputs, additional_forward_args
    )

    minmax = self.similarity_direction == "max"

    # av_inputs shape: (inputs_batch_size, *), e.g. (inputs_batch_size, N, C, H, W)
    # av_src shape: (self.batch_size, *), e.g. (self.batch_size, N, C, H, W)
    test_activations = (
        test_activations if len(self.layers) > 1 else [test_activations]
    )

    for i, (layer, layer_AVDataset) in enumerate(
        zip(self.layers, layer_AVDatasets)
    ):
        topk_val, topk_idx = torch.Tensor(), torch.Tensor().long()
        zero_acts = torch.Tensor().long()
        av_inputs = test_activations[i]
        src_loader = DataLoader(layer_AVDataset)
        for j, av_src in enumerate(src_loader):
            av_src = av_src.squeeze(0)
            similarity = self.similarity_metric(av_inputs, av_src)
            msg = (
                "Output of custom similarity does not meet required dimensions. "
                f"Your output has shape {similarity.shape}.\nPlease ensure the "
                "output shape matches (inputs_batch_size, src_dataset_batch_size), "
                f"which should be {(inputs_batch_size, self.batch_size)}."
            )
            assert similarity.shape == (inputs_batch_size, av_src.shape[0]), msg
            if hasattr(self, "replace_nan"):
                idx = (similarity == self.replace_nan).nonzero()
                zero_acts = torch.cat((zero_acts, idx))

            # TODO: For models that can have tuples as activations, we should
            # allow similarity metrics to accept tuples and support topk
            # selection.

            topk_batch = min(top_k, self.batch_size)
            values, indices = torch.topk(
                similarity, topk_batch, dim=1, largest=minmax
            )
            indices += int(j * self.batch_size)

            topk_val = torch.cat((topk_val, values), dim=1)
            topk_idx = torch.cat((topk_idx, indices), dim=1)

            # NOTE: sorting after every batch keeps the running candidate set
            # small; sorting less often would be a minor efficiency gain.
            sort_idx = torch.argsort(topk_val, dim=1, descending=minmax)
            topk_val = torch.gather(topk_val, 1, sort_idx[:, :top_k])
            topk_idx = torch.gather(topk_idx, 1, sort_idx[:, :top_k])

        influences[layer] = (topk_idx, topk_val)

        if torch.numel(zero_acts) != 0:
            zero_warning = (
                f"Layer {layer} has zero-vector activations for some inputs. This "
                "may cause undefined behavior for cosine similarity. The indices "
                "for the offending inputs will be included under the key "
                f"'zero_acts-{layer}' in the output dictionary. Indices are "
                "returned as a tensor with [inputs_idx, src_dataset_idx] pairs "
                "which may have corrupted similarity scores."
            )
            warnings.warn(zero_warning, RuntimeWarning)
            key = "-".join(["zero_acts", layer])
            influences[key] = zero_acts

    return influences
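

# --- A minimal, self-contained sketch of the streaming top-k merge used in
# `influence` above: take top-k within each source batch, shift the batch-local
# indices to global dataset positions, then re-rank the pooled candidates.
# The helper name `streaming_topk` is illustrative and not part of this module;
# it pools candidates in lists instead of repeatedly concatenating, but is
# otherwise the same pattern. Imports are repeated here so the sketch stands alone.
import torch
from torch import Tensor
from typing import Iterable, Tuple


def streaming_topk(
    similarity_batches: Iterable[Tensor],
    top_k: int,
    batch_size: int,
    largest: bool = True,
) -> Tuple[Tensor, Tensor]:
    vals, idxs = [], []
    for j, similarity in enumerate(similarity_batches):
        # Top-k within this batch; the final batch may hold fewer than
        # top_k columns, so clamp k to the batch width.
        values, indices = torch.topk(
            similarity, min(top_k, similarity.shape[1]), dim=1, largest=largest
        )
        vals.append(values)
        # Shift batch-local column indices to positions in the full source dataset.
        idxs.append(indices + j * batch_size)
    # Re-rank the pooled candidates and keep the global top-k per input row,
    # returning (indices, values) to match the convention of `influence` above.
    all_vals, all_idxs = torch.cat(vals, dim=1), torch.cat(idxs, dim=1)
    sort_idx = torch.argsort(all_vals, dim=1, descending=largest)
    return (
        torch.gather(all_idxs, 1, sort_idx[:, :top_k]),
        torch.gather(all_vals, 1, sort_idx[:, :top_k]),
    )


# e.g. streaming_topk([torch.randn(5, 8), torch.randn(5, 8)], top_k=3, batch_size=8)
# returns a ((5, 3) global-index, (5, 3) score) pair, matching the shapes the
# docstring above promises for each layer entry.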
def test_generate_dataset_activations(self) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        num_features = 4
        low, high = 0, 16
        batch_size = high // 2

        mymodel = BasicLinearReLULinear(num_features)
        mydata = RangeDataset(low, high, num_features)
        layers: List[str] = [
            value[0] for value in mymodel.named_modules() if value[0]
        ]

        # First AV generation on last 2 layers
        layer_AVDatasets = AV.generate_dataset_activations(
            tmpdir,
            mymodel,
            "model_id1",
            layers[1:],
            DataLoader(mydata, batch_size, shuffle=False),
            "src",
            return_activations=True,
        )

        av_src = AV._construct_file_search(
            tmpdir, model_id="model_id1", identifier="src"
        )
        av_src = glob.glob(av_src)
        self.assertEqual(len(av_src), high / batch_size * len(layers[1:]))

        self.assertTrue(isinstance(layer_AVDatasets, list))
        layer_AVDatasets = cast(list, layer_AVDatasets)
        self.assertEqual(len(layer_AVDatasets), len(layers[1:]))
        for layer_AVDataset in layer_AVDatasets:
            self.assertEqual(len(layer_AVDataset), high / batch_size)

        # Second AV generation on first 2 layers.
        # Second layer overlaps with existing activations, should be loaded.
        layer_AVDatasets = AV.generate_dataset_activations(
            tmpdir,
            mymodel,
            "model_id1",
            layers[:2],
            DataLoader(mydata, batch_size, shuffle=False),
            "src",
            return_activations=True,
        )

        av_src = AV._construct_file_search(
            tmpdir, model_id="model_id1", identifier="src"
        )
        av_src = glob.glob(av_src)
        self.assertEqual(len(av_src), high / batch_size * len(layers))

        self.assertTrue(isinstance(layer_AVDatasets, list))
        layer_AVDatasets = cast(list, layer_AVDatasets)
        self.assertEqual(len(layer_AVDatasets), len(layers[:2]))
        for layer_AVDataset in layer_AVDatasets:
            self.assertEqual(len(layer_AVDataset), high / batch_size)

        # check that if return_activations is False, None is returned
        self.assertIsNone(
            AV.generate_dataset_activations(
                tmpdir,
                mymodel,
                "model_id1",
                layers[:2],
                DataLoader(mydata, batch_size, shuffle=False),
                "src",
                return_activations=False,
            )
        )
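

# --- A hedged end-to-end sketch tying the two pieces together, reusing the toy
# model and dataset from the test above. The class name `SimilarityInfluence`,
# its import path, and its constructor argument order are assumptions inferred
# from the attributes referenced in `influence` (`module`, `layers`,
# `influence_src_dataset`, `activation_dir`, `model_id`, `batch_size`); adapt
# them to the actual class that method belongs to.
def example_influence_usage(tmpdir: str) -> None:
    from captum.influence import SimilarityInfluence  # assumed import path

    num_features = 4
    mymodel = BasicLinearReLULinear(num_features)
    mydata = RangeDataset(0, 16, num_features)
    # Use the model's own module names, as the test above does.
    layers = [name for name, _ in mymodel.named_modules() if name]
    sim = SimilarityInfluence(
        mymodel,
        [layers[-1]],
        mydata,
        tmpdir,
        model_id="model_id1",
        batch_size=8,
    )
    test_inputs = torch.randn(5, num_features)
    influences = sim.influence(test_inputs, top_k=3)
    # Per the docstring of `influence`: an (indices, values) tuple per layer,
    # each tensor of shape (inputs.shape[0], top_k) == (5, 3).
    topk_idx, topk_val = influences[layers[-1]]
    assert topk_idx.shape == topk_val.shape == (5, 3)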