def _get_single_output_relevance(self, layer, output):
    """Scale the relevance stored on ``layer.rule`` by ``output``.

    The per-device relevance dictionary is read from
    ``layer.rule.relevance_input`` when ``self.attribute_to_layer_input``
    is truthy and from ``layer.rule.relevance_output`` otherwise. Its
    entries are ordered with ``_sort_key_list`` (using ``self.device_ids``)
    and merged with ``_reduce_list``.

    Args:
        layer: Object exposing a ``rule`` attribute that holds the
            relevance dictionaries.
        output: Tensor that is reshaped to ``(-1, 1, ..., 1)`` so that it
            broadcasts against each relevance tensor.

    Returns:
        A tuple of scaled tensors when the merged relevance is a tuple,
        otherwise a single scaled tensor.
    """
    rule = layer.rule
    if self.attribute_to_layer_input:
        relevance_by_device = rule.relevance_input
    else:
        relevance_by_device = rule.relevance_output

    ordered_keys = _sort_key_list(
        list(relevance_by_device.keys()), self.device_ids
    )
    merged = _reduce_list([relevance_by_device[key] for key in ordered_keys])

    def _scale(relevance):
        # Keep the leading dimension of `output` and pad with singleton
        # dimensions up to the rank of `relevance`.
        broadcast_shape = (-1,) + (1,) * (relevance.dim() - 1)
        return relevance * output.reshape(broadcast_shape)

    if isinstance(merged, tuple):
        return tuple(_scale(relevance) for relevance in merged)
    return _scale(merged)
def _gather_distributed_tensors(
    saved_layer: Dict[device, Tuple[Tensor, ...]],
    device_ids: Union[None, List[int]] = None,
    key_list: Union[None, List[device]] = None,
) -> Tuple[Tensor, ...]:
    r"""
    Combine intermediate layer results that were stored per device.

    Args:
        saved_layer: Dictionary keyed by device whose values are the
            intermediate layer results (either the input or the output of
            the layer) stored on that device.
        device_ids: Device ids forwarded to `_sort_key_list` when
            `key_list` is not supplied.
        key_list: Devices in the order in which their results should be
            combined. When `None`, the keys of `saved_layer` are ordered
            with `_sort_key_list`. For a standard (single device) model
            the list has exactly one element.

    Returns:
        The result of `_reduce_list` applied to the per-device results in
        `key_list` order.
    """
    ordered_keys = key_list
    if ordered_keys is None:
        ordered_keys = _sort_key_list(list(saved_layer.keys()), device_ids)

    per_device_results = [saved_layer[key] for key in ordered_keys]
    return _reduce_list(per_device_results)
def _neuron_gradients(
    inputs: Union[Tensor, Tuple[Tensor, ...]],
    saved_layer: Dict[device, Tuple[Tensor, ...]],
    key_list: List[device],
    gradient_neuron_selector: Union[int, Tuple[Union[int, slice], ...], Callable],
) -> Tuple[Tensor, ...]:
    r"""
    Compute gradients of the selected neuron(s) with respect to `inputs`.

    For every device in `key_list`, the neuron output is picked from
    `saved_layer[key]` with `_verify_select_neuron`, and its gradient with
    respect to `inputs` is taken with `torch.autograd.grad`. The per-device
    gradients are then combined by `_reduce_list` using `sum` as the
    reduction.

    Args:
        inputs: Tensor(s) with respect to which gradients are taken.
        saved_layer: Dictionary keyed by device holding the layer
            evaluations stored for that device.
        key_list: Devices to process, in order.
        gradient_neuron_selector: Selector forwarded to
            `_verify_select_neuron`.

    Returns:
        The combined gradients produced by `_reduce_list`.
    """
    with torch.autograd.set_grad_enabled(True):
        per_device_grads = []
        for key in key_list:
            selected = _verify_select_neuron(
                saved_layer[key], gradient_neuron_selector
            )
            # A tensor with several elements is split with torch.unbind so
            # that autograd receives a sequence of outputs; a one-element
            # tensor is passed through unchanged.
            if selected.numel() > 1:
                grad_targets = torch.unbind(selected)
            else:
                grad_targets = selected
            per_device_grads.append(torch.autograd.grad(grad_targets, inputs))
        _total_gradients = _reduce_list(per_device_grads, sum)
    return _total_gradients
def _batched_operator(
    operator: Callable[..., TupleOrTensorOrBoolGeneric],
    inputs: TensorOrTupleOfTensorsGeneric,
    additional_forward_args: Any = None,
    target_ind: TargetType = None,
    internal_batch_size: Union[None, int] = None,
    **kwargs: Any
) -> TupleOrTensorOrBoolGeneric:
    """
    Run `operator` batch by batch and combine the per-batch results.

    Batches of inputs, additional forward arguments and targets are drawn
    from `_batched_generator` using `internal_batch_size`. `operator` is
    called once per batch with the keyword arguments `inputs`,
    `additional_forward_args` and `target_ind`, plus any extra `kwargs`.
    The collected results are merged with `_reduce_list`.
    """
    batch_results = []
    batches = _batched_generator(
        inputs, additional_forward_args, target_ind, internal_batch_size
    )
    for batch_inputs, batch_args, batch_target in batches:
        batch_results.append(
            operator(
                inputs=batch_inputs,
                additional_forward_args=batch_args,
                target_ind=batch_target,
                **kwargs
            )
        )
    return _reduce_list(batch_results)
def _evaluate_batch(
    self,
    input_list: List,
    additional_forward_args: Any,
    correct_fn_kwargs: Optional[Dict[str, Any]],
    target: TargetType,
) -> Optional[int]:
    """Find the first entry of ``input_list`` rejected by ``self.correct_fn``.

    With a single entry, ``self.forward_func`` is called on it directly.
    With several entries, they are merged with ``_reduce_list``, forwarded
    in one call, and the model output is sliced back per entry using each
    entry's leading dimension.

    Args:
        input_list: Inputs to evaluate. Each entry is either a tensor or an
            indexable whose first element is a tensor.
        additional_forward_args: Extra positional arguments unpacked into
            ``self.forward_func``; ``None`` is treated as no arguments.
        correct_fn_kwargs: Optional keyword arguments for
            ``self.correct_fn``.
        target: When not ``None``, passed to ``self.correct_fn`` as the
            ``target`` keyword (``correct_fn_kwargs`` may override it).

    Returns:
        The index of the first entry for which ``self.correct_fn`` returns
        a falsy value, or ``None`` when every entry passes.
    """
    if additional_forward_args is None:
        additional_forward_args = ()

    all_kwargs = {}
    if target is not None:
        all_kwargs["target"] = target
    if correct_fn_kwargs is not None:
        all_kwargs.update(correct_fn_kwargs)

    if len(input_list) == 1:
        model_out = self.forward_func(input_list[0], *additional_forward_args)
        out_metric = self.correct_fn(model_out, **all_kwargs)
        return None if out_metric else 0

    batched_inps = _reduce_list(input_list)
    model_out = self.forward_func(batched_inps, *additional_forward_args)

    current_count = 0
    for index, single_input in enumerate(input_list):
        if isinstance(single_input, Tensor):
            batch_size = single_input.shape[0]
        else:
            batch_size = single_input[0].shape[0]
        # Slice out the rows of the combined output for this entry.
        next_count = current_count + batch_size
        out_metric = self.correct_fn(
            model_out[current_count:next_count], **all_kwargs
        )
        if not out_metric:
            return index
        current_count = next_count
    return None
def compute_layer_gradients_and_eval(
    forward_fn: Callable,
    layer: ModuleOrModuleList,
    inputs: Union[Tensor, Tuple[Tensor, ...]],
    target_ind: TargetType = None,
    additional_forward_args: Any = None,
    gradient_neuron_selector: Union[
        None, int, Tuple[Union[int, slice], ...], Callable
    ] = None,
    device_ids: Union[None, List[int]] = None,
    attribute_to_layer_input: bool = False,
    output_fn: Union[None, Callable] = None,
) -> Union[
    Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...]],
    Tuple[Tuple[Tensor, ...], Tuple[Tensor, ...], Tuple[Tensor, ...]],
    Tuple[List[Tuple[Tensor, ...]], List[Tuple[Tensor, ...]]],
]:
    r"""
    Computes gradients of the output with respect to a given layer as well
    as the output evaluation of the layer for an arbitrary forward function
    and given input.

    For data parallel models, hooks are executed once per device, so the
    separated tensors from the devices are combined internally by
    concatenating based on device_ids. Any necessary gradients must be taken
    with respect to each independent batched tensor, so the gradients are
    computed per device and combined afterwards.

    More information regarding the behavior of forward hooks with
    DataParallel models can be found in the PyTorch data parallel
    documentation. The separate inputs are maintained in a dictionary
    protected by a lock, analogous to the gather implementation for the
    core PyTorch DataParallel implementation.

    NOTE: To properly handle inplace operations, a clone of the layer output
    is stored. This structure inhibits execution of a backward hook on the
    last module for the layer output when computing the gradient with
    respect to the input, since an intermediate clone is stored, as opposed
    to the true module output. If backward module hooks are necessary for
    the final module when computing input gradients, utilize
    _forward_layer_eval_with_neuron_grads instead.

    Args:
        forward_fn: Forward function. This can be, for example, the model's
            forward function.
        layer: Layer, or list of layers, for which gradients / output will
            be evaluated.
        inputs: Input at which gradients are evaluated; passed to
            forward_fn.
        target_ind: Index of the target class for which gradients must be
            computed (classification only).
        additional_forward_args: Additional input arguments that the
            forward function requires, if any.
        gradient_neuron_selector: When not None, gradients of the selected
            neuron(s) with respect to `inputs` are additionally computed
            with `_neuron_gradients` and returned as a third element. Only
            supported when `layer` is a single Module.
        device_ids: Device ids passed to `_extract_device_ids`; the result
            is used to order the per-device results before combining them.
        attribute_to_layer_input: Forwarded to
            `_forward_layer_distributed_eval`; selects whether layer inputs
            (`True`) or layer outputs (`False`) are used.
        output_fn: An optional function applied to the layer inputs or
            outputs (depending on `attribute_to_layer_input`) and to the
            corresponding gradients, per device, before they are combined.

    Returns:
        2-element tuple of **gradients**, **evals**, or a 3-element tuple
        of **gradients**, **evals**, **input gradients** when
        `gradient_neuron_selector` is provided:

        - **gradients**: Gradients of output with respect to target layer
          output. A tuple of tensors for a single Module, otherwise a list
          with one tuple per layer.
        - **evals**: Target layer output for given input, structured like
          **gradients**.
        - **input gradients**: Result of `_neuron_gradients` for the
          selected neuron(s).

    Raises:
        AssertionError: If the forward output holds more than one element
            per example (no target provided when necessary), or if
            `gradient_neuron_selector` is given together with multiple
            layers.
    """
    with torch.autograd.set_grad_enabled(True):
        # saved_layer maps each layer to a dictionary from device to the
        # tuple of layer evaluations on that device.
        saved_layer, output = _forward_layer_distributed_eval(
            forward_fn,
            inputs,
            layer,
            target_ind=target_ind,
            additional_forward_args=additional_forward_args,
            attribute_to_layer_input=attribute_to_layer_input,
            forward_hook_with_return=True,
            require_layer_grads=True,
        )
        assert output[0].numel() == 1, (
            "Target not provided when necessary, cannot"
            " take gradient with respect to multiple outputs."
        )

        device_ids = _extract_device_ids(forward_fn, saved_layer, device_ids)

        # Determine the device ordering used for every concatenation below.
        # With a standard (single device) model the list has one element.
        first_layer_evals = next(iter(saved_layer.values()))
        key_list = _sort_key_list(list(first_layer_evals.keys()), device_ids)

        is_single_layer = isinstance(layer, Module)
        all_layers: List[Module] = [layer] if is_single_layer else layer

        def _apply_output_fn(tensors):
            # Leave the tensors untouched when no output_fn was supplied.
            return tensors if output_fn is None else output_fn(tensors)

        outputs_per_layer = [
            _reduce_list(
                [
                    _apply_output_fn(saved_layer[single_layer][device_id])
                    for device_id in key_list
                ]
            )
            for single_layer in all_layers
        ]
        all_outputs: Union[Tuple[Tensor, ...], List[Tuple[Tensor, ...]]]
        all_outputs = outputs_per_layer[0] if is_single_layer else outputs_per_layer

        # Flatten every stored tensor (layer-major, then device, then
        # position within the tuple) so one autograd call covers them all.
        grad_inputs = tuple(
            layer_tensor
            for single_layer in all_layers
            for device_id in key_list
            for layer_tensor in saved_layer[single_layer][device_id]
        )
        saved_grads = torch.autograd.grad(torch.unbind(output), grad_inputs)

        # Undo the flattening: walk saved_grads in the same order and cut
        # it into one chunk per (layer, device).
        num_devices = len(key_list)
        offset = 0
        all_grads: List[Tuple[Tensor, ...]] = []
        for single_layer in all_layers:
            num_tensors = len(next(iter(saved_layer[single_layer].values())))
            layer_span = num_devices * num_tensors
            curr_saved_grads = [
                saved_grads[start : start + num_tensors]
                for start in range(offset, offset + layer_span, num_tensors)
            ]
            offset += layer_span
            curr_saved_grads = [
                _apply_output_fn(device_grads) for device_grads in curr_saved_grads
            ]
            all_grads.append(_reduce_list(curr_saved_grads))

        layer_grads: Union[Tuple[Tensor, ...], List[Tuple[Tensor, ...]]]
        layer_grads = all_grads[0] if is_single_layer else all_grads

        if gradient_neuron_selector is not None:
            assert isinstance(
                layer, Module
            ), "Cannot compute neuron gradients for multiple layers simultaneously!"
            inp_grads = _neuron_gradients(
                inputs, saved_layer[layer], key_list, gradient_neuron_selector
            )
            return (
                cast(Tuple[Tensor, ...], layer_grads),
                cast(Tuple[Tensor, ...], all_outputs),
                inp_grads,
            )
    return layer_grads, all_outputs  # type: ignore
def test_reduce_list_tensors(self):
    """Check `_reduce_list` on a list of two single-row tensors.

    The result is expected to contain both rows, in list order.
    """
    first_row = torch.tensor([[3, 4, 5]])
    second_row = torch.tensor([[0, 1, 2]])
    reduced = _reduce_list([first_row, second_row])
    assertTensorAlmostEqual(self, reduced, [[3, 4, 5], [0, 1, 2]])