def _to_tensor(self, x):
    # Convert `x` to a tensor on `self.device`. Note that layer input can
    # be a nested DATA_CONTAINER_TYPE.
    if isinstance(x, np.ndarray) or isinstance(x[0], np.ndarray):
        x = ModelWrapper._nested_apply(
            x, partial(B.as_tensor, device=self.device))

    elif isinstance(x, DATA_CONTAINER_TYPE):
        x = [self._to_tensor(x_i) for x_i in x]

    else:
        x = ModelWrapper._nested_apply(x, lambda x: x.to(self.device))

    return x
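# Example (hypothetical): `_to_tensor` preserves nesting, so a list of
# numpy arrays comes back as a list of backend tensors on `self.device`.
# `wrapper` below is an illustrative PyTorch ModelWrapper instance; it is
# not defined in this module.
#
#   batch = [np.zeros((4, 3), dtype=np.float32),
#            np.ones((4, 5), dtype=np.float32)]
#   tensors = wrapper._to_tensor(batch)
#   # Each element is now a torch.Tensor on wrapper.device.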
def qoi_bprop(self,
              qoi,
              model_args,
              model_kwargs={},
              doi_cut=None,
              to_cut=None,
              attribution_cut=None,
              intervention=None):
    """
    qoi_bprop
    Run the model from the `doi_cut` layer to the `to_cut` layer and
    return the gradients of the quantity of interest (QoI) w.r.t.
    `attribution_cut`.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for,
        assumed to be a *batched* input. If `self.model` supports
        evaluation on *data tensors*, the appropriate tensor type may be
        used (e.g., PyTorch models may accept PyTorch tensors in
        addition to `np.ndarray`s). The shape of the inputs must match
        the input shape of `self.model`.
    qoi: a Quantity of Interest
        This method will accumulate all gradients of the qoi w.r.t.
        `attribution_cut`.
    doi_cut: Cut, optional
        If `doi_cut` is None, this refers to the InputCut. The Cut from
        which to begin propagation. The shape of `intervention` must
        match the output shape of this layer.
    attribution_cut: Cut, optional
        If `attribution_cut` is None, this refers to the InputCut. The
        Cut at which attributions will be calculated. This is generally
        taken from the attribution slice's attribution_cut.
    to_cut: Cut, optional
        If `to_cut` is None, this refers to the OutputCut. The Cut at
        which the qoi will be calculated. This is generally taken from
        the attribution slice's to_cut.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.

    Returns
    -------
    (backend.Tensor or np.ndarray)
        The gradients of `qoi` w.r.t. `attribution_cut`, keeping the
        same type as the input.
    """
    if attribution_cut is None:
        attribution_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    y, zs = self.fprop(
        model_args,
        model_kwargs,
        doi_cut=doi_cut if doi_cut else InputCut(),
        to_cut=to_cut,
        attribution_cut=attribution_cut,
        intervention=intervention,
        return_tensor=True)

    y = to_cut.access_layer(y)
    grads_list = []
    for z in zs:
        z_flat = ModelWrapper._flatten(z)
        qoi_out = qoi(y)

        grads_flat = [
            B.gradient(B.sum(q), z_flat) for q in qoi_out
        ] if isinstance(qoi_out, DATA_CONTAINER_TYPE) else B.gradient(
            B.sum(qoi_out), z_flat)

        grads = [
            ModelWrapper._unflatten(g, z, count=[0]) for g in grads_flat
        ] if isinstance(
            qoi_out, DATA_CONTAINER_TYPE) else ModelWrapper._unflatten(
                grads_flat, z, count=[0])

        grads = [
            attribution_cut.access_layer(g) for g in grads
        ] if isinstance(
            qoi_out,
            DATA_CONTAINER_TYPE) else attribution_cut.access_layer(grads)

        grads = [B.as_array(g) for g in grads] if isinstance(
            qoi_out, DATA_CONTAINER_TYPE) else B.as_array(grads)

        grads_list.append(grads)

    del y  # TODO: garbage collection

    return grads_list[0] if len(grads_list) == 1 else grads_list
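# Example (hypothetical usage): gradients of a class logit w.r.t. the
# input. `wrapper` is an illustrative PyTorch ModelWrapper around a
# classifier and `class_qoi` an illustrative QoI callable; neither is
# defined in this module. With the default cuts (InputCut to OutputCut),
# the numpy input yields numpy gradients of the same shape.
#
#   x = np.random.randn(8, 3, 32, 32).astype(np.float32)
#   class_qoi = lambda out: out[:, 7]  # QoI: logit of class 7
#   grads = wrapper.qoi_bprop(class_qoi, (x,))
#   assert grads.shape == x.shape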
def fprop(self,
          model_args,
          model_kwargs={},
          doi_cut=None,
          to_cut=None,
          attribution_cut=None,
          intervention=None,
          return_tensor=False,
          input_timestep=None):
    """
    fprop
    Forward propagate the model.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for,
        assumed to be a *batched* input. If `self.model` supports
        evaluation on *data tensors*, the appropriate tensor type may be
        used (e.g., PyTorch models may accept PyTorch tensors in
        addition to `np.ndarray`s). The shape of the inputs must match
        the input shape of `self.model`.
    doi_cut: Cut, optional
        The Cut from which to begin propagation. The shape of
        `intervention` must match the input shape of this layer. This is
        usually used to apply distributions of interest (DoI).
    to_cut : Cut, optional
        The Cut to return output activation tensors for. If `None`,
        assumed to be just the final layer. By default None.
    attribution_cut : Cut, optional
        A Cut to return activation tensors for. If `None`, the
        attribution layer output is not returned.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.
    input_timestep: int, optional
        Specifies a specific timestep to apply the DoI if using an RNN.

    Returns
    -------
    (list of backend.Tensor or np.ndarray)
        A list of output activations is returned, keeping the same type
        as the input. If `attribution_cut` is supplied, also return the
        cut activations.
    """
    if doi_cut is None:
        doi_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    model_args = self._to_tensor(model_args)

    if intervention is None:
        intervention = model_args

    intervention = intervention if isinstance(
        intervention, DATA_CONTAINER_TYPE) else [intervention]
    intervention = self._to_tensor(intervention)

    if isinstance(doi_cut, InputCut):
        model_args = intervention

    else:
        # Tile the original model args so their batch dimension matches
        # the DoI-expanded batch of the intervention.
        doi_repeated_batch_size = intervention[0].shape[0]
        batched_model_args = []

        for val in model_args:
            doi_resolution = int(doi_repeated_batch_size / val.shape[0])
            tile_shape = [1 for _ in range(len(val.shape))]
            tile_shape[0] = doi_resolution
            repeat_shape = tuple(tile_shape)

            if isinstance(val, np.ndarray):
                val = np.tile(val, repeat_shape)
            elif torch.is_tensor(val):
                val = val.repeat(repeat_shape)

            batched_model_args.append(val)

        model_args = batched_model_args

    if attribution_cut is not None:
        # Specify that we want to preserve gradient information.
        intervention = ModelWrapper._nested_apply(
            intervention,
            lambda intervention: intervention.requires_grad_(True))
        model_args = ModelWrapper._nested_apply(
            model_args, lambda model_args: model_args.requires_grad_(True))

    # Set up the intervention hookfn if we are starting from an
    # intermediate layer.
    if not isinstance(doi_cut, InputCut):
        # Define the hookfn.
        counter = 0

        def intervene_hookfn(self, inpt, outpt):
            nonlocal counter, input_timestep, doi_cut, intervention

            if input_timestep is None or input_timestep == counter:
                # FIXME: generalize to multi-input layers. Currently can
                #   only intervene on one layer.
                inpt = inpt[0] if len(inpt) == 1 else inpt

                if doi_cut.anchor == 'in':
                    ModelWrapper._nested_assign(inpt, intervention[0])
                else:
                    ModelWrapper._nested_assign(outpt, intervention[0])

            counter += 1

        # Register according to the anchor.
        if doi_cut.anchor == 'in':
            in_handle = (
                self._get_layer(doi_cut.name).register_forward_pre_hook(
                    partial(intervene_hookfn, outpt=None)))
        else:
            in_handle = (
                self._get_layer(doi_cut.name).register_forward_hook(
                    intervene_hookfn))

    # Collect the names and anchors of the layers we want to return.
    names_and_anchors = []

    self._add_cut_name_and_anchor(to_cut, names_and_anchors)

    if attribution_cut:
        self._add_cut_name_and_anchor(attribution_cut, names_and_anchors)

    # Create hookfns to extract the results from the specified layers.
    hooks = {}

    def get_hookfn(layer_name, anchor):

        def hookfn(self, inpt, outpt):
            nonlocal hooks, layer_name, anchor

            # FIXME: generalize to multi-input layers.
            inpt = inpt[0] if len(inpt) == 1 else inpt

            if return_tensor:
                if anchor == 'in':
                    hooks[layer_name] = inpt
                else:
                    # FIXME: will not work for multibranch outputs.
                    #   Needed to ignore hidden states of RNNs.
                    outpt = outpt[0] if isinstance(outpt, tuple) else outpt
                    hooks[layer_name] = outpt
            else:
                if anchor == 'in':
                    hooks[layer_name] = ModelWrapper._nested_apply(
                        inpt, B.as_array)
                else:
                    outpt = outpt[0] if isinstance(outpt, tuple) else outpt
                    hooks[layer_name] = ModelWrapper._nested_apply(
                        outpt, B.as_array)

        return hookfn

    handles = [
        self._get_layer(name).register_forward_hook(
            get_hookfn(name, anchor))
        for name, anchor in names_and_anchors
        if name is not None
    ]

    # Run the network.
    output = self._model(*model_args, **model_kwargs)
    if isinstance(output, tuple):
        output = output[0]

    if not isinstance(doi_cut, InputCut):
        # Clean up the intervention handle.
        in_handle.remove()

    # Clean up the extraction handles.
    for handle in handles:
        handle.remove()

    if attribution_cut:
        return [
            self._extract_outputs_from_hooks(to_cut, hooks, output,
                                             model_args, return_tensor),
            self._extract_outputs_from_hooks(attribution_cut, hooks,
                                             output, model_args,
                                             return_tensor)
        ]
    else:
        return self._extract_outputs_from_hooks(to_cut, hooks, output,
                                                model_args, return_tensor)
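# Example (hypothetical usage): evaluate the model with an intervention
# injected at an inner layer. `wrapper`, the layer name 'layer2', and the
# activation shapes are illustrative assumptions; `Cut` is this package's
# cut type.
#
#   x = np.random.randn(2, 3, 32, 32).astype(np.float32)
#   # 10 DoI samples per instance, stacked along the batch axis:
#   z = np.random.randn(20, 64, 8, 8).astype(np.float32)
#   y = wrapper.fprop((x,), doi_cut=Cut('layer2'), intervention=z)
#   # The original args are tiled x10 so the forward pass sees a
#   # consistent batch of 20; `y` holds the output activations.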
def _extract_outputs_from_hooks(self, cut, hooks, output, model_input,
                                return_tensor):
    if isinstance(cut, OutputCut):
        return (
            ModelWrapper._flatten(output) if return_tensor else
            ModelWrapper._nested_apply(
                ModelWrapper._flatten(output), B.as_array))

    elif isinstance(cut, InputCut):
        return (
            ModelWrapper._flatten(model_input) if return_tensor else
            ModelWrapper._nested_apply(
                ModelWrapper._flatten(model_input), B.as_array))

    elif isinstance(cut, LogitCut):
        y = hooks['logits' if self._logit_layer is None else
                  self._logit_layer]
        return (
            ModelWrapper._flatten(y) if return_tensor else
            ModelWrapper._nested_apply(
                ModelWrapper._flatten(y), B.as_array))

    elif isinstance(cut.name, DATA_CONTAINER_TYPE):
        zs = [hooks[name] for name in cut.name]
        return (
            ModelWrapper._flatten(zs) if return_tensor else
            ModelWrapper._nested_apply(
                ModelWrapper._flatten(zs), B.as_array))

    else:
        z = hooks[cut.name]
        return (
            ModelWrapper._flatten(z) if return_tensor else
            ModelWrapper._nested_apply(
                ModelWrapper._flatten(z), B.as_array))
def qoi_bprop(self,
              qoi,
              model_args,
              model_kwargs={},
              doi_cut=None,
              to_cut=None,
              attribution_cut=None,
              intervention=None):
    """
    qoi_bprop
    Run the model from the `doi_cut` layer to the `to_cut` layer and
    return the gradients of the quantity of interest (QoI) w.r.t.
    `attribution_cut`.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for,
        assumed to be a *batched* input. If `self.model` supports
        evaluation on *data tensors*, the appropriate tensor type may be
        used (e.g., PyTorch models may accept PyTorch tensors in
        addition to `np.ndarray`s). The shape of the inputs must match
        the input shape of `self.model`.
    qoi: a Quantity of Interest
        This method will accumulate all gradients of the qoi w.r.t.
        `attribution_cut`.
    doi_cut: Cut, optional
        If `doi_cut` is None, this refers to the InputCut. The Cut from
        which to begin propagation. The shape of `intervention` must
        match the output shape of this layer.
    attribution_cut: Cut, optional
        If `attribution_cut` is None, this refers to the InputCut. The
        Cut at which attributions will be calculated. This is generally
        taken from the attribution slice's attribution_cut.
    to_cut: Cut, optional
        If `to_cut` is None, this refers to the OutputCut. The Cut at
        which the qoi will be calculated. This is generally taken from
        the attribution slice's to_cut.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.
        `intervention` can also be a feed_dict.

    Returns
    -------
    (backend.Tensor or np.ndarray)
        The gradients of `qoi` w.r.t. `attribution_cut`, keeping the
        same type as the input.
    """
    if attribution_cut is None:
        attribution_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    doi_cut = doi_cut if doi_cut else InputCut()

    attribution_tensors = self._get_layers(attribution_cut)
    to_tensors = self._get_layers(to_cut)
    doi_tensors = self._get_layers(doi_cut)

    feed_dict, _ = self._prepare_feed_dict_with_intervention(
        model_args, model_kwargs, intervention, doi_tensors)

    z_grads = []
    with self._graph.as_default():
        for z in attribution_tensors:
            gradient_tensor_key = (z, frozenset(to_tensors))
            if gradient_tensor_key in self._cached_gradient_tensors:
                grads = self._cached_gradient_tensors[gradient_tensor_key]
            else:
                Q = qoi(to_tensors[0]) if len(to_tensors) == 1 else qoi(
                    to_tensors)

                grads = [B.gradient(q, z)[0] for q in Q] if isinstance(
                    Q, DATA_CONTAINER_TYPE) else B.gradient(Q, z)[0]
                grads = grads[0] if isinstance(
                    grads,
                    DATA_CONTAINER_TYPE) and len(grads) == 1 else grads
                grads = [
                    attribution_cut.access_layer(g) for g in grads
                ] if isinstance(
                    grads, DATA_CONTAINER_TYPE
                ) else attribution_cut.access_layer(grads)

                self._cached_gradient_tensors[gradient_tensor_key] = grads

            z_grads.append(grads)

    grad_flat = ModelWrapper._flatten(z_grads)

    gradients = [self._run_session(g, feed_dict) for g in grad_flat]

    gradients = ModelWrapper._unflatten(gradients, z_grads)
    return gradients[0] if len(gradients) == 1 else gradients
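# Design note: the symbolic gradient tensors for a given
# (attribution tensor, to-tensors) pair are memoized in
# `self._cached_gradient_tensors`, so repeated `qoi_bprop` calls with the
# same cuts skip graph construction and only pay for the session run.
#
# Example (hypothetical usage; `wrapper` and `class_qoi` are illustrative
# names, not defined in this module):
#
#   x = np.random.randn(8, 784).astype(np.float32)
#   class_qoi = lambda out: out[:, 0]            # QoI: logit of class 0
#   grads = wrapper.qoi_bprop(class_qoi, (x,))   # graph-mode backend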
def qoi_bprop(self,
              qoi,
              model_args,
              model_kwargs={},
              doi_cut=None,
              to_cut=None,
              attribution_cut=None,
              intervention=None):
    """
    qoi_bprop
    Run the model from the `doi_cut` layer to the `to_cut` layer and
    return the gradients of the quantity of interest (QoI) w.r.t.
    `attribution_cut`.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for,
        assumed to be a *batched* input. If `self.model` supports
        evaluation on *data tensors*, the appropriate tensor type may be
        used (e.g., PyTorch models may accept PyTorch tensors in
        addition to `np.ndarray`s). The shape of the inputs must match
        the input shape of `self.model`.
    qoi: a Quantity of Interest
        This method will accumulate all gradients of the qoi w.r.t.
        `attribution_cut`.
    doi_cut: Cut, optional
        If `doi_cut` is None, this refers to the InputCut. The Cut from
        which to begin propagation. The shape of `intervention` must
        match the output shape of this layer.
    attribution_cut: Cut, optional
        If `attribution_cut` is None, this refers to the InputCut. The
        Cut at which attributions will be calculated. This is generally
        taken from the attribution slice's attribution_cut.
    to_cut: Cut, optional
        If `to_cut` is None, this refers to the OutputCut. The Cut at
        which the qoi will be calculated. This is generally taken from
        the attribution slice's to_cut.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.

    Returns
    -------
    (backend.Tensor or np.ndarray)
        The gradients of `qoi` w.r.t. `attribution_cut`, keeping the
        same type as the input.
    """
    if intervention is None:
        intervention = model_args

    if not self._eager:
        return super().qoi_bprop(qoi, model_args, model_kwargs, doi_cut,
                                 to_cut, attribution_cut, intervention)

    if attribution_cut is None:
        attribution_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    return_numpy = True

    with tf.GradientTape(persistent=True) as tape:
        intervention = intervention if isinstance(
            intervention, DATA_CONTAINER_TYPE) else [intervention]

        # We return a numpy array if we were given a numpy array;
        # otherwise we will let the returned values remain data tensors.
        return_numpy = isinstance(intervention, np.ndarray) or isinstance(
            intervention[0], np.ndarray)

        # Convert `intervention` to a data tensor if it isn't already.
        if return_numpy:
            intervention = [
                ModelWrapper._nested_apply(x_i, tf.constant)
                for x_i in intervention
            ]

        for x_i in intervention:
            ModelWrapper._nested_apply(x_i, tape.watch)

        outputs, attribution_features = self.fprop(
            model_args,
            model_kwargs,
            doi_cut=doi_cut if doi_cut else InputCut(),
            to_cut=to_cut,
            attribution_cut=attribution_cut,
            intervention=intervention)
        if isinstance(outputs, DATA_CONTAINER_TYPE) and isinstance(
                outputs[0], DATA_CONTAINER_TYPE):
            outputs = outputs[0]

        Q = qoi(outputs[0]) if len(outputs) == 1 else qoi(outputs)
        if isinstance(Q, DATA_CONTAINER_TYPE) and len(Q) == 1:
            Q = B.sum(Q)

    grads = [tape.gradient(q, attribution_features)
             for q in Q] if isinstance(
                 Q, DATA_CONTAINER_TYPE) else tape.gradient(
                     Q, attribution_features)

    grads = grads[0] if isinstance(
        grads, DATA_CONTAINER_TYPE) and len(grads) == 1 else grads

    grads = [attribution_cut.access_layer(g) for g in grads] if isinstance(
        grads,
        DATA_CONTAINER_TYPE) else attribution_cut.access_layer(grads)

    del tape

    if return_numpy:
        grads = [
            ModelWrapper._nested_apply(g, B.as_array) for g in grads
        ] if isinstance(
            grads, DATA_CONTAINER_TYPE) else ModelWrapper._nested_apply(
                grads, B.as_array)

    return grads[0] if isinstance(
        grads, DATA_CONTAINER_TYPE) and len(grads) == 1 else grads
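# Example (hypothetical usage): in eager mode the interventions are
# watched on a persistent GradientTape, so a QoI returning several
# quantities yields one gradient per quantity. `wrapper` and `multi_qoi`
# are illustrative names, not defined in this module.
#
#   x = np.random.randn(8, 784).astype(np.float32)
#   multi_qoi = lambda out: [out[:, 0], out[:, 1]]  # two quantities
#   grads = wrapper.qoi_bprop(multi_qoi, (x,))
#   # `grads` is a list with one numpy gradient per quantity, since the
#   # input was a numpy array.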
def fprop(self,
          model_args,
          model_kwargs={},
          doi_cut=None,
          to_cut=None,
          attribution_cut=None,
          intervention=None):
    """
    fprop
    Forward propagate the model.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for,
        assumed to be a *batched* input. If `self.model` supports
        evaluation on *data tensors*, the appropriate tensor type may be
        used (e.g., PyTorch models may accept PyTorch tensors in
        addition to `np.ndarray`s). The shape of the inputs must match
        the input shape of `self.model`.
    doi_cut: Cut, optional
        The Cut from which to begin propagation. The shape of
        `intervention` must match the input shape of this layer. This is
        usually used to apply distributions of interest (DoI).
    to_cut : Cut, optional
        The Cut to return output activation tensors for. If `None`,
        assumed to be just the final layer. By default None.
    attribution_cut : Cut, optional
        A Cut to return activation tensors for. If `None`, the
        attribution layer output is not returned.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.

    Returns
    -------
    (list of backend.Tensor or np.ndarray)
        A list of output activations is returned, preferring to stay in
        the same format as the input. If `attribution_cut` is supplied,
        also return the cut activations.
    """
    if not self._eager:
        return super().fprop(model_args, model_kwargs, doi_cut, to_cut,
                             attribution_cut, intervention)

    if doi_cut is None:
        doi_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    return_numpy = True

    if intervention is not None:
        if not isinstance(intervention, DATA_CONTAINER_TYPE):
            intervention = [intervention]

        # We return a numpy array if we were given a numpy array;
        # otherwise we will let the returned values remain data tensors.
        return_numpy = isinstance(intervention[0], np.ndarray)

        # Convert `intervention` to a data tensor if it isn't already.
        if return_numpy:
            intervention = ModelWrapper._nested_apply(
                intervention, tf.constant)

    try:
        if intervention:
            # Get the inputs and batch them to the same size as the DoI
            # resolution.
            doi_repeated_batch_size = intervention[0].shape[0]
            batched_model_args = []

            for val in model_args:
                if isinstance(val, np.ndarray):
                    doi_resolution = int(
                        doi_repeated_batch_size / val.shape[0])
                    tile_shape = [1] * len(val.shape)
                    tile_shape[0] = doi_resolution
                    val = np.tile(val, tuple(tile_shape))
                elif tf.is_tensor(val):
                    doi_resolution = int(
                        doi_repeated_batch_size / val.shape[0])
                    val = tf.repeat(val, doi_resolution, axis=0)

                batched_model_args.append(val)

            model_args = batched_model_args

            if not isinstance(doi_cut, InputCut):
                from_layers = (
                    self._get_logit_layer()
                    if isinstance(doi_cut, LogitCut) else
                    self._get_output_layer()
                    if isinstance(doi_cut, OutputCut) else
                    self._get_layers_by_name(doi_cut.name))

                for layer, x_i in zip(from_layers, intervention):
                    if doi_cut.anchor == 'in':
                        layer.input_intervention = lambda _: x_i
                    else:
                        layer.output_intervention = lambda _: x_i
            else:
                arg_wrapped_list = False
                # Take care of the Keras Module case where args is a
                # tuple of a list of inputs corresponding to
                # `model._inputs`. This would have gotten unwrapped, as
                # the logic operates on the list of inputs, so it needs
                # to be re-wrapped in a tuple for the model arg
                # execution.
                if (isinstance(model_args, DATA_CONTAINER_TYPE) and
                        isinstance(model_args[0], DATA_CONTAINER_TYPE)):
                    arg_wrapped_list = True

                model_args = intervention

                if arg_wrapped_list:
                    model_args = (model_args,)

        # Get the output from the "to layers," and possibly the latent
        # layers.
        def retrieve_index(i, results, anchor):

            def retrieve(inputs, output):
                if anchor == 'in':
                    results[i] = (
                        inputs[0] if
                        (isinstance(inputs, DATA_CONTAINER_TYPE) and
                         len(inputs) == 1) else inputs)
                else:
                    results[i] = (
                        output[0] if
                        (isinstance(output, DATA_CONTAINER_TYPE) and
                         len(output) == 1) else output)

            return retrieve

        if isinstance(to_cut, InputCut):
            results = model_args
        else:
            to_layers = (
                self._get_logit_layer()
                if isinstance(to_cut, LogitCut) else
                self._get_output_layer()
                if isinstance(to_cut, OutputCut) else
                self._get_layers_by_name(to_cut.name))

            results = [None for _ in to_layers]

            for i, layer in enumerate(to_layers):
                layer.retrieve_functions.append(
                    retrieve_index(i, results, to_cut.anchor))

        if attribution_cut:
            if isinstance(attribution_cut, InputCut):
                # The attribution must be the watched tensor given from
                # `qoi_bprop`.
                attribution_results = intervention
            else:
                attribution_layers = (
                    self._get_logit_layer()
                    if isinstance(attribution_cut, LogitCut) else
                    self._get_output_layer()
                    if isinstance(attribution_cut, OutputCut) else
                    self._get_layers_by_name(attribution_cut.name))

                attribution_results = [None for _ in attribution_layers]

                for i, layer in enumerate(attribution_layers):
                    if self._is_input_layer(layer):
                        # Input layers don't end up calling the hook, so
                        # we have to get their output manually.
                        attribution_results[i] = intervention[
                            self._input_layer_index(layer)]
                    else:
                        layer.retrieve_functions.append(
                            retrieve_index(i, attribution_results,
                                           attribution_cut.anchor))

        # Run a point.
        self._model(*model_args, **model_kwargs)

    finally:
        # Clear the hooks after running the model so that `fprop`
        # doesn't leave the model in an altered state.
        self._clear_hooks()

    if return_numpy:
        results = ModelWrapper._nested_apply(
            results,
            lambda t: t.numpy() if not isinstance(t, np.ndarray) else t)

    return (results, attribution_results) if attribution_cut else results
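# Example (hypothetical usage): fetch an inner layer's activations in
# eager mode. `wrapper` and the layer name 'dense_1' are illustrative
# assumptions; `Cut` is this package's cut type.
#
#   x = np.random.randn(8, 784).astype(np.float32)
#   acts = wrapper.fprop((x,), to_cut=Cut('dense_1'))
#   # numpy in, numpy out. The retrieve hooks are cleared in the
#   # `finally` block, so the wrapped Keras model is left unmodified.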