def test_qoibprop_identity(self):
    self.assertTrue(
        np.allclose(
            self.model.qoi_bprop(
                MaxClassQoI(), (np.array([[2., 1.], [1., 2.]]),),
                attribution_cut=InputCut(),
                to_cut=InputCut()),
            np.array([[1., 0.], [0., 1.]])))
def test_multiple_inputs(self):

    class M(Module):

        def __init__(this):
            super(M, this).__init__()
            this.z1 = Linear(5, 6)
            this.z3 = Linear(7, 7)
            this.y = Linear(7, 3)

        def forward(this, x1, x2):
            x1 = this.z1(x1)
            z = cat((x1, x2), 1)
            z = this.z3(z)
            return this.y(z)

    model = ModelWrapper(M(), [(5,), (1,)])

    infl = InternalInfluence(model, InputCut(), ClassQoI(1), PointDoi())

    res = infl.attributions(
        np.array([[1., 2., 3., 4., 5.]]).astype('float32'),
        np.array([[1.]]).astype('float32'))

    self.assertEqual(len(res), 2)
    self.assertEqual(res[0].shape, (1, 5))
    self.assertEqual(res[1].shape, (1, 1))
def test_batch_processing_deep(self):
    infl = InternalInfluence(
        self.model_deep, InputCut(), MaxClassQoI(), LinearDoi())

    r1 = np.concatenate([infl.attributions(x[None]) for x in self.batch_x])
    r2 = infl.attributions(self.batch_x)

    self.assertTrue(np.allclose(r1, r2))
def test_linear_agreement_multiply_activation(self):
    c = 1
    infl = InternalInfluence(
        self.model_lin,
        InputCut(),
        ClassQoI(c),
        PointDoi(),
        multiply_activation=True)

    res = infl.attributions(self.x)

    self.assertEqual(res.shape, (2, self.input_size))
    self.assertTrue(np.allclose(res, self.model_lin_weights[:, c] * self.x))
def test_distributional_linearity(self):
    x1, x2 = self.x[0:1], self.x[1:]
    p1, p2 = 0.25, 0.75

    class DistLinDoI(DoI):
        '''
        Represents the distribution of interest that weights `z` with
        probability 1/4 and `z + diff` with probability 3/4.
        '''

        def __init__(self, diff):
            super(DistLinDoI, self).__init__()
            self.diff = diff

        def __call__(self, z):
            return [z, z + self.diff, z + self.diff, z + self.diff]

    infl_pt = InternalInfluence(
        self.model_deep,
        InputCut(),
        ClassQoI(0),
        PointDoi(),
        multiply_activation=False)

    attr1 = infl_pt.attributions(x1)
    attr2 = infl_pt.attributions(x2)

    infl_dl = InternalInfluence(
        self.model_deep,
        InputCut(),
        ClassQoI(0),
        DistLinDoI(x2 - x1),
        multiply_activation=False)

    attr12 = infl_dl.attributions(x1)

    self.assertTrue(np.allclose(attr12, p1 * attr1 + p2 * attr2))
def test_idempotence(self):
    infl = InternalInfluence(
        self.model_lin,
        InputCut(),
        MaxClassQoI(),
        PointDoi(),
        multiply_activation=False)
    res1 = infl.attributions(self.x)
    res2 = infl.attributions(self.x)

    self.assertTrue(np.allclose(res1, res2))

    infl_act = InternalInfluence(
        self.model_lin,
        InputCut(),
        MaxClassQoI(),
        PointDoi(),
        multiply_activation=True)
    res1 = infl_act.attributions(self.x)
    res2 = infl_act.attributions(self.x)

    self.assertTrue(np.allclose(res1, res2))
def test_completeness_zero_baseline(self):
    c = 2
    infl = InternalInfluence(
        self.model_deep,
        InputCut(),
        ClassQoI(c),
        LinearDoi(resolution=100),
        multiply_activation=True)

    out_x = self.model_deep.fprop((self.x,))[0][:, c]
    out_baseline = self.model_deep.fprop((self.baseline * 0,))[0][:, c]

    res = infl.attributions(self.x)

    self.assertTrue(
        np.allclose(res.sum(axis=1), out_x - out_baseline, atol=5e-2))
def test_sensitivity(self):
    c = 2
    infl = InternalInfluence(
        self.model_deep,
        InputCut(),
        ClassQoI(c),
        LinearDoi(self.baseline),
        multiply_activation=False)

    out_x = self.model_deep.fprop((self.x[0:1],))[0][:, c]
    out_baseline = self.model_deep.fprop((self.baseline,))[0][:, c]

    if not np.allclose(out_x, out_baseline):
        res = infl.attributions(self.x)

        self.assertEqual(res.shape, (2, self.input_size))
        self.assertNotEqual(res[0, 3], 0.)
def test_multiple_inputs(self):
    x1 = Input((5,))
    z1 = Dense(6)(x1)
    x2 = Input((1,))
    z2 = Concatenate()([z1, x2])
    z3 = Dense(7)(z2)
    y = Dense(3)(z3)

    model = ModelWrapper(Model([x1, x2], y))

    infl = InternalInfluence(model, InputCut(), ClassQoI(1), PointDoi())

    res = infl.attributions(
        [np.array([[1., 2., 3., 4., 5.]]), np.array([[1.]])])

    self.assertEqual(len(res), 2)
    self.assertEqual(res[0].shape, (1, 5))
    self.assertEqual(res[1].shape, (1, 1))
def __get_cut(cut_arg):
    """
    Helper function to get a `Cut` object from more user-friendly primitive
    arguments.
    """
    if isinstance(cut_arg, Cut):
        # We are already given a Cut, so return it.
        return cut_arg

    elif cut_arg is None or cut_arg == 0:
        # If we receive None or zero, we take it to be the input cut.
        return InputCut()

    # TODO(klas): may want a bit more validation here.
    elif isinstance(cut_arg, (int, str)):
        return Cut(cut_arg)

    else:
        raise ValueError('Unrecognized argument type for cut')
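# A minimal usage sketch for `__get_cut` (hypothetical; assumes the helper is
# reachable in this scope). It shows how each primitive argument form maps to
# a `Cut`; the layer index 2 and the name 'dense_1' are made-up examples.
def _example_get_cut_usage():
    assert isinstance(__get_cut(None), InputCut)  # None -> input cut
    assert isinstance(__get_cut(0), InputCut)  # 0 -> input cut
    assert isinstance(__get_cut(2), Cut)  # int -> layer by index
    assert isinstance(__get_cut('dense_1'), Cut)  # str -> layer by name
    cut = Cut('logits')
    assert __get_cut(cut) is cut  # an existing Cut passes through unchanged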
def test_multiple_inputs(self):
    graph = Graph()

    with graph.as_default():
        x1 = tf.placeholder('float32', (None, 5))
        z1 = x1 @ tf.random.normal((5, 6))
        x2 = tf.placeholder('float32', (None, 1))
        z2 = tf.concat([z1, x2], axis=1)
        z3 = z2 @ tf.random.normal((7, 7))
        y = z3 @ tf.random.normal((7, 3))

    model = ModelWrapper(graph, [x1, x2], y)

    infl = InternalInfluence(model, InputCut(), ClassQoI(1), PointDoi())

    res = infl.attributions(
        [np.array([[1., 2., 3., 4., 5.]]), np.array([[1.]])])

    self.assertEqual(len(res), 2)
    self.assertEqual(res[0].shape, (1, 5))
    self.assertEqual(res[1].shape, (1, 1))
def qoi_bprop(
        self,
        qoi,
        model_args,
        model_kwargs={},
        doi_cut=None,
        to_cut=None,
        attribution_cut=None,
        intervention=None):
    """
    qoi_bprop Run the model from `doi_cut` to the QoI layer and return the
        gradients w.r.t. `attribution_cut`.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for, assumed
        to be a *batched* input. If `self.model` supports evaluation on
        *data tensors*, the appropriate tensor type may be used (e.g.,
        PyTorch models may accept PyTorch tensors in addition to
        `np.ndarray`s). The shape of the inputs must match the input shape
        of `self.model`.
    qoi: a Quantity of Interest
        This method will accumulate all gradients of the qoi w.r.t.
        `attribution_cut`.
    doi_cut: Cut
        If `doi_cut` is None, this refers to the InputCut. The Cut from
        which to begin propagation. The shape of `intervention` must match
        the output shape of this layer.
    attribution_cut: Cut, optional
        If `attribution_cut` is None, this refers to the InputCut. The Cut
        in which attribution will be calculated. This is generally taken
        from the attribution slice's `attribution_cut`.
    to_cut: Cut, optional
        If `to_cut` is None, this refers to the OutputCut. The Cut in which
        the qoi will be calculated. This is generally taken from the
        attribution slice's `to_cut`.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.

    Returns
    -------
    (backend.Tensor or np.ndarray)
        The gradients of `qoi` w.r.t. `attribution_cut`, keeping the same
        type as the input.
    """
    if attribution_cut is None:
        attribution_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    doi_cut = doi_cut if doi_cut else InputCut()

    attribution_tensors = self._get_layers(attribution_cut)
    to_tensors = self._get_layers(to_cut)
    doi_tensors = self._get_layers(doi_cut)

    if intervention is None:
        intervention = model_args

    intervention = intervention if isinstance(
        intervention, DATA_CONTAINER_TYPE) else [intervention]

    Q = qoi(to_tensors[0]) if len(to_tensors) == 1 else qoi(to_tensors)

    gradients = [
        keras.backend.function(
            doi_tensors, B.gradient(q, attribution_tensors))(intervention)
        for q in Q
    ] if isinstance(Q, DATA_CONTAINER_TYPE) else keras.backend.function(
        doi_tensors, B.gradient(Q, attribution_tensors))(intervention)

    return gradients[0] if len(gradients) == 1 else gradients
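# A hedged usage sketch for `qoi_bprop` above; `wrapped` and `batch_x` are
# hypothetical stand-ins for a wrapped Keras model and a batched input. This
# mirrors the identity test earlier in this section.
def _example_qoi_bprop_usage(wrapped, batch_x):
    # Gradients of the maximum-class output w.r.t. the model input,
    # evaluated at `batch_x` itself (the default intervention).
    return wrapped.qoi_bprop(
        MaxClassQoI(), (batch_x,),
        attribution_cut=InputCut(),
        to_cut=OutputCut())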
def __init__(
        self,
        model,
        layer,
        channel,
        channel_axis=B.channel_axis,
        agg_fn=None,
        doi=None,
        blur=None,
        threshold=0.5,
        masked_opacity=0.2,
        combine_channels=True,
        use_attr_as_opacity=None,
        positive_only=None):
    """
    Configures the default parameters for the `__call__` method (these can
    be overridden by passing in values to `__call__`).

    Parameters:
        model:
            The wrapped model whose channel we're visualizing.

        layer:
            The identifier (either index or name) of the layer in which the
            channel we're visualizing resides.

        channel:
            Index of the channel (for convolutional layers) or internal
            neuron (for fully-connected layers) that we'd like to
            visualize.

        channel_axis:
            If different from the channel axis specified by the backend,
            the supplied `channel_axis` will be used if operating on a
            convolutional layer with 4-D image format.

        agg_fn:
            Function with which to aggregate the remaining dimensions
            (except the batch dimension) in order to get a single scalar
            value for each channel; if `None`, a sum over each neuron in
            the channel will be taken. This argument is not used when the
            channels are scalars, e.g., for dense layers.

        doi:
            The distribution of interest to use when computing the input
            attributions towards the specified channel. If `None`,
            `PointDoi` will be used.

        blur:
            Gives the radius of a Gaussian blur to be applied to the
            attributions before visualizing. This can be used to help focus
            on salient regions rather than specific salient pixels.

        threshold:
            Value in the range [0, 1]. Attribution values at or below the
            percentile given by `threshold` (after normalization, blurring,
            etc.) will be masked.

        masked_opacity:
            Value in the range [0, 1] specifying the opacity for the parts
            of the image that are masked.

        combine_channels:
            If `True`, the attributions will be averaged across the channel
            dimension, resulting in a 1-channel attribution map.

        use_attr_as_opacity:
            If `True`, instead of using `threshold` and `masked_opacity`,
            the opacity of each pixel is given by the 0-1-normalized
            attribution value.

        positive_only:
            If `True`, only pixels with positive attribution will be
            unmasked (or given nonzero opacity when `use_attr_as_opacity`
            is true).
    """
    self.mask_visualizer = MaskVisualizer(
        blur, threshold, masked_opacity, combine_channels,
        use_attr_as_opacity, positive_only)

    self.infl_input = InternalInfluence(
        model, (InputCut(), Cut(layer)),
        InternalChannelQoI(channel, channel_axis, agg_fn),
        PointDoi() if doi is None else doi)
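# A hedged usage sketch for the visualizer configured above. The class name
# `ChannelMaskVisualizer`, the layer name 'conv5', and the channel index 12
# are assumptions for illustration only.
def _example_channel_visualization(wrapped_model, images):
    visualizer = ChannelMaskVisualizer(
        wrapped_model,
        layer='conv5',  # layer containing the channel of interest
        channel=12,  # channel (or internal neuron) to visualize
        blur=5.,  # Gaussian blur radius for the attribution map
        threshold=0.8)  # mask pixels at or below the 80th percentile

    # Returns `images` masked to the regions most relevant to the channel.
    return visualizer(images)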
def __init__(
        self,
        model: ModelWrapper,
        cut: CutLike = None,
        qoi: QoiLike = 'max',
        doi: DoiLike = 'point',
        multiply_activation: bool = True):
    """
    Parameters:
        model :
            Model for which attributions are calculated.

        cut :
            The cut determining the layer from which the QoI is derived.
            Expects a `Cut` object, or a related type that can be
            interpreted as a `Cut`, as documented below.

            If an `int` is given, it represents the index of a layer in
            `model`.

            If a `str` is given, it represents the name of a layer in
            `model`.

            `None` is an alternative for `slices.OutputCut()`.

        qoi : quantities.QoI | int | tuple | str
            Quantity of interest to attribute. Expects a `QoI` object, or a
            related type that can be interpreted as a `QoI`, as documented
            below.

            If an `int` is given, the quantity of interest is taken to be
            the slice output for the class/neuron/channel specified by the
            given integer, i.e.,
            ```python
            quantities.InternalChannelQoI(qoi)
            ```

            If a tuple or list of two integers is given, then the quantity
            of interest is taken to be the comparative quantity for the
            class given by the first integer against the class given by the
            second integer, i.e.,
            ```python
            quantities.ComparativeQoI(*qoi)
            ```

            If a callable is given, it is interpreted as a function
            representing the QoI, i.e.,
            ```python
            quantities.LambdaQoI(qoi)
            ```

            If the string, `'max'`, is given, the quantity of interest is
            taken to be the output for the class with the maximum score,
            i.e.,
            ```python
            quantities.MaxClassQoI()
            ```

        doi : distributions.DoI | str
            Distribution of interest over inputs. Expects a `DoI` object,
            or a related type that can be interpreted as a `DoI`, as
            documented below.

            If the string, `'point'`, is given, the distribution is taken
            to be the single point passed to `attributions`, i.e.,
            ```python
            distributions.PointDoi()
            ```

            If the string, `'linear'`, is given, the distribution is taken
            to be the linear interpolation from the zero input to the point
            passed to `attributions`, i.e.,
            ```python
            distributions.LinearDoi()
            ```

        multiply_activation : bool, optional
            Whether to multiply the gradient result by its corresponding
            activation, thus converting from "*influence space*" to
            "*attribution space*."
    """
    super().__init__(
        model, (InputCut(), cut),
        qoi,
        doi,
        multiply_activation=multiply_activation)
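# A minimal sketch of the primitive argument forms documented above. The
# class name `InputAttribution` is an assumption (the enclosing class name is
# not shown in this excerpt); `wrapped` and `x` are hypothetical.
def _example_primitive_args(wrapped, x):
    attr_max = InputAttribution(wrapped)  # qoi='max', doi='point'
    attr_unit = InputAttribution(wrapped, qoi=3)  # class/neuron/channel 3
    attr_cmp = InputAttribution(wrapped, qoi=(0, 1))  # class 0 vs. class 1
    attr_lin = InputAttribution(wrapped, doi='linear')  # interpolated DoI
    return attr_max.attributions(x)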
def attributions(self, *model_args, **model_kwargs):
    doi_cut = self.doi.cut() if self.doi.cut() else InputCut()

    doi_val = self.model.fprop(model_args, model_kwargs, to_cut=doi_cut)

    # The DoI supports a tensor or a list of tensors; unwrap the args so the
    # DoI is applied to the top-level list.
    #
    # Depending on the model_args input, the data may be nested in data
    # containers. We unwrap so that the operations below work on a single
    # level of data container.
    if isinstance(doi_val, DATA_CONTAINER_TYPE) and isinstance(
            doi_val[0], DATA_CONTAINER_TYPE):
        doi_val = doi_val[0]

    if isinstance(doi_val, DATA_CONTAINER_TYPE) and len(doi_val) == 1:
        doi_val = doi_val[0]

    D = self.doi(doi_val)
    n_doi = len(D)
    D = InternalInfluence.__concatenate_doi(D)

    # Calculate the gradient of each of the points in the DoI.
    qoi_grads = self.model.qoi_bprop(
        self.qoi,
        model_args,
        model_kwargs,
        attribution_cut=self.slice.from_cut,
        to_cut=self.slice.to_cut,
        intervention=D,
        doi_cut=doi_cut)

    # Take the mean across the samples in the DoI.
    if isinstance(qoi_grads, DATA_CONTAINER_TYPE):
        attributions = [
            B.mean(
                B.reshape(qoi_grad, (n_doi, -1) + qoi_grad.shape[1:]),
                axis=0) for qoi_grad in qoi_grads
        ]
    else:
        attributions = B.mean(
            B.reshape(qoi_grads, (n_doi, -1) + qoi_grads.shape[1:]), axis=0)

    # Multiply by the activation multiplier if specified.
    if self._do_multiply:
        z_val = self.model.fprop(
            model_args, model_kwargs, to_cut=self.slice.from_cut)

        if isinstance(z_val, DATA_CONTAINER_TYPE) and len(z_val) == 1:
            z_val = z_val[0]

        if isinstance(attributions, DATA_CONTAINER_TYPE):
            for i in range(len(attributions)):
                if isinstance(z_val, DATA_CONTAINER_TYPE) and len(
                        z_val) == len(attributions):
                    attributions[i] *= self.doi.get_activation_multiplier(
                        z_val[i])
                else:
                    attributions[i] *= (
                        self.doi.get_activation_multiplier(z_val))
        else:
            attributions *= self.doi.get_activation_multiplier(z_val)

    return attributions
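# A numpy sketch of the averaging step in `attributions` above: gradients for
# all `n_doi` DoI samples come back stacked along the batch axis and are
# averaged per original instance. The shapes are illustrative assumptions.
def _example_doi_mean():
    n_doi, batch, features = 4, 2, 3
    # Stacked as (n_doi * batch, features), matching `__concatenate_doi`.
    qoi_grads = np.arange(
        n_doi * batch * features, dtype=float).reshape(-1, features)
    attributions = np.mean(
        np.reshape(qoi_grads, (n_doi, -1) + qoi_grads.shape[1:]), axis=0)
    assert attributions.shape == (batch, features)
    return attributions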
def qoi_bprop(
        self,
        qoi,
        model_args,
        model_kwargs={},
        doi_cut=None,
        to_cut=None,
        attribution_cut=None,
        intervention=None):
    """
    qoi_bprop Run the model from `doi_cut` to the QoI layer and return the
        gradients w.r.t. `attribution_cut`.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for, assumed
        to be a *batched* input. If `self.model` supports evaluation on
        *data tensors*, the appropriate tensor type may be used (e.g.,
        PyTorch models may accept PyTorch tensors in addition to
        `np.ndarray`s). The shape of the inputs must match the input shape
        of `self.model`.
    qoi: a Quantity of Interest
        This method will accumulate all gradients of the qoi w.r.t.
        `attribution_cut`.
    doi_cut: Cut
        If `doi_cut` is None, this refers to the InputCut. The Cut from
        which to begin propagation. The shape of `intervention` must match
        the output shape of this layer.
    attribution_cut: Cut, optional
        If `attribution_cut` is None, this refers to the InputCut. The Cut
        in which attribution will be calculated. This is generally taken
        from the attribution slice's `attribution_cut`.
    to_cut: Cut, optional
        If `to_cut` is None, this refers to the OutputCut. The Cut in which
        the qoi will be calculated. This is generally taken from the
        attribution slice's `to_cut`.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.

    Returns
    -------
    (backend.Tensor or np.ndarray)
        The gradients of `qoi` w.r.t. `attribution_cut`, keeping the same
        type as the input.
    """
    if attribution_cut is None:
        attribution_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    y, zs = self.fprop(
        model_args,
        model_kwargs,
        doi_cut=doi_cut if doi_cut else InputCut(),
        to_cut=to_cut,
        attribution_cut=attribution_cut,
        intervention=intervention,
        return_tensor=True)

    y = to_cut.access_layer(y)

    grads_list = []
    for z in zs:
        z_flat = ModelWrapper._flatten(z)
        qoi_out = qoi(y)

        grads_flat = [
            B.gradient(B.sum(q), z_flat) for q in qoi_out
        ] if isinstance(qoi_out, DATA_CONTAINER_TYPE) else B.gradient(
            B.sum(qoi_out), z_flat)

        grads = [
            ModelWrapper._unflatten(g, z, count=[0]) for g in grads_flat
        ] if isinstance(
            qoi_out, DATA_CONTAINER_TYPE) else ModelWrapper._unflatten(
                grads_flat, z, count=[0])

        grads = [
            attribution_cut.access_layer(g) for g in grads
        ] if isinstance(
            qoi_out,
            DATA_CONTAINER_TYPE) else attribution_cut.access_layer(grads)

        grads = [B.as_array(g) for g in grads] if isinstance(
            qoi_out, DATA_CONTAINER_TYPE) else B.as_array(grads)

        grads_list.append(grads)

    del y  # TODO: garbage collection

    return grads_list[0] if len(grads_list) == 1 else grads_list
def qoi_bprop(
        self,
        qoi,
        model_args,
        model_kwargs={},
        doi_cut=None,
        to_cut=None,
        attribution_cut=None,
        intervention=None):
    """
    qoi_bprop Run the model from `doi_cut` to the QoI layer and return the
        gradients w.r.t. `attribution_cut`.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for, assumed
        to be a *batched* input. If `self.model` supports evaluation on
        *data tensors*, the appropriate tensor type may be used (e.g.,
        PyTorch models may accept PyTorch tensors in addition to
        `np.ndarray`s). The shape of the inputs must match the input shape
        of `self.model`.
    qoi: a Quantity of Interest
        This method will accumulate all gradients of the qoi w.r.t.
        `attribution_cut`.
    doi_cut: Cut
        If `doi_cut` is None, this refers to the InputCut. The Cut from
        which to begin propagation. The shape of `intervention` must match
        the output shape of this layer.
    attribution_cut: Cut, optional
        If `attribution_cut` is None, this refers to the InputCut. The Cut
        in which attribution will be calculated. This is generally taken
        from the attribution slice's `attribution_cut`.
    to_cut: Cut, optional
        If `to_cut` is None, this refers to the OutputCut. The Cut in which
        the qoi will be calculated. This is generally taken from the
        attribution slice's `to_cut`.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.

    Returns
    -------
    (backend.Tensor or np.ndarray)
        The gradients of `qoi` w.r.t. `attribution_cut`, keeping the same
        type as the input.
    """
    if intervention is None:
        intervention = model_args

    if not self._eager:
        return super().qoi_bprop(
            qoi, model_args, model_kwargs, doi_cut, to_cut, attribution_cut,
            intervention)

    if attribution_cut is None:
        attribution_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    return_numpy = True

    with tf.GradientTape(persistent=True) as tape:
        intervention = intervention if isinstance(
            intervention, DATA_CONTAINER_TYPE) else [intervention]

        # We return a numpy array if we were given a numpy array; otherwise
        # we will let the returned values remain data tensors.
        return_numpy = isinstance(intervention, np.ndarray) or isinstance(
            intervention[0], np.ndarray)

        # Convert `intervention` to a data tensor if it isn't already.
        if return_numpy:
            intervention = [
                ModelWrapper._nested_apply(x_i, tf.constant)
                for x_i in intervention
            ]

        for x_i in intervention:
            ModelWrapper._nested_apply(x_i, tape.watch)

        outputs, attribution_features = self.fprop(
            model_args,
            model_kwargs,
            doi_cut=doi_cut if doi_cut else InputCut(),
            to_cut=to_cut,
            attribution_cut=attribution_cut,
            intervention=intervention)

        if isinstance(outputs, DATA_CONTAINER_TYPE) and isinstance(
                outputs[0], DATA_CONTAINER_TYPE):
            outputs = outputs[0]

        Q = qoi(outputs[0]) if len(outputs) == 1 else qoi(outputs)
        if isinstance(Q, DATA_CONTAINER_TYPE) and len(Q) == 1:
            Q = B.sum(Q)

    grads = [
        tape.gradient(q, attribution_features) for q in Q
    ] if isinstance(Q, DATA_CONTAINER_TYPE) else tape.gradient(
        Q, attribution_features)

    grads = grads[0] if isinstance(
        grads, DATA_CONTAINER_TYPE) and len(grads) == 1 else grads

    grads = [attribution_cut.access_layer(g) for g in grads] if isinstance(
        grads, DATA_CONTAINER_TYPE) else attribution_cut.access_layer(grads)

    del tape

    if return_numpy:
        grads = [
            ModelWrapper._nested_apply(g, B.as_array) for g in grads
        ] if isinstance(
            grads, DATA_CONTAINER_TYPE) else ModelWrapper._nested_apply(
                grads, B.as_array)

    return grads[0] if isinstance(
        grads, DATA_CONTAINER_TYPE) and len(grads) == 1 else grads
def fprop(
        self,
        model_args,
        model_kwargs={},
        doi_cut=None,
        to_cut=None,
        attribution_cut=None,
        intervention=None):
    """
    fprop Forward propagate the model

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for, assumed
        to be a *batched* input. If `self.model` supports evaluation on
        *data tensors*, the appropriate tensor type may be used (e.g.,
        PyTorch models may accept PyTorch tensors in addition to
        `np.ndarray`s). The shape of the inputs must match the input shape
        of `self.model`.
    doi_cut: Cut, optional
        The Cut from which to begin propagation. The shape of
        `intervention` must match the input shape of this layer. This is
        usually used to apply distributions of interest (DoI).
    to_cut : Cut, optional
        The Cut to return output activation tensors for. If `None`,
        assumed to be just the final layer. By default None.
    attribution_cut : Cut, optional
        A Cut to return activation tensors for. If `None`, the attribution
        layer's output is not returned.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model. The
        intervention can also be a `feed_dict`.

    Returns
    -------
    (list of backend.Tensor or np.ndarray)
        A list of output activations is returned, keeping the same type as
        the input. If `attribution_cut` is supplied, also return the cut
        activations.
    """
    if doi_cut is None:
        doi_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    doi_tensors = self._get_layers(doi_cut)
    to_tensors = self._get_layers(to_cut)

    feed_dict, intervention = self._prepare_feed_dict_with_intervention(
        model_args, model_kwargs, intervention, doi_tensors)

    # TensorFlow doesn't allow you to make a function that returns the same
    # tensor as it takes in. Thus, we have to have a special case for the
    # identity function. Any tensors that are both in `doi_tensors` and
    # `to_tensors` cannot be computed via a `keras.backend.function` and
    # thus need to be taken from the input, `x`.
    identity_map = {
        i: j
        for i, to_tensor in enumerate(to_tensors)
        for j, from_tensor in enumerate(doi_tensors)
        if to_tensor == from_tensor
    }

    non_identity_to_tensors = [
        to_tensor for i, to_tensor in enumerate(to_tensors)
        if i not in identity_map
    ]

    # Compute the output values of `to_tensors`, unless all `to_tensor`s
    # were also `doi_tensors`.
    if non_identity_to_tensors:
        out_vals = self._run_session(non_identity_to_tensors, feed_dict)
    else:
        out_vals = []

    # For any `to_tensor`s that were also `from_tensor`s, insert the
    # corresponding concrete input value from `x` in the output's place.
    for i in sorted(identity_map):
        out_vals.insert(i, intervention[identity_map[i]])

    return out_vals
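# A small self-contained sketch of the identity special case handled in
# `fprop` above, using strings as stand-ins for placeholder tensors. Output
# position 0 requests the same tensor as input position 1, so it must be
# filled from the intervention rather than fetched from the session.
def _example_identity_map():
    doi_tensors = ['x1', 'x2']  # stand-ins for the DoI placeholders
    to_tensors = ['x2', 'h']  # 'x2' is also requested as an output
    identity_map = {
        i: j
        for i, to_tensor in enumerate(to_tensors)
        for j, from_tensor in enumerate(doi_tensors)
        if to_tensor == from_tensor
    }
    assert identity_map == {0: 1}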
def qoi_bprop(
        self,
        qoi,
        model_args,
        model_kwargs={},
        doi_cut=None,
        to_cut=None,
        attribution_cut=None,
        intervention=None):
    """
    qoi_bprop Run the model from `doi_cut` to the QoI layer and return the
        gradients w.r.t. `attribution_cut`.

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for, assumed
        to be a *batched* input. If `self.model` supports evaluation on
        *data tensors*, the appropriate tensor type may be used (e.g.,
        PyTorch models may accept PyTorch tensors in addition to
        `np.ndarray`s). The shape of the inputs must match the input shape
        of `self.model`.
    qoi: a Quantity of Interest
        This method will accumulate all gradients of the qoi w.r.t.
        `attribution_cut`.
    doi_cut: Cut
        If `doi_cut` is None, this refers to the InputCut. The Cut from
        which to begin propagation. The shape of `intervention` must match
        the output shape of this layer.
    attribution_cut: Cut, optional
        If `attribution_cut` is None, this refers to the InputCut. The Cut
        in which attribution will be calculated. This is generally taken
        from the attribution slice's `attribution_cut`.
    to_cut: Cut, optional
        If `to_cut` is None, this refers to the OutputCut. The Cut in which
        the qoi will be calculated. This is generally taken from the
        attribution slice's `to_cut`.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model. The
        intervention can also be a `feed_dict`.

    Returns
    -------
    (backend.Tensor or np.ndarray)
        The gradients of `qoi` w.r.t. `attribution_cut`, keeping the same
        type as the input.
    """
    if attribution_cut is None:
        attribution_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    doi_cut = doi_cut if doi_cut else InputCut()

    attribution_tensors = self._get_layers(attribution_cut)
    to_tensors = self._get_layers(to_cut)
    doi_tensors = self._get_layers(doi_cut)

    feed_dict, _ = self._prepare_feed_dict_with_intervention(
        model_args, model_kwargs, intervention, doi_tensors)

    z_grads = []

    with self._graph.as_default():
        for z in attribution_tensors:
            gradient_tensor_key = (z, frozenset(to_tensors))

            if gradient_tensor_key in self._cached_gradient_tensors:
                grads = self._cached_gradient_tensors[gradient_tensor_key]
            else:
                Q = qoi(to_tensors[0]) if len(to_tensors) == 1 else qoi(
                    to_tensors)

                grads = [B.gradient(q, z)[0] for q in Q] if isinstance(
                    Q, DATA_CONTAINER_TYPE) else B.gradient(Q, z)[0]
                grads = grads[0] if isinstance(
                    grads,
                    DATA_CONTAINER_TYPE) and len(grads) == 1 else grads
                grads = [
                    attribution_cut.access_layer(g) for g in grads
                ] if isinstance(
                    grads,
                    DATA_CONTAINER_TYPE) else attribution_cut.access_layer(
                        grads)

                self._cached_gradient_tensors[gradient_tensor_key] = grads

            z_grads.append(grads)

    grad_flat = ModelWrapper._flatten(z_grads)

    gradients = [self._run_session(g, feed_dict) for g in grad_flat]

    gradients = ModelWrapper._unflatten(gradients, z_grads)

    return gradients[0] if len(gradients) == 1 else gradients
def fprop(
        self,
        model_args,
        model_kwargs={},
        doi_cut=None,
        to_cut=None,
        attribution_cut=None,
        intervention=None,
        return_tensor=False,
        input_timestep=None):
    """
    fprop Forward propagate the model

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for, assumed
        to be a *batched* input. If `self.model` supports evaluation on
        *data tensors*, the appropriate tensor type may be used (e.g.,
        PyTorch models may accept PyTorch tensors in addition to
        `np.ndarray`s). The shape of the inputs must match the input shape
        of `self.model`.
    doi_cut: Cut, optional
        The Cut from which to begin propagation. The shape of
        `intervention` must match the input shape of this layer. This is
        usually used to apply distributions of interest (DoI).
    to_cut : Cut, optional
        The Cut to return output activation tensors for. If `None`,
        assumed to be just the final layer. By default None.
    attribution_cut : Cut, optional
        A Cut to return activation tensors for. If `None`, the attribution
        layer's output is not returned.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.
    input_timestep: int, optional
        Specifies a specific timestep to apply the DoI if using an RNN.

    Returns
    -------
    (list of backend.Tensor or np.ndarray)
        A list of output activations is returned, keeping the same type as
        the input. If `attribution_cut` is supplied, also return the cut
        activations.
    """
    if doi_cut is None:
        doi_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    model_args = self._to_tensor(model_args)

    if intervention is None:
        intervention = model_args

    intervention = intervention if isinstance(
        intervention, DATA_CONTAINER_TYPE) else [intervention]
    intervention = self._to_tensor(intervention)

    if isinstance(doi_cut, InputCut):
        model_args = intervention

    else:
        doi_repeated_batch_size = intervention[0].shape[0]
        batched_model_args = []

        for val in model_args:
            doi_resolution = int(doi_repeated_batch_size / val.shape[0])
            tile_shape = [1 for _ in range(len(val.shape))]
            tile_shape[0] = doi_resolution
            repeat_shape = tuple(tile_shape)

            if isinstance(val, np.ndarray):
                val = np.tile(val, repeat_shape)
            elif torch.is_tensor(val):
                val = val.repeat(repeat_shape)

            batched_model_args.append(val)

        model_args = batched_model_args

    if attribution_cut is not None:
        # Specify that we want to preserve gradient information.
        intervention = ModelWrapper._nested_apply(
            intervention,
            lambda intervention: intervention.requires_grad_(True))
        model_args = ModelWrapper._nested_apply(
            model_args, lambda model_args: model_args.requires_grad_(True))

    # Set up the intervention hookfn if we are starting from an intermediate
    # layer.
    if not isinstance(doi_cut, InputCut):
        # Define the hookfn.
        counter = 0

        def intervene_hookfn(self, inpt, outpt):
            nonlocal counter, input_timestep, doi_cut, intervention

            if input_timestep is None or input_timestep == counter:
                # FIXME: generalize to multi-input layers. Currently we can
                #   only intervene on one layer.
                inpt = inpt[0] if len(inpt) == 1 else inpt
                if doi_cut.anchor == 'in':
                    ModelWrapper._nested_assign(inpt, intervention[0])
                else:
                    ModelWrapper._nested_assign(outpt, intervention[0])

            counter += 1

        # Register according to the anchor.
        if doi_cut.anchor == 'in':
            in_handle = (
                self._get_layer(doi_cut.name).register_forward_pre_hook(
                    partial(intervene_hookfn, outpt=None)))
        else:
            in_handle = (
                self._get_layer(doi_cut.name).register_forward_hook(
                    intervene_hookfn))

    # Collect the names and anchors of the layers we want to return.
    names_and_anchors = []

    self._add_cut_name_and_anchor(to_cut, names_and_anchors)

    if attribution_cut:
        self._add_cut_name_and_anchor(attribution_cut, names_and_anchors)

    # Create hookfns to extract the results from the specified layers.
    hooks = {}

    def get_hookfn(layer_name, anchor):

        def hookfn(self, inpt, outpt):
            nonlocal hooks, layer_name, anchor

            # FIXME: generalize to multi-input layers.
            inpt = inpt[0] if len(inpt) == 1 else inpt

            if return_tensor:
                if anchor == 'in':
                    hooks[layer_name] = inpt
                else:
                    # FIXME: will not work for multi-branch outputs; needed
                    #   to ignore the hidden states of RNNs.
                    outpt = outpt[0] if isinstance(outpt, tuple) else outpt
                    hooks[layer_name] = outpt
            else:
                if anchor == 'in':
                    hooks[layer_name] = ModelWrapper._nested_apply(
                        inpt, B.as_array)
                else:
                    outpt = outpt[0] if isinstance(outpt, tuple) else outpt
                    hooks[layer_name] = ModelWrapper._nested_apply(
                        outpt, B.as_array)

        return hookfn

    handles = [
        self._get_layer(name).register_forward_hook(
            get_hookfn(name, anchor))
        for name, anchor in names_and_anchors
        if name is not None
    ]

    # Run the network.
    output = self._model(*model_args, **model_kwargs)
    if isinstance(output, tuple):
        output = output[0]

    if not isinstance(doi_cut, InputCut):
        # Clean up the intervention hook.
        in_handle.remove()

    # Clean up the extraction hooks.
    for handle in handles:
        handle.remove()

    if attribution_cut:
        return [
            self._extract_outputs_from_hooks(
                to_cut, hooks, output, model_args, return_tensor),
            self._extract_outputs_from_hooks(
                attribution_cut, hooks, output, model_args, return_tensor)
        ]
    else:
        return self._extract_outputs_from_hooks(
            to_cut, hooks, output, model_args, return_tensor)
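# A numpy sketch of the argument-tiling step in `fprop` above: when the DoI
# produces `resolution` interventions per instance, each model arg is
# repeated along the batch axis to match. The shapes are illustrative
# assumptions.
def _example_doi_tiling():
    batch, resolution, features = 2, 5, 3
    val = np.arange(batch * features).reshape(batch, features)  # a model arg
    doi_repeated_batch_size = batch * resolution  # intervention batch size
    doi_resolution = int(doi_repeated_batch_size / val.shape[0])
    tile_shape = [1] * len(val.shape)
    tile_shape[0] = doi_resolution
    tiled = np.tile(val, tuple(tile_shape))
    assert tiled.shape == (doi_repeated_batch_size, features)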
def fprop(
        self,
        model_args,
        model_kwargs={},
        doi_cut=None,
        to_cut=None,
        attribution_cut=None,
        intervention=None):
    """
    fprop Forward propagate the model

    Parameters
    ----------
    model_args, model_kwargs:
        The args and kwargs given to the call method of a model. This
        should represent the instances to obtain attributions for, assumed
        to be a *batched* input. If `self.model` supports evaluation on
        *data tensors*, the appropriate tensor type may be used (e.g.,
        PyTorch models may accept PyTorch tensors in addition to
        `np.ndarray`s). The shape of the inputs must match the input shape
        of `self.model`.
    doi_cut: Cut, optional
        The Cut from which to begin propagation. The shape of
        `intervention` must match the input shape of this layer. This is
        usually used to apply distributions of interest (DoI).
    to_cut : Cut, optional
        The Cut to return output activation tensors for. If `None`,
        assumed to be just the final layer. By default None.
    attribution_cut : Cut, optional
        A Cut to return activation tensors for. If `None`, the attribution
        layer's output is not returned.
    intervention : backend.Tensor or np.array
        Input tensor to propagate through the model. If an np.array, it
        will be converted to a tensor on the same device as the model.

    Returns
    -------
    (list of backend.Tensor or np.ndarray)
        A list of output activations is returned, preferring to stay in the
        same format as the input. If `attribution_cut` is supplied, also
        return the cut activations.
    """
    if not self._eager:
        return super().fprop(
            model_args, model_kwargs, doi_cut, to_cut, attribution_cut,
            intervention)

    if doi_cut is None:
        doi_cut = InputCut()
    if to_cut is None:
        to_cut = OutputCut()

    return_numpy = True

    if intervention is not None:
        if not isinstance(intervention, DATA_CONTAINER_TYPE):
            intervention = [intervention]

        # We return a numpy array if we were given a numpy array; otherwise
        # we will let the returned values remain data tensors.
        return_numpy = isinstance(intervention[0], np.ndarray)

        # Convert `intervention` to a data tensor if it isn't already.
        if return_numpy:
            intervention = ModelWrapper._nested_apply(
                intervention, tf.constant)

    try:
        if intervention:
            # Get the inputs and batch them to match the DoI resolution.
            doi_repeated_batch_size = intervention[0].shape[0]
            batched_model_args = []

            for val in model_args:
                if isinstance(val, np.ndarray):
                    doi_resolution = int(
                        doi_repeated_batch_size / val.shape[0])
                    tile_shape = [1] * len(val.shape)
                    tile_shape[0] = doi_resolution
                    val = np.tile(val, tuple(tile_shape))
                elif tf.is_tensor(val):
                    doi_resolution = int(
                        doi_repeated_batch_size / val.shape[0])
                    val = tf.repeat(val, doi_resolution, axis=0)

                batched_model_args.append(val)

            model_args = batched_model_args

            if not isinstance(doi_cut, InputCut):
                from_layers = (
                    self._get_logit_layer()
                    if isinstance(doi_cut, LogitCut) else
                    self._get_output_layer()
                    if isinstance(doi_cut, OutputCut) else
                    self._get_layers_by_name(doi_cut.name))

                for layer, x_i in zip(from_layers, intervention):
                    if doi_cut.anchor == 'in':
                        layer.input_intervention = lambda _: x_i
                    else:
                        layer.output_intervention = lambda _: x_i

            else:
                arg_wrapped_list = False
                # Take care of the Keras Module case where args is a tuple
                # of a list of inputs corresponding to `model._inputs`. This
                # would have gotten unwrapped, as the logic operates on the
                # list of inputs, so it needs to be re-wrapped in a tuple
                # for the model-arg execution.
                if (isinstance(model_args, DATA_CONTAINER_TYPE) and
                        isinstance(model_args[0], DATA_CONTAINER_TYPE)):
                    arg_wrapped_list = True

                model_args = intervention

                if arg_wrapped_list:
                    model_args = (model_args,)

        # Get the output from the "to layers," and possibly the latent
        # layers.
        def retrieve_index(i, results, anchor):

            def retrieve(inputs, output):
                if anchor == 'in':
                    results[i] = (
                        inputs[0]
                        if (isinstance(inputs, DATA_CONTAINER_TYPE) and
                            len(inputs) == 1) else inputs)
                else:
                    results[i] = (
                        output[0]
                        if (isinstance(output, DATA_CONTAINER_TYPE) and
                            len(output) == 1) else output)

            return retrieve

        if isinstance(to_cut, InputCut):
            results = model_args

        else:
            to_layers = (
                self._get_logit_layer() if isinstance(to_cut, LogitCut)
                else self._get_output_layer()
                if isinstance(to_cut, OutputCut) else
                self._get_layers_by_name(to_cut.name))

            results = [None for _ in to_layers]

            for i, layer in enumerate(to_layers):
                layer.retrieve_functions.append(
                    retrieve_index(i, results, to_cut.anchor))

        if attribution_cut:
            if isinstance(attribution_cut, InputCut):
                # The attribution must be the watched tensor given from
                # `qoi_bprop`.
                attribution_results = intervention

            else:
                attribution_layers = (
                    self._get_logit_layer()
                    if isinstance(attribution_cut, LogitCut) else
                    self._get_output_layer()
                    if isinstance(attribution_cut, OutputCut) else
                    self._get_layers_by_name(attribution_cut.name))

                attribution_results = [None for _ in attribution_layers]

                for i, layer in enumerate(attribution_layers):
                    if self._is_input_layer(layer):
                        # Input layers don't end up calling the hook, so we
                        # have to get their output manually.
                        attribution_results[i] = intervention[
                            self._input_layer_index(layer)]
                    else:
                        layer.retrieve_functions.append(
                            retrieve_index(
                                i, attribution_results,
                                attribution_cut.anchor))

        # Run the model on the intervention point(s).
        self._model(*model_args, **model_kwargs)

    finally:
        # Clear the hooks after running the model so that `fprop` doesn't
        # leave the model in an altered state.
        self._clear_hooks()

    if return_numpy:
        results = ModelWrapper._nested_apply(
            results,
            lambda t: t.numpy() if not isinstance(t, np.ndarray) else t)

    return (results, attribution_results) if attribution_cut else results