# Module-level imports assumed by the snippets below (the originals rely on
# them being in scope): NumPy, TensorFlow 1.x (`tf.placeholder`/`tf.Session`),
# the Keras backend, Keras' `Progbar`, and ART's `class_derivative` helper.
import numpy as np
import numpy.linalg as la
import tensorflow as tf
import keras.backend as k
from keras.utils.generic_utils import Progbar

from art.attacks.attack import class_derivative


def loss_sensitivity(x, classifier, sess):
    """
    Local loss sensitivity estimated through the gradients of the loss at points in `x`, as defined in
    https://arxiv.org/pdf/1706.05394.pdf.

    :param x: Data sample of shape that can be fed into `classifier`
    :type x: `np.ndarray`
    :param classifier: A trained model
    :type classifier: :class:`Classifier`
    :param sess: The session for the computation
    :type sess: `tf.Session`
    :return: The average loss sensitivity of the model
    :rtype: `float`
    """
    x_op = tf.placeholder(dtype=tf.float32, shape=list(x.shape))
    y_pred = classifier.predict(x)
    indices = np.argmax(y_pred, axis=1)

    # Build the gradients of the log-predictions w.r.t. the input, one op per class
    grads = class_derivative(classifier._get_predictions(x_op, log=True), x_op,
                             classifier.model.get_output_shape_at(0)[1])
    res = sess.run(grads, feed_dict={x_op: x})

    # For each sample, keep only the gradient of its predicted class
    res = np.asarray([r[0] for r in res])[indices, list(range(x.shape[0]))]

    # Average the L2 norms of the per-sample gradients
    res = la.norm(res.reshape(res.shape[0], -1), ord=2, axis=1)
    return np.mean(res)
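# Usage sketch (not library code): estimate the average loss sensitivity of a
# trained classifier on a held-out batch. `classifier` is assumed to be an ART
# `Classifier` wrapping a Keras model and `sess` the `tf.Session` it runs in;
# both names are placeholders supplied by the caller.
def _example_loss_sensitivity(classifier, sess, x_test):
    # A small batch keeps the per-class gradient computation cheap
    batch = x_test[:100].astype(np.float32)
    sensitivity = loss_sensitivity(batch, classifier, sess)
    print('Average loss sensitivity: {:.4f}'.format(sensitivity))
    return sensitivity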
def generate(self, x_val, **kwargs):
    """
    Generate adversarial samples and return them in an array.

    :param x_val: An array with the original inputs to be attacked.
    :type x_val: `np.ndarray`
    :param y_val: Target values if the attack is targeted
    :type y_val: `np.ndarray`
    :param theta: Perturbation introduced to each modified feature per step (can be positive or negative)
    :type theta: `float`
    :param gamma: Maximum percentage of perturbed features (between 0 and 1)
    :type gamma: `float`
    :param clip_min: Minimum input component value.
    :type clip_min: `float`
    :param clip_max: Maximum input component value.
    :type clip_max: `float`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    # Parse and save attack-specific parameters
    assert self.set_params(**kwargs)
    k.set_learning_phase(0)

    # Initialize variables
    dims = [None] + list(x_val.shape[1:])
    self._x = tf.placeholder(tf.float32, shape=dims)
    dims[0] = 1
    x_adv = np.copy(x_val)
    self._nb_features = np.prod(x_adv.shape[1:])
    self._nb_classes = self.model.output_shape[1]
    x_adv = np.reshape(x_adv, (-1, self._nb_features))

    # Build the prediction and class gradient ops once; both are reused below
    preds_op = tf.argmax(self.classifier.model(self._x), axis=1)
    preds = self.sess.run(preds_op, {self._x: x_val})
    loss = self.classifier._get_predictions(self._x, log=False)
    self._grads = class_derivative(loss, self._x, self._nb_classes)

    # Set number of iterations w.r.t. the total perturbation allowed
    max_iter = np.floor(self._nb_features * self.gamma / 2)

    # Determine target classes for attack
    if 'y_val' not in kwargs or kwargs['y_val'] is None:
        # Randomly choose a target from the incorrect classes for each sample
        from art.utils import random_targets
        targets = np.argmax(random_targets(preds, self._nb_classes), axis=1)
    else:
        targets = kwargs['y_val']

    # Generate the adversarial samples
    for ind, val in enumerate(x_adv):
        # Initialize the search space; remove features that can't be changed further
        if self.theta > 0:
            search_space = {i for i in range(self._nb_features) if val[i] < self.clip_max}
        else:
            search_space = {i for i in range(self._nb_features) if val[i] > self.clip_min}

        nb_iter = 0
        current_pred = preds[ind]
        while current_pred != targets[ind] and nb_iter < max_iter and bool(search_space):
            # Compute saliency map
            feat1, feat2 = self._saliency_map(np.reshape(val, dims), targets[ind], search_space)

            # Move on to the next example if there are no more features to change
            if feat1 == feat2 == 0:
                break

            # Prepare update
            if self.theta > 0:
                clip_func, clip_value = np.minimum, self.clip_max
            else:
                clip_func, clip_value = np.maximum, self.clip_min

            # Update adversarial example
            for feature_ind in [feat1, feat2]:
                val[feature_ind] = clip_func(clip_value, val[feature_ind] + self.theta)

                # Remove indices from search space if max/min values were reached
                if val[feature_ind] == clip_value:
                    search_space.discard(feature_ind)

            # Recompute model prediction
            current_pred = self.sess.run(preds_op, {self._x: np.reshape(val, dims)})
            nb_iter += 1

    x_adv = np.reshape(x_adv, x_val.shape)
    return x_adv
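# Usage sketch (assumptions flagged): `jsma` stands for an already-constructed
# instance of the attack class that owns `generate` above; its constructor is
# not shown here. Only keyword arguments documented in the docstring are used.
def _example_jsma(jsma, x_test):
    # Untargeted run: omitting `y_val` makes the attack draw a random incorrect
    # target class for each sample
    x_adv = jsma.generate(x_test, theta=0.1, gamma=0.5, clip_min=0.0, clip_max=1.0)
    return x_adv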
def generate(self, x_val, **kwargs):
    """
    Generate adversarial samples and return them in a NumPy array.

    :param x_val: An array with the original inputs to be attacked.
    :type x_val: `np.ndarray`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    assert self.set_params(**kwargs)

    dims = list(x_val.shape)
    dims[0] = None
    nb_classes = self.model.output_shape[1]
    xi_op = tf.placeholder(dtype=tf.float32, shape=dims)
    loss = self.classifier.model(xi_op)
    grads_graph = class_derivative(loss, xi_op, nb_classes)
    x_adv = x_val.copy()

    # Progress bar
    progress_bar = Progbar(target=len(x_val), verbose=self.verbose)

    # Initialize variables
    y_pred = self.classifier.model.predict(x_val)
    pred_class = np.argmax(y_pred, axis=1)

    # Main algorithm, run separately on each example
    for j, x in enumerate(x_adv):
        xi = x[None, ...]
        norm_x0 = np.linalg.norm(np.reshape(x, [-1]))
        label = pred_class[j]

        # Main loop of the algorithm
        for _ in range(self.max_iter):
            # Compute the score of the currently predicted class
            score = self.classifier.model.predict(xi)[0][label]

            # Compute the gradient for that class and its norm
            grads = self.sess.run(grads_graph, feed_dict={xi_op: xi})[label][0]
            norm_grad = np.linalg.norm(np.reshape(grads, [-1]))

            # Compute the step size theta
            theta = self._compute_theta(norm_x0, score, norm_grad, nb_classes)

            # Compute the perturbation and apply it
            di = self._compute_pert(theta, grads, norm_grad)
            xi += di

        # Store the adversarial example
        x_adv[j] = xi[0]
        progress_bar.update(current=j, values=[("perturbation",
                                                abs(np.linalg.norm((x_adv[j] - x_val[j]).flatten())))])

    return x_adv
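# Usage sketch (hypothetical `attack` instance of the enclosing class): run the
# attack and report the mean L2 norm of the perturbations it introduced.
def _example_newtonfool(attack, x_test):
    x_adv = attack.generate(x_test)
    avg_pert = np.mean(np.linalg.norm((x_adv - x_test).reshape(len(x_test), -1), axis=1))
    print('Average L2 perturbation: {:.4f}'.format(avg_pert))
    return x_adv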
def generate(self, x_val, **kwargs):
    """
    Generate adversarial samples and return them in an array.

    :param x_val: An array with the original inputs to be attacked.
    :type x_val: `np.ndarray`
    :param max_iter: The maximum number of iterations.
    :type max_iter: `int`
    :param clip_min: Minimum input component value.
    :type clip_min: `float`
    :param clip_max: Maximum input component value.
    :type clip_max: `float`
    :return: An array holding the adversarial examples.
    :rtype: `np.ndarray`
    """
    assert self.set_params(**kwargs)
    k.set_learning_phase(0)

    dims = list(x_val.shape)
    nb_instances = dims[0]
    dims[0] = None
    nb_classes = self.model.output_shape[1]
    xi_op = tf.placeholder(dtype=tf.float32, shape=dims)

    # Build the prediction and class gradient ops once; both are reused below
    preds_op = self.model(xi_op)
    loss = self.classifier._get_predictions(xi_op, log=True)
    grads = class_derivative(loss, xi_op, nb_classes)
    x_adv = x_val.copy()

    # Progress bar
    progress_bar = Progbar(target=len(x_val), verbose=self.verbose)

    for j, x in enumerate(x_adv):
        xi = x[None, ...]

        f, grd = self.sess.run([preds_op, grads], {xi_op: xi})
        # Convert `grd` from a list of per-class gradients to an array so that
        # broadcasting against `grd[fk_hat]` works below
        f, grd = f[0], np.asarray([g[0] for g in grd])
        fk_hat = np.argmax(f)

        fk_i_hat = fk_hat
        nb_iter = 0
        while fk_i_hat == fk_hat and nb_iter < self.max_iter:
            grad_diff = grd - grd[fk_hat]
            f_diff = f - f[fk_hat]

            # Mask the true label so it cannot be picked as the closest class
            mask = [0] * nb_classes
            mask[fk_hat] = 1
            value = np.ma.array(np.abs(f_diff) / np.linalg.norm(grad_diff.reshape(nb_classes, -1), axis=1),
                                mask=mask)

            # Pick the closest class boundary and compute the minimal perturbation
            l_min = value.argmin(fill_value=np.inf)
            r = (abs(f_diff[l_min]) / pow(np.linalg.norm(grad_diff[l_min]), 2)) * grad_diff[l_min]

            # Add perturbation and clip result
            xi += r
            if self.clip_min is not None or self.clip_max is not None:
                xi = np.clip(xi, self.clip_min, self.clip_max)

            # Recompute prediction for new xi
            f, grd = self.sess.run([preds_op, grads], {xi_op: xi})
            f, grd = f[0], np.asarray([g[0] for g in grd])
            fk_i_hat = np.argmax(f)

            nb_iter += 1

        x_adv[j] = xi[0]
        progress_bar.update(current=j, values=[("perturbation",
                                                abs(np.linalg.norm((x_adv[j] - x_val[j]).flatten())))])

    # Compare predicted labels (not raw probabilities) to compute the fooling rate
    true_y = np.argmax(self.model.predict(x_val), axis=1)
    adv_y = np.argmax(self.model.predict(x_adv), axis=1)
    self.fooling_rate = np.sum(true_y != adv_y) / nb_instances

    # Convergence flag (reflects the last sample only) and mean perturbation norm
    self.converged = (nb_iter < self.max_iter)
    self.v = np.mean(np.abs(np.linalg.norm((x_adv - x_val).reshape(nb_instances, -1), axis=1)))

    return x_adv
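# Usage sketch (hypothetical `attack` instance of the enclosing class): run the
# attack with the documented keyword arguments, then read back the diagnostics
# that `generate` stores on the instance.
def _example_deepfool(attack, x_test):
    x_adv = attack.generate(x_test, max_iter=50, clip_min=0.0, clip_max=1.0)
    print('Fooling rate: {:.2%}'.format(attack.fooling_rate))
    print('Mean perturbation norm: {:.4f}'.format(attack.v))
    return x_adv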