def _project_perturbation(perturbation, epsilon, input_image, clip_min=None, clip_max=None):
    """Project `perturbation` onto the L-infinity ball and the valid input range.

    The perturbation is first clipped to [-epsilon, epsilon]; the perturbed
    image `input_image + perturbation` is then clipped to
    [clip_min, clip_max]. The value returned is the difference between that
    clipped image and `input_image`.

    :param perturbation: candidate perturbation tensor.
    :param epsilon: radius of the L-infinity ball.
    :param input_image: image the perturbation is applied to; asserted to lie
                        within [clip_min, clip_max].
    :param clip_min: lower bound of the valid input range (required).
    :param clip_max: upper bound of the valid input range (required).
    :raises NotImplementedError: if `clip_min` or `clip_max` is None.
    :return: the projected perturbation.
    """
    bounds_missing = clip_min is None or clip_max is None
    if bounds_missing:
        raise NotImplementedError("_project_perturbation currently has clipping "
                                  "hard-coded in.")

    image_dtype = input_image.dtype
    # Range checks on the input; everything built inside the `with` block
    # depends on them.
    below_max = utils_tf.assert_less_equal(
        input_image, tf.cast(clip_max, image_dtype))
    above_min = utils_tf.assert_greater_equal(
        input_image, tf.cast(clip_min, image_dtype))

    with tf.control_dependencies([below_max, above_min]):
        bounded = utils_tf.clip_by_value(perturbation, -epsilon, epsilon)
        perturbed = utils_tf.clip_by_value(
            input_image + bounded, clip_min, clip_max)
        return perturbed - input_image
def test_clip_by_value_numpy_dtype(self):
    """Smoke test: clip_by_value accepts a mix of numpy and tf bounds.

    Nothing is asserted about the result; the test passes as long as the
    casting logic inside `utils_tf.clip_by_value` does not raise.
    """
    lower_np = np.zeros((1,))
    upper_tf = tf.ones((1,))
    values = tf.ones((1,))
    utils_tf.clip_by_value(values, lower_np, upper_tf)
def body(i, ax, m):
    """Run one momentum iteration.

    :param i: loop counter.
    :param ax: current adversarial example.
    :param m: accumulated (momentum) gradient.
    :return: tuple `(i + 1, updated ax, updated m)`.
    """
    loss = self.loss_func(labels=y, logits=self.model.get_logits(ax))
    if targeted:
        loss = -loss

    # Gradient of the loss with respect to the current adversarial input
    [grad] = tf.gradients(loss, ax)

    # Normalize by the mean absolute value over all non-batch axes, guarding
    # against division by zero, then fold into the accumulated gradient.
    non_batch_axes = list(xrange(1, len(grad.get_shape())))
    floor = tf.cast(1e-12, grad.dtype)
    mean_abs = reduce_mean(tf.abs(grad), non_batch_axes, keepdims=True)
    grad = grad / tf.maximum(floor, mean_abs)
    m = self.decay_factor * m + grad

    step = optimize_linear(m, self.eps_iter, self.ord)
    if self.ord == 1:
        raise NotImplementedError(
            "This attack hasn't been tested for ord=1."
            "It's not clear that FGM makes a good inner "
            "loop step for iterative optimization since "
            "it updates just one coordinate at a time.")

    # Take the step, then clip the total perturbation and (when both bounds
    # are set) the resulting example.
    ax = ax + step
    ax = x + utils_tf.clip_eta(ax - x, self.ord, self.eps)
    both_bounds = self.clip_min is not None and self.clip_max is not None
    if both_bounds:
        ax = utils_tf.clip_by_value(ax, self.clip_min, self.clip_max)

    return i + 1, tf.stop_gradient(ax), m
def body(i, adv_x):
    """Do a projected gradient step"""
    labels, _ = self.get_or_guess_labels(adv_x, {y_kwarg: y})
    is_targeted = self.y_target is not None
    stepped = sparse_l1_descent(
        adv_x,
        self.model.get_logits(adv_x),
        y=labels,
        eps=self.eps_iter,
        q=self.grad_sparsity,
        clip_min=self.clip_min,
        clip_max=self.clip_max,
        clip_grad=self.clip_grad,
        targeted=is_targeted,
        sanity_checks=self.sanity_checks)

    # Clip the total perturbation to the l1-ball around x
    delta = clip_eta(stepped - x, ord=1, eps=self.eps)
    projected = x + delta

    # Subtracting and re-adding the perturbation can add some small numerical
    # error, so the range clipping is applied once more.
    no_bounds = self.clip_min is None and self.clip_max is None
    if not no_bounds:
        projected = utils_tf.clip_by_value(
            projected, self.clip_min, self.clip_max)

    return i + 1, projected
def update_and_clip(ax, perturbation):
    """Apply `perturbation` to `ax` and clip the result.

    The total perturbation relative to `x` is clipped with
    `utils_tf.clip_eta` using `self.ord` / `self.eps`; the example is then
    clipped to [self.clip_min, self.clip_max] when both bounds are set.

    :param ax: current adversarial example.
    :param perturbation: step to add to `ax`.
    :return: the updated example, wrapped in `tf.stop_gradient`.
    """
    moved = ax + perturbation
    clipped = x + utils_tf.clip_eta(moved - x, self.ord, self.eps)
    both_bounds = self.clip_min is not None and self.clip_max is not None
    if both_bounds:
        clipped = utils_tf.clip_by_value(clipped, self.clip_min, self.clip_max)
    return tf.stop_gradient(clipped)
def body(i, ax, m):
    """Do a momentum step

    :param i: loop counter.
    :param ax: current adversarial example.
    :param m: accumulated (momentum) gradient.
    :return: tuple `(i + 1, updated ax, updated m)`.
    :raises NotImplementedError: if `self.ord == 1`.
    :raises ValueError: if `loss_type` is neither 'softmax' nor 'ctc'.
    """
    # Reject the unsupported norm before building any ops.
    if self.ord == 1:
        raise NotImplementedError("This attack hasn't been tested for ord=1. It's not clear that FGM makes a good inner loop step "
                                  "for iterative optimization since it updates just one coordinate at a time.")

    # NOTE: an early-stopping branch used to live in the softmax path behind a
    # hard-coded `early_stop = False`; it could never run and was removed.
    if loss_type == 'softmax':
        logits = self.model.get_logits(ax)
        loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
        loss = tf.reduce_mean(loss, name='softmax_loss')
    elif loss_type == "ctc":
        # For CTC the model returns time-major logits plus sequence lengths.
        time_major_logits, output_seq_len = self.model.get_logits(ax)
        ctc_loss = tf.nn.ctc_loss(labels=y,
                                  inputs=time_major_logits,
                                  sequence_length=output_seq_len,
                                  time_major=True,
                                  ctc_merge_repeated=True,
                                  ignore_longer_outputs_than_inputs=True)
        loss = tf.reduce_mean(ctc_loss, name='ctc_loss')
    else:
        # Previously this fell through and failed later with a NameError on
        # `loss`; fail with an explicit message instead.
        raise ValueError(
            "Unsupported loss_type: {!r}; expected 'softmax' or 'ctc'.".format(loss_type))

    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, ax)

    # Normalize current gradient and add it to the accumulated gradient
    red_ind = list(range(1, len(grad.get_shape())))
    avoid_zero_div = tf.cast(1e-12, grad.dtype)
    grad = grad / tf.maximum(avoid_zero_div,
                             tf.reduce_mean(tf.abs(grad), red_ind, keepdims=True))
    m = self.decay_factor * m + grad

    optimal_perturbation = optimize_linear_pos(m, self.eps_iter, self.ord, self.pert_type)
    # Element-wise multiplication with self.mask
    optimal_perturbation = tf.multiply(optimal_perturbation, self.mask, name="op_multiply")

    # Update and clip adversarial example in current iteration
    ax = ax + optimal_perturbation
    ax = x + utils_tf.clip_eta(ax - x, self.ord, self.eps)

    if self.clip_min is not None and self.clip_max is not None:
        ax = utils_tf.clip_by_value(ax, self.clip_min, self.clip_max)

    ax = tf.stop_gradient(ax)

    return i + 1, ax, m
def _project_perturbation(perturbation, epsilon, input_image, clip_min=None, clip_max=None):
    """Project `perturbation` onto L-infinity ball of radius `epsilon`.

    Also project into hypercube such that the resulting adversarial example
    is between clip_min and clip_max, if applicable (i.e. when both bounds
    are given). With either bound left as None only the epsilon clipping is
    applied.

    :param perturbation: candidate perturbation tensor.
    :param epsilon: radius of the L-infinity ball.
    :param input_image: image the perturbation is applied to.
    :param clip_min: (optional) lower bound for the perturbed image.
    :param clip_max: (optional) upper bound for the perturbed image.
    :return: the projected perturbation.
    """
    clipped_perturbation = utils_tf.clip_by_value(perturbation, -epsilon, epsilon)
    new_image = input_image + clipped_perturbation
    # The bounds were previously accepted but silently ignored, contradicting
    # the documented contract; honour them when both are provided.
    if clip_min is not None and clip_max is not None:
        new_image = utils_tf.clip_by_value(new_image, clip_min, clip_max)
    return new_image - input_image
def add_noise(x, eps=0.3, clip_min=None, clip_max=None, type='Gaussian',
              sanity_checks=False):
    """
    Add random noise to `x` and optionally clip the result.

    :param x: the input placeholder
    :param eps: noise scale: the standard deviation for 'Gaussian' noise, or
                the half-width of the interval [-eps, eps] for 'Uniform' noise
    :param clip_min: Minimum float value for noisy example components
    :param clip_max: Maximum float value for noisy example components
    :param type: noise distribution, either 'Gaussian' or 'Uniform'
    :param sanity_checks: (optional bool) if True, make the result depend on
                          assertions that `x` lies within the given data
                          range. Defaults to False, which matches the
                          previous behaviour (the assertions were built but
                          never attached to the output).
    :raises ValueError: if `type` is not a supported noise type
    :return: a tensor for the noisy example
    """
    # Fail early with a clear message; previously an unknown type printed a
    # message and then crashed on the unbound `perturbation` variable.
    if type not in ('Gaussian', 'Uniform'):
        raise ValueError(
            "Unknown noise type: {!r}; expected 'Gaussian' or 'Uniform'".format(type))

    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))

    if clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    # Use the dynamic shape and x's dtype so that inputs with an unknown
    # batch dimension or a non-float32 dtype are supported.
    noise_shape = tf.shape(x)
    if type == 'Gaussian':
        perturbation = tf.random.normal(noise_shape, mean=0.0, stddev=eps,
                                        dtype=x.dtype)
    else:
        perturbation = tf.random.uniform(noise_shape, minval=-eps, maxval=eps,
                                         dtype=x.dtype)

    # Add perturbation to original example to obtain the noisy example
    adv_x = x + perturbation

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) or (clip_max is not None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
def body(i, adv_x):
    """Run one FGM step, then re-clip the perturbation and the example.

    :param i: loop counter.
    :param adv_x: current adversarial example.
    :return: tuple `(i + 1, updated adv_x)`.
    """
    stepped = FGM.generate(adv_x, **fgm_params)

    # Clip the total perturbation to the self.ord norm ball
    delta = clip_eta(stepped - x, self.ord, self.eps)
    projected = x + delta

    # FGM already clipped to the data range, but subtracting and re-adding
    # the perturbation can add some small numerical error, so clip again.
    no_bounds = self.clip_min is None and self.clip_max is None
    if not no_bounds:
        projected = utils_tf.clip_by_value(
            projected, self.clip_min, self.clip_max)

    return i + 1, projected
def fgm_perturb(x, y, loss_fn, clip_min=None, clip_max=None, ord=np.inf, eps=0.3):
    """Take one fast-gradient step on `x` using a caller-supplied loss.

    :param x: input tensor to perturb.
    :param y: labels; accepted for interface compatibility but not used here
              (the loss is computed as `loss_fn(x)` only).
    :param loss_fn: callable mapping `x` to the loss to differentiate.
    :param clip_min: (optional) minimum value for result components.
    :param clip_max: (optional) maximum value for result components.
    :param ord: order of the norm passed to `optimize_linear`.
    :param eps: step size passed to `optimize_linear`.
    :return: a tensor for the perturbed example.
    """
    [grad] = tf.gradients(loss_fn(x), x)
    adv_x = x + optimize_linear(grad, eps, ord)

    if clip_min is None and clip_max is None:
        return adv_x

    # We don't currently support one-sided clipping
    assert clip_min is not None and clip_max is not None
    return utils_tf.clip_by_value(adv_x, clip_min, clip_max)
def body(i, ax, m):
    """Run one momentum iteration with smoothed, normalized gradients.

    :param i: loop counter.
    :param ax: current adversarial example.
    :param m: accumulated (momentum) gradient.
    :return: tuple `(i + 1, updated ax, updated m)`.
    """
    loss = softmax_cross_entropy_with_logits(
        labels=y, logits=self.model.get_logits(ax))
    if targeted:
        loss = -loss

    # Gradient of the loss with respect to the current adversarial input
    [raw_grad] = tf.gradients(loss, ax)

    # Smooth, then normalize the current gradient
    grad = self.grad_norm(self.grad_smooth(raw_grad))

    # Momentum update; the accumulator itself is normalized as well
    m = self.grad_norm(self.decay_factor * m + grad)

    step = optimize_linear(m, self.eps_iter, self.ord)
    if self.ord == 1:
        raise NotImplementedError(
            "This attack hasn't been tested for ord=1."
            "It's not clear that FGM makes a good inner "
            "loop step for iterative optimization since "
            "it updates just one coordinate at a time.")

    # Take the step, then clip the total perturbation and (when both bounds
    # are set) the resulting example.
    ax = ax + step
    ax = x + utils_tf.clip_eta(ax - x, self.ord, self.eps)
    both_bounds = self.clip_min is not None and self.clip_max is not None
    if both_bounds:
        ax = utils_tf.clip_by_value(ax, self.clip_min, self.clip_max)

    return i + 1, tf.stop_gradient(ax), m
def generate(self, x, **kwargs):
    """
    Generate symbolic graph for adversarial examples and return.

    :param x: The model's symbolic inputs.
    :param kwargs: See `parse_params`
    :raises NotImplementedError: if `self.ord == 1`.
    :return: a tensor for the adversarial example
    """
    # Parse and save attack-specific parameters
    assert self.parse_params(**kwargs)

    asserts = []

    # If a data range was specified, check that the input was in that range
    if self.clip_min is not None:
        asserts.append(utils_tf.assert_greater_equal(x, tf.cast(self.clip_min, x.dtype)))

    if self.clip_max is not None:
        asserts.append(utils_tf.assert_less_equal(x, tf.cast(self.clip_max, x.dtype)))

    # Initialize loop variables
    if self.rand_init:
        eta = tf.random_uniform(tf.shape(x),
                                tf.cast(-self.rand_minmax, x.dtype),
                                tf.cast(self.rand_minmax, x.dtype),
                                dtype=x.dtype)
    else:
        # tf.zeros defaults to float32; match x's dtype (as the random branch
        # does) so that `x + eta` also works for non-float32 inputs.
        eta = tf.zeros(tf.shape(x), dtype=x.dtype)

    # Clip eta
    eta = clip_eta(eta, self.ord, self.eps)
    adv_x = x + eta
    if self.clip_min is not None or self.clip_max is not None:
        adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    if self.y_target is not None:
        y = self.y_target
        targeted = True
    elif self.y is not None:
        y = self.y
        targeted = False
    else:
        # No labels given: use the model's own predictions (one-hot of the
        # maximum probability) as labels.
        model_preds = self.model.get_probs(x)
        preds_max = reduce_max(model_preds, 1, keepdims=True)
        y = tf.to_float(tf.equal(model_preds, preds_max))
        y = tf.stop_gradient(y)
        targeted = False
        del model_preds

    y_kwarg = 'y_target' if targeted else 'y'
    fgm_params = {
        'eps': self.eps_iter,
        y_kwarg: y,
        'ord': self.ord,
        'clip_min': self.clip_min,
        'clip_max': self.clip_max
    }
    if self.ord == 1:
        raise NotImplementedError("It's not clear that FGM is a good inner loop"
                                  " step for PGD when ord=1, because ord=1 FGM "
                                  " changes only one pixel at a time. We need "
                                  " to rigorously test a strong ord=1 PGD "
                                  "before enabling this feature.")

    # Use getattr() to avoid errors in eager execution attacks
    FGM = self.FGM_CLASS(
        self.model,
        sess=getattr(self, 'sess', None),
        dtypestr=self.dtypestr)

    def cond(i, _):
        return tf.less(i, self.nb_iter)

    def body(i, adv_x):
        adv_x = FGM.generate(adv_x, **fgm_params)

        # Clipping perturbation eta to self.ord norm ball
        eta = adv_x - x
        eta = clip_eta(eta, self.ord, self.eps)
        adv_x = x + eta

        # Redo the clipping.
        # FGM already did it, but subtracting and re-adding eta can add some
        # small numerical error.
        if self.clip_min is not None or self.clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        return i + 1, adv_x

    _, adv_x = tf.while_loop(cond, body, (tf.zeros([]), adv_x), back_prop=True,
                             maximum_iterations=self.nb_iter)

    # Asserts run only on CPU.
    # When multi-GPU eval code tries to force all PGD ops onto GPU, this
    # can cause an error.
    common_dtype = tf.float64
    asserts.append(utils_tf.assert_less_equal(tf.cast(self.eps_iter,
                                                      dtype=common_dtype),
                                              tf.cast(self.eps, dtype=common_dtype)))
    if self.ord == np.inf and self.clip_min is not None:
        # The 1e-6 is needed to compensate for numerical error.
        # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5,
        # clip_max=.7
        asserts.append(utils_tf.assert_less_equal(tf.cast(self.eps, x.dtype),
                                                  1e-6 + tf.cast(self.clip_max,
                                                                 x.dtype)
                                                  - tf.cast(self.clip_min,
                                                            x.dtype)))

    if self.sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
def generate(self, x, **kwargs):
    """
    Generate symbolic graph for adversarial examples and return.

    :param x: The model's symbolic inputs.
    :param kwargs: See `parse_params`
    :return: a tensor for the adversarial example
    """
    # Parse and save attack-specific parameters
    assert self.parse_params(**kwargs)

    asserts = []

    # If a data range was specified, check that the input was in that range
    if self.clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(self.clip_min, x.dtype)))

    if self.clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(self.clip_max, x.dtype)))

    # Initialize loop variables
    if self.rand_init:
        eta = random_lp_vector(tf.shape(x), ord=1,
                               eps=tf.cast(self.eps, x.dtype), dtype=x.dtype)
    else:
        # tf.zeros defaults to float32; match x's dtype (as the random branch
        # does) so that `x + eta` also works for non-float32 inputs.
        eta = tf.zeros(tf.shape(x), dtype=x.dtype)

    # Clip eta
    eta = clip_eta(eta, ord=1, eps=self.eps)
    adv_x = x + eta
    if self.clip_min is not None or self.clip_max is not None:
        adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    if self.y_target is not None:
        y = self.y_target
        targeted = True
    elif self.y is not None:
        y = self.y
        targeted = False
    else:
        # No labels given: use the model's own predictions (one-hot of the
        # maximum probability) as labels.
        model_preds = self.model.get_probs(x)
        preds_max = tf.reduce_max(model_preds, 1, keepdims=True)
        y = tf.to_float(tf.equal(model_preds, preds_max))
        y = tf.stop_gradient(y)
        targeted = False
        del model_preds

    y_kwarg = 'y_target' if targeted else 'y'

    def cond(i, _):
        """Iterate until requested number of iterations is completed"""
        return tf.less(i, self.nb_iter)

    def body(i, adv_x):
        """Do a projected gradient step"""

        labels, _ = self.get_or_guess_labels(adv_x, {y_kwarg: y})
        logits = self.model.get_logits(adv_x)

        adv_x = sparse_l1_descent(adv_x,
                                  logits,
                                  y=labels,
                                  eps=self.eps_iter,
                                  q=self.grad_sparsity,
                                  clip_min=self.clip_min,
                                  clip_max=self.clip_max,
                                  clip_grad=self.clip_grad,
                                  targeted=(self.y_target is not None),
                                  sanity_checks=self.sanity_checks)

        # Clipping perturbation eta to the l1-ball
        eta = adv_x - x
        eta = clip_eta(eta, ord=1, eps=self.eps)
        adv_x = x + eta

        # Redo the clipping.
        # Subtracting and re-adding eta can add some small numerical error.
        if self.clip_min is not None or self.clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        return i + 1, adv_x

    _, adv_x = tf.while_loop(cond, body, (tf.zeros([]), adv_x), back_prop=True,
                             maximum_iterations=self.nb_iter)

    # Asserts run only on CPU.
    # When multi-GPU eval code tries to force all PGD ops onto GPU, this
    # can cause an error.
    common_dtype = tf.float32
    asserts.append(
        utils_tf.assert_less_equal(
            tf.cast(self.eps_iter, dtype=common_dtype),
            tf.cast(self.eps, dtype=common_dtype)))

    if self.sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
def generate(self, x, **kwargs):
    """
    Generate symbolic graph for adversarial examples and return.

    :param x: The model's symbolic inputs.
    :param eps: (optional float) maximum distortion of adversarial example
                compared to original input
    :param eps_iter: (optional float) step size for each attack iteration
    :param nb_iter: (optional int) Number of attack iterations.
    :param rand_init: (optional) Whether to use random initialization
    :param y: (optional) A tensor with the true class labels
              NOTE: do not use smoothed labels here
    :param y_target: (optional) A tensor with the labels to target. Leave
                     y_target=None if y is also set. Labels should be
                     one-hot-encoded.
                     NOTE: do not use smoothed labels here
    :param ord: (optional) Order of the norm (mimics Numpy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: (optional float) Minimum input component value
    :param clip_max: (optional float) Maximum input component value
    :raises NotImplementedError: if `self.ord == 1`.
    :return: a tensor for the adversarial example
    """
    # Parse and save attack-specific parameters
    assert self.parse_params(**kwargs)

    # Initialize loop variables
    if self.rand_init:
        eta = tf.random_uniform(tf.shape(x),
                                tf.cast(-self.rand_minmax, x.dtype),
                                tf.cast(self.rand_minmax, x.dtype),
                                dtype=x.dtype)
    else:
        # tf.zeros defaults to float32; match x's dtype (as the random branch
        # does) so that `x + eta` also works for non-float32 inputs.
        eta = tf.zeros(tf.shape(x), dtype=x.dtype)

    # Clip eta
    eta = clip_eta(eta, self.ord, self.eps)
    adv_x = x + eta
    if self.clip_min is not None or self.clip_max is not None:
        adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    if self.y_target is not None:
        y = self.y_target
        targeted = True
    elif self.y is not None:
        y = self.y
        targeted = False
    else:
        # No labels given: use the model's own predictions (one-hot of the
        # maximum probability) as labels.
        model_preds = self.model.get_probs(x)
        preds_max = reduce_max(model_preds, 1, keepdims=True)
        y = tf.to_float(tf.equal(model_preds, preds_max))
        y = tf.stop_gradient(y)
        targeted = False
        del model_preds

    y_kwarg = 'y_target' if targeted else 'y'
    fgm_params = {
        'eps': self.eps_iter,
        y_kwarg: y,
        'ord': self.ord,
        'clip_min': self.clip_min,
        'clip_max': self.clip_max,
        'loss_func': self.loss_func
    }
    if self.ord == 1:
        raise NotImplementedError(
            "It's not clear that FGM is a good inner loop"
            " step for PGD when ord=1, because ord=1 FGM "
            " changes only one pixel at a time. We need "
            " to rigorously test a strong ord=1 PGD "
            "before enabling this feature.")

    # Use getattr() to avoid errors in eager execution attacks
    FGM = self.FGM_CLASS(self.model,
                         sess=getattr(self, 'sess', None),
                         dtypestr=self.dtypestr)

    def cond(i, _):
        return tf.less(i, self.nb_iter)

    def body(i, adv_x):
        adv_x = FGM.generate(adv_x, **fgm_params)

        # Clipping perturbation eta to self.ord norm ball
        eta = adv_x - x
        eta = clip_eta(eta, self.ord, self.eps)
        adv_x = x + eta

        # Redo the clipping.
        # FGM already did it, but subtracting and re-adding eta can add some
        # small numerical error.
        if self.clip_min is not None or self.clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        return i + 1, adv_x

    _, adv_x = tf.while_loop(cond, body, [tf.zeros([]), adv_x], back_prop=True)

    asserts = []

    # Asserts run only on CPU.
    # When multi-GPU eval code tries to force all PGD ops onto GPU, this
    # can cause an error.
    with tf.device("/CPU:0"):
        asserts.append(tf.assert_less_equal(self.eps_iter, self.eps))
        if self.ord == np.inf and self.clip_min is not None:
            # The 1e-6 is needed to compensate for numerical error.
            # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5,
            # clip_max=.7
            asserts.append(
                tf.assert_less_equal(self.eps,
                                     1e-6 + self.clip_max - self.clip_min))

    if self.sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
def generate(self, x, **kwargs):
    """
    Build an unrolled iterative attack graph, run it in `self.sess` and save
    the resulting adversarial examples as PNG files in `self.adv_dir`.

    Each iteration normalizes the loss gradient, updates exponential moving
    averages of the gradient (`m_cache`, weight `self.betha1`) and of its
    square (`v_cache`, weight `self.betha2`), and steps along
    `m_cache / sqrt(v_cache + 1e-8)` via `optimize_linear`.

    NOTE(review): the source layout of this function was lost. The session
    section below is assumed to sit after the iteration loop, so `i` there
    is the last loop index (and is unbound if `self.nb_iter` is 0). Confirm
    against the original file.

    :param x: The model's symbolic inputs.
    :param kwargs: passed to `parse_params` and `get_or_guess_labels`.
    :raises NotImplementedError: if `self.ord == 1` (raised in the first
                                 iteration).
    :return: nothing; there is no return statement.
    """
    assert self.parse_params(**kwargs)

    asserts = []

    # If a data range was specified, check that the input was in that range
    if self.clip_min is not None:
        asserts.append(utils_tf.assert_greater_equal(
            x, tf.cast(self.clip_min,x.dtype)))

    if self.clip_max is not None:
        asserts.append(utils_tf.assert_less_equal(
            x, tf.cast(self.clip_max, x.dtype)))

    # Moving averages of the gradient and of the squared gradient
    m_cache = tf.zeros_like(x)
    v_cache = tf.zeros_like(x)
    adv_x = x

    y, _nb_classes = self.get_or_guess_labels(x, kwargs)
    # Rescale each row of labels so that it sums to 1
    y = y / reduce_sum(y, 1, keepdims=True)
    targeted = (self.y_target is not None)

    def save_batch(directory, images, labels, iteration, batch_idx):
        # Write one PNG per image; the file name encodes the index within the
        # batch, the batch index, the iteration and the argmax of the label.
        for idx, (image, label) in enumerate(zip(images, labels)):
            filename = "id{}_b{}_it{}_l{}.png".format(idx, batch_idx, iteration,
                                                      np.argmax(label))
            save_image_np(join(directory, filename), image)

    # Python-level loop: the graph is unrolled self.nb_iter times
    for i in range(self.nb_iter):
        self.logger.debug("Starting #{} iteration".format(i + 1))

        logits = self.model.get_logits(adv_x)
        loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
        if targeted:
            loss = -loss

        # Gradient of the loss with respect to the current adversarial input
        grad, = tf.gradients(loss, adv_x)

        # Normalize by the mean absolute value over all non-batch axes
        red_ind = list(range(1, len(grad.get_shape())))
        avoid_zero_div = tf.cast(1e-8, grad.dtype)
        grad = grad / tf.maximum(
            avoid_zero_div,
            reduce_mean(tf.abs(grad), red_ind, keepdims=True))

        m_cache = self.betha1 * m_cache + (1 - self.betha1) * grad
        v_cache = self.betha2 * v_cache + (1 - self.betha2) * tf.square(grad)
        update = tf.divide(m_cache, tf.sqrt(v_cache + avoid_zero_div))

        optimal_perturbation = optimize_linear(update, self.eps_iter, self.ord)
        if self.ord == 1:
            raise NotImplementedError("This attack hasn't been tested for ord=1."
                                      "It's not clear that FGM makes a good inner "
                                      "loop step for iterative optimization since "
                                      "it updates just one coordinate at a time.")

        # Take the step, then clip the total perturbation and (when both
        # bounds are set) the resulting example
        adv_x = adv_x + optimal_perturbation
        adv_x = x + utils_tf.clip_eta(adv_x - x, self.ord, self.eps)

        if self.clip_min is not None and self.clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        adv_x = tf.stop_gradient(adv_x)

    if self.sanity_checks:
        # Make the output depend on the input range assertions
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    with self.sess.as_default():
        self.sess.run(self.init_op)
        for batch in range(self.nb_batches):
            adv_x_np, y_np = self.sess.run([adv_x, y])
            self.logger.debug("Saving attacked batch #{}".format(batch + 1))
            # `i` is the loop variable left over from the iteration loop above
            save_batch(self.adv_dir, adv_x_np, y_np, i, batch)
def sparse_l1_descent(x, logits, y=None, eps=1.0, q=99, clip_min=None, clip_max=None,
                      clip_grad=False, targeted=False, sanity_checks=True):
    """
    TensorFlow implementation of the Sparse L1 Descent Method.

    :param x: the input placeholder
    :param logits: output of model.get_logits
    :param y: (optional) A placeholder for the true labels. If targeted
              is true, then provide the target label. Otherwise, only provide
              this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as
              labels to avoid the "label leaking" effect (explained in this
              paper: https://arxiv.org/abs/1611.01236). Default is None.
              Labels should be one-hot-encoded.
    :param eps: the epsilon (input variation parameter)
    :param q: the percentile above which gradient values are retained. Either a
              scalar or a vector of same length as the input batch dimension.
              With q=100 only the entries tied for the largest absolute
              gradient are retained.
    :param clip_min: Minimum float value for adversarial example components
    :param clip_max: Maximum float value for adversarial example components
    :param clip_grad: (optional bool) Ignore gradient components
                      at positions where the input is already at the boundary
                      of the domain, and the update step will get clipped out.
    :param targeted: Is the attack targeted or untargeted? Untargeted, the
                     default, will try to make the label incorrect. Targeted
                     will instead try to move in the direction of being more
                     like y.
    :param sanity_checks: (optional bool) if True, make the result depend on
                          the assertions that `x` lies within the data range.
    :return: a tensor for the adversarial example
    """

    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))

    if clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    # Make sure the caller has not passed probs by accident
    assert logits.op.type != 'Softmax'

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = reduce_max(logits, 1, keepdims=True)
        y = tf.to_float(tf.equal(logits, preds_max))
        y = tf.stop_gradient(y)
    y = y / reduce_sum(y, 1, keepdims=True)

    # Compute loss
    loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
    if targeted:
        loss = -loss

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if clip_grad:
        grad = utils_tf.zero_out_clipped_grads(grad, x, clip_min, clip_max)

    red_ind = list(range(1, len(grad.get_shape())))
    dim = tf.reduce_prod(tf.shape(x)[1:])

    # Flatten each example's gradient to a row of length `dim`
    abs_grad = tf.reshape(tf.abs(grad), (-1, dim))

    # if q is a scalar, broadcast it to a vector of same length as the batch dim
    q = tf.cast(tf.broadcast_to(q, tf.shape(x)[0:1]), tf.float32)
    k = tf.cast(tf.floor(q / 100 * tf.cast(dim, tf.float32)), tf.int32)
    # Keep the index inside [0, dim - 1]: q == 100 (or rounding of a q just
    # below it) would otherwise give k == dim, one past the last sorted entry.
    k = tf.maximum(tf.minimum(k, dim - 1), 0)

    # `tf.sort` is much faster than `tf.contrib.distributions.percentile`.
    # For TF <= 1.12, use `tf.nn.top_k` as `tf.sort` is not implemented.
    if LooseVersion(tf.__version__) <= LooseVersion('1.12.0'):
        # `tf.sort` is only available in TF 1.13 onwards
        sorted_grad = -tf.nn.top_k(-abs_grad, k=dim, sorted=True)[0]
    else:
        sorted_grad = tf.sort(abs_grad, axis=-1)

    # Per-example threshold: the k-th smallest absolute gradient value
    idx = tf.stack((tf.range(tf.shape(abs_grad)[0]), k), -1)
    percentiles = tf.gather_nd(sorted_grad, idx)

    # Mask of entries at or above the threshold, reshaped like the gradient
    tied_for_max = tf.greater_equal(abs_grad, tf.expand_dims(percentiles, -1))
    tied_for_max = tf.reshape(tf.cast(tied_for_max, x.dtype), tf.shape(grad))
    num_ties = tf.reduce_sum(tied_for_max, red_ind, keepdims=True)

    # Spread a unit step evenly over the retained entries
    optimal_perturbation = tf.sign(grad) * tied_for_max / num_ties

    # Add perturbation to original example to obtain adversarial example
    adv_x = x + utils_tf.mul(eps, optimal_perturbation)

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) or (clip_max is not None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
def fgm(x, logits, y=None, eps=0.3, ord=np.inf, clip_min=None, clip_max=None,
        targeted=False, sanity_checks=True):
    """
    Fast Gradient Method, TensorFlow graph version.

    :param x: the input placeholder
    :param logits: output of model.get_logits
    :param y: (optional) A placeholder for the true labels. If targeted
              is true, then provide the target label. Otherwise, only provide
              this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as
              labels to avoid the "label leaking" effect (explained in this
              paper: https://arxiv.org/abs/1611.01236). Default is None.
              Labels should be one-hot-encoded.
    :param eps: the epsilon (input variation parameter)
    :param ord: (optional) Order of the norm (mimics NumPy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: Minimum float value for adversarial example components
    :param clip_max: Maximum float value for adversarial example components
    :param targeted: Is the attack targeted or untargeted? Untargeted, the
                     default, will try to make the label incorrect. Targeted
                     will instead try to move in the direction of being more
                     like y.
    :param sanity_checks: if True, make the result depend on the assertions
                          that `x` lies within the given data range.
    :return: a tensor for the adversarial example
    """
    # Range assertions on the input, for whichever bounds were given
    range_checks = []
    if clip_min is not None:
        range_checks.append(
            utils_tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))
    if clip_max is not None:
        range_checks.append(
            utils_tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    # Guard against probabilities being passed instead of logits
    assert logits.op.type != 'Softmax'

    if y is None:
        # Use the model's own prediction as the label to avoid label leaking
        top_logit = reduce_max(logits, 1, keepdims=True)
        y = tf.stop_gradient(tf.to_float(tf.equal(logits, top_logit)))
    y = y / reduce_sum(y, 1, keepdims=True)

    loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
    if targeted:
        loss = -loss

    # Gradient of the loss with respect to the input, turned into a step
    [grad] = tf.gradients(loss, x)
    adv_x = x + optimize_linear(grad, eps, ord)

    needs_clip = not (clip_min is None and clip_max is None)
    if needs_clip:
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    if not sanity_checks:
        return adv_x

    with tf.control_dependencies(range_checks):
        return tf.identity(adv_x)
def pgd_perturb(x, y, loss_fn, y_target=None, clip_min=None, clip_max=None,
                rand_init=False, ord=np.inf, eps=0.3, eps_iter=0.1,
                rand_minmax=0.3, nb_iter=20):
    """
    Projected gradient descent built from repeated `fgm_perturb` steps.

    :param x: input tensor to perturb.
    :param y: labels for an untargeted attack; may be None when `y_target`
              is given.
    :param loss_fn: callable mapping an input tensor to the loss.
    :param y_target: (optional) target labels; when given the attack is
                     targeted and the loss is negated for every step, as
                     `fgm` does for targeted attacks.
    :param clip_min: (optional) minimum value for result components.
    :param clip_max: (optional) maximum value for result components.
    :param rand_init: whether to start from a uniform random perturbation in
                      [-rand_minmax, rand_minmax] instead of zero.
    :param ord: order of the norm; ord=1 is not supported.
    :param eps: maximum total perturbation.
    :param eps_iter: step size of each iteration.
    :param rand_minmax: range of the random initial perturbation.
    :param nb_iter: number of iterations.
    :raises ValueError: if both `y` and `y_target` are None.
    :raises NotImplementedError: if `ord == 1`.
    :return: a tensor for the adversarial example.
    """
    # changed nb_iter to 20 and eps_iter to 0.1 for higher eps attack

    # Initialize loop variables
    if rand_init:
        eta = tf.random_uniform(tf.shape(x),
                                tf.cast(-rand_minmax, x.dtype),
                                tf.cast(rand_minmax, x.dtype),
                                dtype=x.dtype)
    else:
        # tf.zeros defaults to float32; match x's dtype (as the random branch
        # does) so that `x + eta` also works for non-float32 inputs.
        eta = tf.zeros(tf.shape(x), dtype=x.dtype)

    # Clip eta
    eta = clip_eta(eta, ord, eps)
    adv_x = x + eta
    if clip_min is not None or clip_max is not None:
        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

    if y_target is not None:
        y = y_target
        targeted = True
    elif y is not None:
        targeted = False
    else:
        raise ValueError("pgd_perturb requires either `y` or `y_target`.")

    # fgm_perturb only takes the labels as `y` and never negates the loss.
    # Passing them as `y_target` raised a TypeError for targeted attacks, so
    # always pass `y` and encode the attack direction in the loss instead.
    if targeted:
        def step_loss_fn(inputs):
            return -loss_fn(inputs)
    else:
        step_loss_fn = loss_fn

    fgm_params = {
        'loss_fn': step_loss_fn,
        'eps': eps_iter,
        'y': y,
        'ord': ord,
        'clip_min': clip_min,
        'clip_max': clip_max
    }
    if ord == 1:
        raise NotImplementedError(
            "It's not clear that FGM is a good inner loop"
            " step for PGD when ord=1, because ord=1 FGM "
            " changes only one pixel at a time. We need "
            " to rigorously test a strong ord=1 PGD "
            "before enabling this feature.")

    def cond(i, _):
        return tf.less(i, nb_iter)

    def body(i, adv_x):
        adv_x = fgm_perturb(adv_x, **fgm_params)

        # Clipping perturbation eta to the `ord` norm ball
        eta = adv_x - x
        eta = clip_eta(eta, ord, eps)
        adv_x = x + eta

        # Redo the clipping.
        # FGM already did it, but subtracting and re-adding eta can add some
        # small numerical error.
        if clip_min is not None or clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

        return i + 1, adv_x

    _, adv_x = tf.while_loop(cond, body, (tf.zeros([]), adv_x), back_prop=True,
                             maximum_iterations=nb_iter)

    return adv_x
def generate(self, x, **kwargs):
    """
    Generate symbolic graph for adversarial examples and return.

    :param x: The model's symbolic inputs.
    :param kwargs: See `parse_params`
    :raises NotImplementedError: if `self.ord == 1`.
    :return: a tensor for the adversarial example
    """
    # Parse and save attack-specific parameters
    assert self.parse_params(**kwargs)

    asserts = []

    # If a data range was specified, check that the input was in that range
    if self.clip_min is not None:
        asserts.append(
            utils_tf.assert_greater_equal(x, tf.cast(self.clip_min, x.dtype)))

    if self.clip_max is not None:
        asserts.append(
            utils_tf.assert_less_equal(x, tf.cast(self.clip_max, x.dtype)))

    # Initialize loop variables
    if self.rand_init:
        eta = random_lp_vector(
            tf.shape(x),
            self.ord,
            tf.cast(self.rand_init_eps, x.dtype),
            dtype=x.dtype,
        )
    else:
        # tf.zeros defaults to float32; match x's dtype (as the random branch
        # does) so that `x + eta` also works for non-float32 inputs.
        eta = tf.zeros(tf.shape(x), dtype=x.dtype)

    # Clip eta
    eta = clip_eta(eta, self.ord, self.eps)
    adv_x = x + eta
    if self.clip_min is not None or self.clip_max is not None:
        adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    if self.y_target is not None:
        y = self.y_target
        targeted = True
    elif self.y is not None:
        y = self.y
        targeted = False
    else:
        # No labels given: use the model's own predictions (one-hot of the
        # maximum probability) as labels.
        model_preds = self.model.get_probs(x)
        preds_max = tf.reduce_max(model_preds, 1, keepdims=True)
        y = tf.to_float(tf.equal(model_preds, preds_max))
        y = tf.stop_gradient(y)
        targeted = False
        del model_preds

    y_kwarg = "y_target" if targeted else "y"

    fgm_params = {
        "eps": self.eps_iter,
        y_kwarg: y,
        "ord": self.ord,
        "loss_fn": self.loss_fn,
        "clip_min": self.clip_min,
        "clip_max": self.clip_max,
        "clip_grad": self.clip_grad,
    }
    if self.ord == 1:
        raise NotImplementedError(
            "FGM is not a good inner loop step for PGD "
            " when ord=1, because ord=1 FGM changes only "
            " one pixel at a time. Use the SparseL1Descent "
            " attack instead, which allows fine-grained "
            " control over the sparsity of the gradient "
            " updates.")

    # Use getattr() to avoid errors in eager execution attacks
    FGM = self.FGM_CLASS(self.model,
                         sess=getattr(self, "sess", None),
                         dtypestr=self.dtypestr)

    def cond(i, _):
        """Iterate until requested number of iterations is completed"""
        return tf.less(i, self.nb_iter)

    def body(i, adv_x):
        """Do a projected gradient step"""
        adv_x = FGM.generate(adv_x, **fgm_params)

        # Clipping perturbation eta to self.ord norm ball
        eta = adv_x - x
        eta = clip_eta(eta, self.ord, self.eps)
        adv_x = x + eta

        # Redo the clipping.
        # FGM already did it, but subtracting and re-adding eta can add some
        # small numerical error.
        if self.clip_min is not None or self.clip_max is not None:
            adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        return i + 1, adv_x

    _, adv_x = tf.while_loop(
        cond,
        body,
        (tf.zeros([]), adv_x),
        back_prop=True,
        maximum_iterations=self.nb_iter,
    )

    # Asserts run only on CPU.
    # When multi-GPU eval code tries to force all PGD ops onto GPU, this
    # can cause an error.
    common_dtype = tf.float32
    asserts.append(
        utils_tf.assert_less_equal(
            tf.cast(self.eps_iter, dtype=common_dtype),
            tf.cast(self.eps, dtype=common_dtype),
        ))
    if self.ord == np.inf and self.clip_min is not None:
        # The 1e-6 is needed to compensate for numerical error.
        # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5,
        # clip_max=.7
        asserts.append(
            utils_tf.assert_less_equal(
                tf.cast(self.eps, x.dtype),
                1e-6 + tf.cast(self.clip_max, x.dtype) -
                tf.cast(self.clip_min, x.dtype),
            ))

    if self.sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x