def fprop(self, x, **kwargs):
  output = self.callable_fn(x, **kwargs)

  # Do some sanity checking to reduce the chance that probs are used
  # as logits accidentally or vice versa
  if self.output_layer == 'probs':
    assert output.op.type == "Softmax"
    min_prob = tf.reduce_min(output)
    max_prob = tf.reduce_max(output)
    asserts = [utils_tf.assert_greater_equal(min_prob,
                                             tf.cast(0., min_prob.dtype)),
               utils_tf.assert_less_equal(max_prob,
                                          tf.cast(1., max_prob.dtype))]
    with tf.control_dependencies(asserts):
      output = tf.identity(output)
  elif self.output_layer == 'logits':
    assert output.op.type != 'Softmax'

  return {self.output_layer: output}
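# Illustrative usage sketch for the wrapper method above (not part of the
# library). It assumes this `fprop` belongs to a CleverHans-style
# CallableModelWrapper constructed as CallableModelWrapper(callable_fn,
# output_layer); `my_net` is a hypothetical callable.
def _example_callable_wrapper_usage():
  """Wrap a plain function returning logits and run fprop on it."""
  def my_net(x):  # hypothetical network; any callable returning logits works
    return tf.layers.dense(x, 10)

  x = tf.placeholder(tf.float32, [None, 784])
  model = CallableModelWrapper(my_net, 'logits')
  # fprop returns a dict keyed by the declared output layer
  logits = model.fprop(x)['logits']
  return logits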
def get_probs(self, x, **kwargs):
  """
  :param x: A symbolic representation (Tensor) of the network input
  :return: A symbolic representation (Tensor) of the output probabilities
           (i.e., the output values produced by the softmax layer).
  """
  d = self.fprop(x, **kwargs)
  if self.O_PROBS in d:
    output = d[self.O_PROBS]
    min_prob = tf.reduce_min(output)
    max_prob = tf.reduce_max(output)
    asserts = [utils_tf.assert_greater_equal(min_prob,
                                             tf.cast(0., min_prob.dtype)),
               utils_tf.assert_less_equal(max_prob,
                                          tf.cast(1., max_prob.dtype))]
    with tf.control_dependencies(asserts):
      output = tf.identity(output)
    return output
  elif self.O_LOGITS in d:
    return tf.nn.softmax(logits=d[self.O_LOGITS])
  else:
    raise ValueError('Cannot find probs or logits.')
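# A minimal sketch of the fallback behavior above (illustrative only; it
# assumes the fprop dict convention from the wrapper sketch earlier, and that
# O_LOGITS maps to the 'logits' key). When the model exposes only logits,
# probabilities are derived via tf.nn.softmax.
def _example_get_probs_usage():
  """Fetch probabilities whether the model reports probs or logits."""
  def my_net(x):  # hypothetical callable returning logits
    return tf.layers.dense(x, 10)

  x = tf.placeholder(tf.float32, [None, 784])
  model = CallableModelWrapper(my_net, 'logits')
  # fprop's dict has no 'probs' entry, so softmax(logits) is returned
  probs = model.get_probs(x)
  return probs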
def _project_perturbation(perturbation, epsilon, input_image, clip_min=None,
                          clip_max=None):
  """Project `perturbation` onto L-infinity ball of radius `epsilon`.

  Also project into hypercube such that the resulting adversarial example
  is between clip_min and clip_max, if applicable.
  """

  if clip_min is None or clip_max is None:
    raise NotImplementedError("_project_perturbation currently has clipping "
                              "hard-coded in.")

  # Ensure inputs are in the correct range
  with tf.control_dependencies([
      utils_tf.assert_less_equal(input_image,
                                 tf.cast(clip_max, input_image.dtype)),
      utils_tf.assert_greater_equal(input_image,
                                    tf.cast(clip_min, input_image.dtype))
  ]):
    clipped_perturbation = utils_tf.clip_by_value(
        perturbation, -epsilon, epsilon)
    new_image = utils_tf.clip_by_value(
        input_image + clipped_perturbation, clip_min, clip_max)
    return new_image - input_image
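# A small numeric sketch of the projection above (illustrative only). With
# epsilon=0.1 and pixel range [0, 1], a perturbation component of 0.25 is
# first clipped to 0.1, then the perturbed pixel is clipped into [0, 1], and
# the returned delta is whatever survives both constraints.
def _example_project_perturbation():
  """Project an out-of-ball perturbation for a single-pixel 'image'."""
  input_image = tf.constant([[0.95]])
  perturbation = tf.constant([[0.25]])
  # L-inf clip gives 0.1; value clip gives min(0.95 + 0.1, 1.0) = 1.0,
  # so the returned perturbation is 0.05.
  delta = _project_perturbation(perturbation, 0.1, input_image,
                                clip_min=0., clip_max=1.)
  return delta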
def fgm(x,
        logits,
        y=None,
        eps=0.3,
        ord=np.inf,
        clip_min=None,
        clip_max=None,
        targeted=False,
        sanity_checks=True):
  """
  TensorFlow implementation of the Fast Gradient Method.
  :param x: the input placeholder
  :param logits: output of model.get_logits
  :param y: (optional) A placeholder for the true labels. If targeted
            is true, then provide the target label. Otherwise, only provide
            this parameter if you'd like to use true labels when crafting
            adversarial samples. Otherwise, model predictions are used as
            labels to avoid the "label leaking" effect (explained in this
            paper: https://arxiv.org/abs/1611.01236). Default is None.
            Labels should be one-hot-encoded.
  :param eps: the epsilon (input variation parameter)
  :param ord: (optional) Order of the norm (mimics NumPy).
              Possible values: np.inf, 1 or 2.
  :param clip_min: Minimum float value for adversarial example components
  :param clip_max: Maximum float value for adversarial example components
  :param targeted: Is the attack targeted or untargeted? Untargeted, the
                   default, will try to make the label incorrect. Targeted
                   will instead try to move in the direction of being more
                   like y.
  :param sanity_checks: bool, if True, include asserts checking that the
                        input is in the range [clip_min, clip_max]
  :return: a tensor for the adversarial example
  """

  asserts = []

  # If a data range was specified, check that the input was in that range
  if clip_min is not None:
    asserts.append(utils_tf.assert_greater_equal(
        x, tf.cast(clip_min, x.dtype)))

  if clip_max is not None:
    asserts.append(utils_tf.assert_less_equal(
        x, tf.cast(clip_max, x.dtype)))

  # Make sure the caller has not passed probs by accident
  assert logits.op.type != 'Softmax'

  if y is None:
    # Using model predictions as ground truth to avoid label leaking
    preds_max = reduce_max(logits, 1, keepdims=True)
    y = tf.to_float(tf.equal(logits, preds_max))
    y = tf.stop_gradient(y)
  y = y / reduce_sum(y, 1, keepdims=True)

  # Compute loss
  loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
  if targeted:
    loss = -loss

  # Define gradient of loss wrt input
  grad, = tf.gradients(loss, x)

  optimal_perturbation = optimize_linear(grad, eps, ord)

  # Add perturbation to original example to obtain adversarial example
  adv_x = x + optimal_perturbation

  # If clipping is needed, reset all values outside of [clip_min, clip_max]
  if (clip_min is not None) or (clip_max is not None):
    # We don't currently support one-sided clipping
    assert clip_min is not None and clip_max is not None
    adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)

  if sanity_checks:
    with tf.control_dependencies(asserts):
      adv_x = tf.identity(adv_x)

  return adv_x
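# A minimal FGSM usage sketch (illustrative; the single dense layer is a
# hypothetical stand-in for a real model's get_logits output).
def _example_fgm_usage():
  """Build an untargeted L-infinity FGSM graph for a toy classifier."""
  x = tf.placeholder(tf.float32, [None, 784])
  logits = tf.layers.dense(x, 10)  # toy logits; must NOT be softmax output
  adv_x = fgm(x, logits, eps=0.3, ord=np.inf, clip_min=0., clip_max=1.)
  return adv_x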
def generate(self, x, **kwargs): """ Generate symbolic graph for adversarial examples and return. :param x: The model's symbolic inputs. :param kwargs: Keyword arguments. See `parse_params` for documentation. """ # Parse and save attack-specific parameters assert self.parse_params(**kwargs) asserts = [] # If a data range was specified, check that the input was in that range if self.clip_min is not None: asserts.append( utils_tf.assert_greater_equal(x, tf.cast(self.clip_min, x.dtype))) if self.clip_max is not None: asserts.append( utils_tf.assert_less_equal(x, tf.cast(self.clip_max, x.dtype))) # Initialize loop variables momentum = tf.zeros_like(x) adv_x = x # Fix labels to the first model predictions for loss computation y, _nb_classes = self.get_or_guess_labels(x, kwargs) y = y / reduce_sum(y, 1, keepdims=True) targeted = (self.y_target is not None) def cond(i, _, __): return tf.less(i, self.nb_iter) def body(i, ax, m): logits = self.model.get_logits(ax) loss = softmax_cross_entropy_with_logits(labels=y, logits=logits) if targeted: loss = -loss # Define gradient of loss wrt input grad, = tf.gradients(loss, ax) # Normalize current gradient and add it to the accumulated gradient red_ind = list(range(1, len(grad.get_shape()))) avoid_zero_div = tf.cast(1e-12, grad.dtype) grad = grad / tf.maximum( avoid_zero_div, reduce_mean(tf.abs(grad), red_ind, keepdims=True)) m = self.decay_factor * m + grad optimal_perturbation = optimize_linear(m, self.eps_iter, self.ord) if self.ord == 1: raise NotImplementedError( "This attack hasn't been tested for ord=1." "It's not clear that FGM makes a good inner " "loop step for iterative optimization since " "it updates just one coordinate at a time.") # Update and clip adversarial example in current iteration ax = ax + optimal_perturbation ax = x + utils_tf.clip_eta(ax - x, self.ord, self.eps) if self.clip_min is not None and self.clip_max is not None: ax = utils_tf.clip_by_value(ax, self.clip_min, self.clip_max) ax = tf.stop_gradient(ax) return i + 1, ax, m _, adv_x, _ = tf.while_loop(cond, body, (tf.zeros([]), adv_x, momentum), back_prop=True, maximum_iterations=self.nb_iter) if self.sanity_checks: with tf.control_dependencies(asserts): adv_x = tf.identity(adv_x) return adv_x
def projected_optimization(loss_fn,
                           input_image,
                           label,
                           epsilon,
                           num_steps,
                           clip_min=None,
                           clip_max=None,
                           optimizer=TensorAdam(),
                           project_perturbation=_project_perturbation,
                           early_stop_loss_threshold=None,
                           is_debug=False):
  """Generic projected optimization, generalized to work with approximate
  gradients. Used for e.g. the SPSA attack.

  Args:
    :param loss_fn: A callable which takes `input_image` and `label` as
                    arguments, and returns a batch of loss values. Same
                    interface as TensorOptimizer.
    :param input_image: Tensor, a batch of images
    :param label: Tensor, a batch of labels
    :param epsilon: float, the L-infinity norm of the maximum allowable
                    perturbation
    :param num_steps: int, the number of steps of gradient descent
    :param clip_min: float, minimum pixel value
    :param clip_max: float, maximum pixel value
    :param optimizer: A `TensorOptimizer` object
    :param project_perturbation: A function, which will be used to enforce
                                 some constraint. It should have the same
                                 signature as `_project_perturbation`.
    :param early_stop_loss_threshold: A float or None. If specified, the
      attack will end if the loss is below `early_stop_loss_threshold`.
      Enabling this option can have several different effects:
        - Setting the threshold to 0. guarantees that if a successful attack
          is found, it is returned. This increases the attack success rate,
          because without early stopping the optimizer can accidentally
          bounce back to a point where the attack fails.
        - Early stopping can make the attack run faster because it may run
          for fewer steps.
        - Early stopping can make the attack run slower because the loss
          must be calculated at each step. The loss is not calculated as
          part of the normal SPSA optimization procedure. For most
          reasonable choices of hyperparameters, early stopping makes the
          attack much faster because it decreases the number of steps
          dramatically.
    :param is_debug: A bool. If True, print debug info for attack progress.

  Returns:
    adversarial version of `input_image`, with L-infinity difference less
      than epsilon, which tries to minimize loss_fn.

  Note that this function is not intended as an Attack by itself. Rather, it
  is designed as a helper function which you can use to write your own attack
  methods. The method uses a tf.while_loop to optimize a loss function in a
  single sess.run() call.
  """
  assert num_steps is not None
  if is_debug:
    with tf.device("/cpu:0"):
      input_image = tf.Print(input_image, [],
                             "Starting PGD attack with epsilon: %s" % epsilon)

  init_perturbation = tf.random_uniform(
      tf.shape(input_image),
      minval=tf.cast(-epsilon, input_image.dtype),
      maxval=tf.cast(epsilon, input_image.dtype),
      dtype=input_image.dtype)
  init_perturbation = project_perturbation(init_perturbation, epsilon,
                                           input_image, clip_min=clip_min,
                                           clip_max=clip_max)
  init_optim_state = optimizer.init_state([init_perturbation])
  nest = tf.contrib.framework.nest

  def loop_body(i, perturbation, flat_optim_state):
    """Update perturbation to input image."""
    optim_state = nest.pack_sequence_as(
        structure=init_optim_state, flat_sequence=flat_optim_state)

    def wrapped_loss_fn(x):
      return loss_fn(input_image + x, label)

    new_perturbation_list, new_optim_state = optimizer.minimize(
        wrapped_loss_fn, [perturbation], optim_state)
    projected_perturbation = project_perturbation(
        new_perturbation_list[0], epsilon, input_image,
        clip_min=clip_min, clip_max=clip_max)

    # Be careful with this bool. A value of 0. is a valid threshold but
    # evaluates to False, so we must explicitly check whether the value
    # is None.
    early_stop = early_stop_loss_threshold is not None
    compute_loss = is_debug or early_stop
    # Don't waste time building the loss graph if we're not going to use it
    if compute_loss:
      # NOTE: this step is not actually redundant with the optimizer step.
      # SPSA calculates the loss at randomly perturbed points but doesn't
      # calculate the loss at the current point.
      loss = reduce_mean(wrapped_loss_fn(projected_perturbation), axis=0)

      if is_debug:
        with tf.device("/cpu:0"):
          loss = tf.Print(loss, [loss], "Total batch loss")

      if early_stop:
        i = tf.cond(tf.less(loss, early_stop_loss_threshold),
                    lambda: float(num_steps), lambda: i)

    return i + 1, projected_perturbation, nest.flatten(new_optim_state)

  def cond(i, *_):
    return tf.less(i, num_steps)

  flat_init_optim_state = nest.flatten(init_optim_state)
  _, final_perturbation, _ = tf.while_loop(
      cond,
      loop_body,
      loop_vars=(tf.constant(0.), init_perturbation, flat_init_optim_state),
      parallel_iterations=1,
      back_prop=False,
      maximum_iterations=num_steps)
  if project_perturbation is _project_perturbation:
    # TODO: this assert looks totally wrong.
    # Not bothering to fix it now because it's only an assert.
    # 1) Multiplying by 1.1 gives a huge margin of error. This should probably
    #    take the difference and allow a tolerance of 1e-6 or something like
    #    that.
    # 2) I think it should probably check the *absolute value* of
    #    final_perturbation
    perturbation_max = epsilon * 1.1
    check_diff = utils_tf.assert_less_equal(
        final_perturbation,
        tf.cast(perturbation_max, final_perturbation.dtype),
        message="final_perturbation must change no pixel by more than "
                "%s" % perturbation_max)
  else:
    # TODO: let caller pass in a check_diff function as well as
    # project_perturbation
    check_diff = tf.no_op()

  if clip_min is None or clip_max is None:
    raise NotImplementedError("This function only supports clipping for now")
  check_range = [utils_tf.assert_less_equal(
                     input_image, tf.cast(clip_max, input_image.dtype)),
                 utils_tf.assert_greater_equal(
                     input_image, tf.cast(clip_min, input_image.dtype))]

  with tf.control_dependencies([check_diff] + check_range):
    adversarial_image = input_image + final_perturbation
  return tf.stop_gradient(adversarial_image)
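# A usage sketch for the helper above (illustrative; it keeps the default
# TensorAdam optimizer and uses an ordinary cross-entropy attack loss;
# `model_logits_fn` is a hypothetical callable producing logits).
def _example_projected_optimization(model_logits_fn, input_image, label):
  """Run projected optimization of a cross-entropy attack loss."""
  def attack_loss(image, lab):
    logits = model_logits_fn(image)
    # Minimizing negative cross-entropy pushes the model away from `lab`
    return -tf.nn.softmax_cross_entropy_with_logits(labels=lab,
                                                    logits=logits)

  return projected_optimization(attack_loss, input_image, label,
                                epsilon=0.1, num_steps=40,
                                clip_min=0., clip_max=1.)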
def generate(self, x, **kwargs): """ Generate symbolic graph for adversarial examples and return. :param x: The model's symbolic inputs. :param kwargs: See `parse_params` """ # Parse and save attack-specific parameters assert self.parse_params(**kwargs) asserts = [] # If a data range was specified, check that the input was in that range if self.clip_min is not None: asserts.append( utils_tf.assert_greater_equal(x, tf.cast(self.clip_min, x.dtype))) if self.clip_max is not None: asserts.append( utils_tf.assert_less_equal(x, tf.cast(self.clip_max, x.dtype))) # Initialize loop variables if self.rand_init: eta = tf.random_uniform(tf.shape(x), tf.cast(-self.rand_minmax, x.dtype), tf.cast(self.rand_minmax, x.dtype), dtype=x.dtype) else: eta = tf.zeros(tf.shape(x)) # Clip eta eta = clip_eta(eta, self.ord, self.eps) adv_x = x + eta if self.clip_min is not None or self.clip_max is not None: adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max) if self.y_target is not None: y = self.y_target targeted = True elif self.y is not None: y = self.y targeted = False else: model_preds = self.model.get_probs(x) preds_max = reduce_max(model_preds, 1, keepdims=True) y = tf.to_float(tf.equal(model_preds, preds_max)) y = tf.stop_gradient(y) targeted = False del model_preds y_kwarg = 'y_target' if targeted else 'y' fgm_params = { 'eps': self.eps_iter, y_kwarg: y, 'ord': self.ord, 'clip_min': self.clip_min, 'clip_max': self.clip_max } if self.ord == 1: raise NotImplementedError( "It's not clear that FGM is a good inner loop" " step for PGD when ord=1, because ord=1 FGM " " changes only one pixel at a time. We need " " to rigorously test a strong ord=1 PGD " "before enabling this feature.") # Use getattr() to avoid errors in eager execution attacks FGM = self.FGM_CLASS(self.model, sess=getattr(self, 'sess', None), dtypestr=self.dtypestr) def cond(i, _): return tf.less(i, self.nb_iter) def body(i, adv_x): adv_x = FGM.generate(adv_x, **fgm_params) # Clipping perturbation eta to self.ord norm ball eta = adv_x - x eta = clip_eta(eta, self.ord, self.eps) adv_x = x + eta # Redo the clipping. # FGM already did it, but subtracting and re-adding eta can add some # small numerical error. if self.clip_min is not None or self.clip_max is not None: adv_x = utils_tf.clip_by_value(adv_x, self.clip_min, self.clip_max) return i + 1, adv_x _, adv_x = tf.while_loop(cond, body, (tf.zeros([]), adv_x), back_prop=True, maximum_iterations=self.nb_iter) # Asserts run only on CPU. # When multi-GPU eval code tries to force all PGD ops onto GPU, this # can cause an error. common_dtype = tf.float64 asserts.append( utils_tf.assert_less_equal( tf.cast(self.eps_iter, dtype=common_dtype), tf.cast(self.eps, dtype=common_dtype))) if self.ord == np.inf and self.clip_min is not None: # The 1e-6 is needed to compensate for numerical error. # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5, # clip_max=.7 asserts.append( utils_tf.assert_less_equal( tf.cast(self.eps, x.dtype), 1e-6 + tf.cast(self.clip_max, x.dtype) - tf.cast(self.clip_min, x.dtype))) if self.sanity_checks: with tf.control_dependencies(asserts): adv_x = tf.identity(adv_x) return adv_x