def __init__(self, model, batch_size, loss, goal, distance_metric, session):
    '''
    Build the single-step FGSM graph.

    :param model: The model to attack. A `realsafe.model.Classifier` instance.
    :param batch_size: Batch size for the `batch_attack()` method.
    :param loss: The loss function to optimize. A `realsafe.loss.Loss` instance.
    :param goal: Adversarial goals. All supported values are 't', 'tm', and 'ut'.
    :param distance_metric: Adversarial distance metric. All supported values are 'l_2' and 'l_inf'.
    :param session: The `tf.Session` to run the attack in. The `model` should be loaded into this session.
    :raises NotImplementedError: If `goal` or `distance_metric` is not a supported value.
    '''
    self.model = model
    self.batch_size = batch_size
    self._session = session
    self.loss = loss
    self.goal = goal
    self.distance_metric = distance_metric

    # Inputs of batch_attack().
    self.xs_ph = get_xs_ph(model, batch_size)
    self.ys_ph = get_ys_ph(model, batch_size)

    # Per-example magnitude: fed through `eps_ph`, stored in `eps_var` by
    # running `config_eps_step` (defined at the end of this method).
    eps_shape = (self.batch_size, )
    self.eps_ph = tf.placeholder(self.model.x_dtype, eps_shape)
    self.eps_var = tf.Variable(tf.zeros(eps_shape, dtype=self.model.x_dtype))

    # Gradient of the loss with respect to the input examples.
    loss_value = self.loss(self.xs_ph, self.ys_ph)
    grad = tf.gradients(loss_value, self.xs_ph)[0]
    if goal in ('t', 'tm'):
        # Targeted goals step against the gradient.
        grad = -grad
    elif goal != 'ut':
        raise NotImplementedError

    # One row per example so the per-example eps broadcasts over all features.
    flat_grad = tf.reshape(grad, (batch_size, -1))

    if distance_metric == 'l_2':
        direction = get_unit(flat_grad)
        update = tf.expand_dims(self.eps_var, 1) * direction
    elif distance_metric == 'l_inf':
        update = tf.expand_dims(self.eps_var, 1) * tf.sign(flat_grad)
    else:
        raise NotImplementedError

    # Back to the model's input shape, then keep the result inside (x_min, x_max).
    model_xs_shape = (self.batch_size, *self.model.x_shape)
    update = tf.reshape(update, model_xs_shape)
    self.xs_adv = tf.clip_by_value(self.xs_ph + update,
                                   self.model.x_min, self.model.x_max)

    self.config_eps_step = self.eps_var.assign(self.eps_ph)
def __init__(self, model, batch_size, loss, goal, distance_metric, session, iteration_callback=None):
    '''
    Build the iterative BIM graph.

    :param model: The model to attack. A ``realsafe.model.Classifier`` instance.
    :param batch_size: Batch size for the ``batch_attack()`` method.
    :param loss: The loss function to optimize. A ``realsafe.loss.Loss`` instance.
    :param goal: Adversarial goals. All supported values are ``'t'``, ``'tm'``, and ``'ut'``.
    :param distance_metric: Adversarial distance metric. All supported values are ``'l_2'`` and ``'l_inf'``.
    :param session: The ``tf.Session`` to run the attack in. The ``model`` should be loaded into this session.
    :param iteration_callback: A function that accepts a ``xs`` ``tf.Tensor`` (the original examples) and a
        ``xs_adv`` ``tf.Tensor`` (the adversarial examples for ``xs``). During ``batch_attack()``, this callback
        is run after each iteration, and its return value is yielded back to the caller. Defaults to ``None``.
    :raises NotImplementedError: If ``goal`` or ``distance_metric`` is not a supported value.
    '''
    self.model = model
    self.batch_size = batch_size
    self._session = session
    self.loss = loss
    self.goal = goal
    self.distance_metric = distance_metric

    # Inputs of batch_attack().
    self.xs_ph = get_xs_ph(model, batch_size)
    self.ys_ph = get_ys_ph(model, batch_size)

    # Examples are kept flattened as (batch_size, D).
    flat_shape = (batch_size, np.prod(self.model.x_shape))
    per_example_shape = (self.batch_size,)
    x_dtype = self.model.x_dtype

    # Keep xs / ys / xs_adv in variables to reduce memory copies between
    # TensorFlow and Python across iterations.
    self.xs_var = tf.Variable(tf.zeros(shape=flat_shape, dtype=x_dtype))
    self.ys_var = tf.Variable(tf.zeros(shape=(batch_size,), dtype=self.model.y_dtype))
    self.xs_adv_var = tf.Variable(tf.zeros(shape=flat_shape, dtype=x_dtype))

    # Maximum perturbation magnitude (eps) and per-iteration step size (alpha).
    self.eps_ph = tf.placeholder(x_dtype, per_example_shape)
    self.eps_var = tf.Variable(tf.zeros(per_example_shape, dtype=x_dtype))
    self.alpha_ph = tf.placeholder(x_dtype, per_example_shape)
    self.alpha_var = tf.Variable(tf.zeros(per_example_shape, dtype=x_dtype))

    # Column vectors so they broadcast against (batch_size, D).
    eps = tf.expand_dims(self.eps_var, 1)
    alpha = tf.expand_dims(self.alpha_var, 1)

    # Gradient of the loss with respect to the current adversarial example.
    # NOTE: from here on `self.loss` is the loss tensor, not the `loss` callable.
    self.xs_adv_model = tf.reshape(self.xs_adv_var, (batch_size, *self.model.x_shape))
    self.loss = loss(self.xs_adv_model, self.ys_var)
    grad = tf.gradients(self.loss, self.xs_adv_var)[0]
    if goal in ('t', 'tm'):
        grad = -grad
    elif goal != 'ut':
        raise NotImplementedError

    if distance_metric == 'l_2':
        direction = get_unit(grad)
        delta = self.xs_adv_var - self.xs_var + alpha * direction
        # Keep the accumulated noise within an l_2 ball of radius eps.
        xs_adv_next = self.xs_var + tf.clip_by_norm(delta, eps, axes=[1])
    elif distance_metric == 'l_inf':
        lower, upper = self.xs_var - eps, self.xs_var + eps
        direction = tf.sign(grad)
        # Keep the accumulated noise within an l_inf ball of radius eps.
        xs_adv_next = tf.clip_by_value(self.xs_adv_var + alpha * direction, lower, upper)
    else:
        raise NotImplementedError

    # Finally keep the example inside the model's valid range (x_min, x_max).
    xs_adv_next = tf.clip_by_value(xs_adv_next, self.model.x_min, self.model.x_max)

    self.update_xs_adv_step = self.xs_adv_var.assign(xs_adv_next)
    self.config_eps_step = self.eps_var.assign(self.eps_ph)
    self.config_alpha_step = self.alpha_var.assign(self.alpha_ph)
    # The adversarial example starts from the original example.
    self.setup_xs = [
        self.xs_var.assign(tf.reshape(self.xs_ph, flat_shape)),
        self.xs_adv_var.assign(tf.reshape(self.xs_ph, flat_shape)),
    ]
    self.setup_ys = self.ys_var.assign(self.ys_ph)

    self.iteration = None

    self.iteration_callback = None
    if iteration_callback is not None:
        xs_model = tf.reshape(self.xs_var, (self.batch_size, *self.model.x_shape))
        self.iteration_callback = iteration_callback(xs_model, self.xs_adv_model)
def __init__(self, model, batch_size, distance_metric, session, iteration_callback=None):
    '''
    Initialize DeepFool.

    :param model: The model to attack. A `realsafe.model.ClassifierWithLogits` instance.
    :param batch_size: Batch size for the `batch_attack()` method.
    :param distance_metric: Adversarial distance metric. All supported values are 'l_2' and 'l_inf'.
    :param session: The `tf.Session` to run the attack in. The `model` should be loaded into this session.
    :param iteration_callback: A function that accepts a `xs` `tf.Tensor` (the original examples) and a `xs_adv`
        `tf.Tensor` (the adversarial examples for `xs`). During `batch_attack()`, this callback is run after
        each iteration, and its return value is yielded back to the caller. Defaults to `None`.
    :raises NotImplementedError: If `distance_metric` is not a supported value.
    '''
    self.model, self.batch_size, self._session = model, batch_size, session

    # Overshoot factor applied to every step; reconfigured through `setup_overshot`.
    self.overshot = tf.Variable(0.02)
    self.overshot_ph = tf.placeholder(tf.float32)

    # placeholder for batch_attack's input
    self.xs_ph = get_xs_ph(model, batch_size)
    self.ys_ph = get_ys_ph(model, batch_size)

    # store xs, xs_adv and ys in variables to reduce memory copy between tensorflow and python
    # flatten shape of xs_ph
    xs_flatten_shape = (batch_size, np.prod(self.model.x_shape))
    # Unpacking (rather than tuple concatenation) also works when x_shape is a list.
    model_xs_shape = (self.batch_size, *self.model.x_shape)
    # variable for the original example with shape of (batch_size, D)
    self.xs_var = tf.Variable(tf.zeros(shape=xs_flatten_shape, dtype=self.model.x_dtype))
    # variable for labels
    self.ys_var = tf.Variable(tf.zeros(shape=(batch_size,), dtype=self.model.y_dtype))
    # variable for the (hopefully) adversarial example with shape of (batch_size, D)
    self.xs_adv_var = tf.Variable(tf.zeros(shape=xs_flatten_shape, dtype=self.model.x_dtype))

    # get the adversarial example's logits and labels
    logits, self.labels = self.model.logits_and_labels(
        xs=tf.reshape(self.xs_adv_var, model_xs_shape))

    # we need to calculate the jacobian step by step: one assign op per class,
    # each writing d logits[:, i] / d xs_adv into grads_var[:, i, :]
    self.grads_var = tf.Variable(
        tf.zeros((self.batch_size, self.model.n_class, np.prod(self.model.x_shape)),
                 dtype=self.model.x_dtype))
    # calculating jacobian would construct a large graph
    self.assign_grads = [
        self.grads_var[:, i, :].assign(tf.gradients(logits[:, i], self.xs_adv_var)[0])
        for i in range(self.model.n_class)
    ]

    # get the target label's logits and jacobian
    k0s = tf.stack((tf.range(self.batch_size), self.ys_var), axis=1)
    yk0s = tf.expand_dims(tf.gather_nd(logits, k0s), axis=1)
    gradk0s = tf.expand_dims(tf.gather_nd(self.grads_var, k0s), axis=1)

    # Per class: logit gap to the labelled class and the matching gradient difference.
    fs = tf.abs(yk0s - logits)
    ws = self.grads_var - gradk0s

    # Distance to each linearised boundary is |f| / ||w||_q, where q is the dual
    # of the attack norm p: q = 2 for l_2 and q = 1 for l_inf.
    if distance_metric == 'l_2':
        ws_norm = tf.norm(ws, axis=-1)
    elif distance_metric == 'l_inf':
        ws_norm = tf.reduce_sum(tf.abs(ws), axis=-1)
    else:
        raise NotImplementedError

    # for index = k0, ws_norm = 0.0, fs = 0.0, ls = 0.0 / 0.0 = NaN; the original
    # author relies on tf.argmin ignoring NaN here.
    # NOTE(review): confirm tf.argmin's NaN handling for the TF version in use.
    ls = fs / ws_norm
    ks = tf.argmin(ls, axis=1, output_type=self.model.y_dtype)
    ks = tf.stack((tf.range(self.batch_size), ks), axis=1)

    fsks = tf.gather_nd(fs, ks)
    ws_normks = tf.gather_nd(ws_norm, ks)
    if distance_metric == 'l_2':
        # r = |f| / ||w||_2^2 * w
        wsks = tf.gather_nd(ws, ks)
        rs = tf.reshape(fsks / tf.square(ws_normks), (self.batch_size, 1)) * wsks
    else:
        # r = |f| / ||w||_1 * sign(w)
        ws_sign_ks = tf.gather_nd(tf.sign(ws), ks)
        rs = tf.reshape(fsks / ws_normks, (self.batch_size, 1)) * ws_sign_ks

    # if the xs_adv is adversarial, we do early stop: only examples whose predicted
    # label still equals ys_var get a non-zero (overshot) update.
    self.eqs = tf.equal(self.labels, self.ys_var)
    flags = tf.reshape(tf.cast(self.eqs, self.model.x_dtype) * (1 + self.overshot),
                       (self.batch_size, 1))
    xs_adv_next = self.xs_adv_var + flags * rs
    xs_adv_next = tf.clip_by_value(xs_adv_next, self.model.x_min, self.model.x_max)

    self.update_xs_adv_step = self.xs_adv_var.assign(xs_adv_next)

    # Reset the jacobian buffer and load a new batch; xs_adv starts at xs.
    self.setup = [
        self.grads_var.initializer,
        self.xs_var.assign(tf.reshape(self.xs_ph, self.xs_var.shape)),
        self.xs_adv_var.assign(tf.reshape(self.xs_ph, self.xs_adv_var.shape)),
        self.ys_var.assign(self.ys_ph),
    ]
    self.setup_overshot = self.overshot.assign(self.overshot_ph)

    self.iteration_callback = None
    if iteration_callback is not None:
        xs_model = tf.reshape(self.xs_var, model_xs_shape)
        xs_adv_model = tf.reshape(self.xs_adv_var, model_xs_shape)
        self.iteration_callback = iteration_callback(xs_model, xs_adv_model)

    self.iteration = None
    self.details = {}
def __init__(self, model, batch_size, goal, distance_metric, decay_factor):
    '''
    Build the graph for an iterative gradient attack with a decayed running
    gradient term (`g_var`).

    NOTE(review): this looks like the momentum iterative method (MIM) - confirm
    against the enclosing class.

    :param model: The model to attack. Must be a `ClassifierDifferentiable` instance.
    :param batch_size: Number of examples attacked per batch.
    :param goal: Adversarial goal. Supported values are 't', 'tm', and 'ut'.
    :param distance_metric: Adversarial distance metric. Supported values are 'l_2' and 'l_inf'.
    :param decay_factor: Multiplier applied to the previous running gradient
        before the new L1-normalised gradient is added.
    :raises NotImplementedError: If `goal` or `distance_metric` is not a supported value.
    '''
    assert isinstance(model, ClassifierDifferentiable)
    # Presumably sets self.model and self.batch_size, which are used below.
    Attack.__init__(self, model=model, batch_size=batch_size)

    x_dtype = self.model.x_dtype
    xs_shape = (self.batch_size, np.prod(self.model.x_shape))
    ys_shape = (self.batch_size, )
    model_xs_shape = (self.batch_size, *self.model.x_shape)
    xs_zeros = tf.zeros(xs_shape, dtype=x_dtype)

    self.xs_ph = get_xs_ph(self.model, self.batch_size)
    self.ys_ph = get_ys_ph(self.model, self.batch_size)
    # eps / alpha are inputs copied into eps_var / alpha_var by `config_setup`,
    # so they are placeholders, not variables. They use the model's input dtype
    # so the arithmetic below also type-checks for non-float32 models.
    self.eps_ph = tf.placeholder(x_dtype, (self.batch_size, ))
    self.alpha_ph = tf.placeholder(x_dtype, (self.batch_size, ))

    self.xs_var = tf.Variable(xs_zeros)
    self.ys_var = tf.Variable(tf.zeros(ys_shape, dtype=self.model.y_dtype))
    # Running (momentum) gradient, reset by `setup`.
    self.g_var = tf.Variable(xs_zeros)
    self.eps_var = tf.Variable(tf.zeros((batch_size, ), dtype=x_dtype))
    self.alpha_var = tf.Variable(tf.zeros((batch_size, ), dtype=x_dtype))

    self.xs_adv_var = tf.Variable(xs_zeros)
    self.xs_adv = tf.reshape(self.xs_adv_var, model_xs_shape)

    logits, _ = self.model.logits_and_labels(self.xs_adv)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.ys_var, logits=logits)
    self.grad = tf.gradients(loss, self.xs_adv_var)[0]
    self.grad_l1 = tf.reduce_sum(tf.abs(self.grad), axis=1)

    self.config_setup = [
        self.eps_var.assign(self.eps_ph),
        self.alpha_var.assign(self.alpha_ph),
    ]
    xs_ph_in = tf.reshape(self.xs_ph, xs_shape)
    # Load a new batch: xs_adv starts at xs and the running gradient is re-initialised.
    self.setup = [
        self.xs_var.assign(xs_ph_in),
        self.ys_var.assign(self.ys_ph),
        self.xs_adv_var.assign(xs_ph_in),
        tf.variables_initializer([self.g_var]),
    ]

    eps = tf.expand_dims(self.eps_var, 1)
    alpha = tf.expand_dims(self.alpha_var, 1)

    # Guard the L1 normalisation like the L2 one below: an all-zero gradient
    # would otherwise write NaN into g_var and poison every later step.
    grad_l1_safe = tf.maximum(1e-12, self.grad_l1)
    g_next = decay_factor * self.g_var + \
        self.grad / tf.expand_dims(grad_l1_safe, 1)
    self.update_g_step = self.g_var.assign(g_next)

    if goal == 't' or goal == 'tm':
        # Targeted goals step against the running gradient.
        g = -self.g_var
    elif goal == 'ut':
        g = self.g_var
    else:
        raise NotImplementedError

    if distance_metric == 'l_2':
        g_norm = tf.maximum(1e-12, tf.norm(g, axis=1))
        g_unit = g / tf.expand_dims(g_norm, 1)
        d = self.xs_adv_var + alpha * g_unit - self.xs_var
        # Keep the accumulated noise within an l_2 ball of radius eps.
        xs_next = self.xs_var + tf.clip_by_norm(d, eps, axes=[1])
    elif distance_metric == 'l_inf':
        lo, hi = self.xs_var - eps, self.xs_var + eps
        d = self.xs_adv_var + alpha * tf.sign(g) - self.xs_var
        # Keep the accumulated noise within an l_inf ball of radius eps.
        xs_next = tf.clip_by_value(self.xs_var + d, lo, hi)
    else:
        raise NotImplementedError

    # Keep the example inside the model's valid range.
    xs_next = tf.clip_by_value(xs_next, self.model.x_min, self.model.x_max)
    self.step = self.xs_adv_var.assign(xs_next)

    self.goal = goal
def __init__(self, model, batch_size, goal, distance_metric, learning_rate, confidence):
    '''
    Build the graph for an optimisation-based attack: a tanh-reparameterised
    perturbation is trained with Adam on `distance + c * score`.

    NOTE(review): this looks like the C&W attack - confirm against the enclosing class.

    :param model: The model to attack. Must be a `ClassifierDifferentiable` instance.
    :param batch_size: Number of examples attacked per batch.
    :param goal: Adversarial goal. Supported values are 't', 'tm', and 'ut'.
    :param distance_metric: Adversarial distance metric. Supported values are 'l_2' and 'l_inf'.
    :param learning_rate: Learning rate for the `AdamOptimizer`.
    :param confidence: Margin added to the logit difference inside the score term.
    :raises NotImplementedError: If `goal` or `distance_metric` is not a supported value.
    '''
    assert isinstance(model, ClassifierDifferentiable)
    # Presumably sets self.model and self.batch_size, which are used below.
    Attack.__init__(self, model, batch_size)

    self.confidence = confidence

    def scale(vec, dst_lo, dst_hi, src_lo, src_hi):
        # Affine map sending [src_lo, src_hi] onto [dst_lo, dst_hi].
        slope = (dst_hi - dst_lo) / (src_hi - src_lo)
        offset = dst_lo - slope * src_lo
        return slope * vec + offset

    def scale_to_model(vec):
        # tanh output range -> model input range
        return scale(vec, self.model.x_min, self.model.x_max, -1.0, 1.0)

    def scale_to_tanh(vec):
        # model input range -> slightly inside (-1, 1), so atanh stays finite
        return scale(vec, 1e-6 - 1, 1 - 1e-6, self.model.x_min, self.model.x_max)

    model_xs_shape = (self.batch_size, *self.model.x_shape)
    flat_shape = (self.batch_size, np.prod(self.model.x_shape))
    flat_zeros = tf.zeros(flat_shape, dtype=self.model.x_dtype)

    self.xs_ph = get_xs_ph(self.model, self.batch_size)
    self.ys_ph = get_ys_ph(self.model, self.batch_size)
    # Per-example weight `c` of the score term.
    self.cs_ph = tf.placeholder(self.model.x_dtype, (self.batch_size, ))

    xs_var = tf.Variable(flat_zeros)
    ys_var = tf.Variable(tf.zeros_like(self.ys_ph))
    cs_var = tf.Variable(tf.zeros_like(self.cs_ph))

    # The optimised variable is an offset in tanh space: xs_adv = tanh(atanh(xs) + d_ws).
    d_ws = tf.Variable(flat_zeros)
    ws = tf.atanh(scale_to_tanh(xs_var)) + d_ws

    self.xs_adv = scale_to_model(tf.tanh(ws))
    self.xs_adv_output = tf.reshape(self.xs_adv, model_xs_shape)

    logits, _ = self.model.logits_and_labels(self.xs_adv_output)

    # Logit of the labelled class and the largest logit among all other classes.
    ys_one_hot = tf.one_hot(ys_var, self.model.n_class)
    logit_target = tf.reduce_sum(ys_one_hot * logits, 1)
    logit_other = (1 - ys_one_hot) * logits
    # Push the labelled class far down so reduce_max never selects it.
    logit_other = logit_other - 0.5 * self.model.x_dtype.max * ys_one_hot
    logit_other = tf.reduce_max(logit_other, 1)

    self.setup_xs = xs_var.assign(tf.reshape(self.xs_ph, flat_shape))
    self.setup_ys = ys_var.assign(self.ys_ph)
    self.setup_cs = cs_var.assign(self.cs_ph)
    self.setup_d_ws = d_ws.assign(tf.zeros_like(d_ws))

    noise = self.xs_adv - xs_var
    if distance_metric == 'l_2':
        # Squared l_2 distance.
        dists = tf.reduce_sum(tf.square(noise), axis=1)
    elif distance_metric == 'l_inf':
        dists = tf.reduce_max(tf.abs(noise), axis=1)
    else:
        raise NotImplementedError

    if goal in ('t', 'tm'):
        # Zero once the labelled class leads every other class by `confidence`.
        score = tf.maximum(0.0, logit_other - logit_target + confidence)
    elif goal == 'ut':
        # Zero once some other class leads the labelled class by `confidence`.
        score = tf.maximum(0.0, logit_target - logit_other + confidence)
    else:
        raise NotImplementedError
    self.goal = goal

    loss = dists + cs_var * score

    # Only d_ws is trained; `setup_optimizer` re-initialises Adam's own variables.
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.optimizer_step = optimizer.minimize(loss, var_list=[d_ws])
    self.setup_optimizer = tf.variables_initializer(optimizer.variables())

    self.score = score
    self.logits = logits
    self.dists = dists
def __init__(self, model, batch_size, goal, distance_metric, random_start=False):
    '''
    Build the graph for an iterative gradient attack on the cross-entropy loss.

    NOTE(review): the `random_start` option suggests this is PGD - confirm
    against the enclosing class.

    :param model: The model to attack. Must be a `ClassifierDifferentiable` instance.
    :param batch_size: Number of examples attacked per batch.
    :param goal: Adversarial goal. Supported values are 't', 'tm', and 'ut'.
    :param distance_metric: Adversarial distance metric. Supported values are 'l_2' and 'l_inf'.
    :param random_start: Currently ignored by this constructor; `setup` always
        starts the adversarial example at the original example.
    :raises NotImplementedError: If `goal` or `distance_metric` is not a supported value.
    '''
    # TODO random_start
    assert isinstance(model, ClassifierDifferentiable)
    # Presumably sets self.model and self.batch_size, which are used below.
    Attack.__init__(self, model=model, batch_size=batch_size)

    x_dtype = self.model.x_dtype
    xs_shape = (self.batch_size, np.prod(self.model.x_shape))
    ys_shape = (self.batch_size, )
    model_xs_shape = (self.batch_size, *self.model.x_shape)
    xs_zeros = tf.zeros(xs_shape, dtype=x_dtype)

    self.xs_ph = get_xs_ph(self.model, self.batch_size)
    self.ys_ph = get_ys_ph(self.model, self.batch_size)
    # eps / alpha are inputs copied into eps_var / alpha_var by `config_setup`,
    # so they are placeholders, not variables. They use the model's input dtype
    # so the arithmetic below also type-checks for non-float32 models.
    self.eps_ph = tf.placeholder(x_dtype, (self.batch_size, ))
    self.alpha_ph = tf.placeholder(x_dtype, (self.batch_size, ))

    self.xs_var = tf.Variable(xs_zeros)
    self.ys_var = tf.Variable(tf.zeros(ys_shape, dtype=self.model.y_dtype))
    self.eps_var = tf.Variable(tf.zeros((batch_size, ), dtype=x_dtype))
    self.alpha_var = tf.Variable(tf.zeros((batch_size, ), dtype=x_dtype))

    self.xs_adv_var = tf.Variable(xs_zeros)
    self.xs_adv = tf.reshape(self.xs_adv_var, model_xs_shape)

    logits, _ = self.model.logits_and_labels(self.xs_adv)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=self.ys_var, logits=logits)
    self.grad = tf.gradients(loss, self.xs_adv_var)[0]

    self.config_setup = [
        self.eps_var.assign(self.eps_ph),
        self.alpha_var.assign(self.alpha_ph),
    ]
    xs_ph_in = tf.reshape(self.xs_ph, xs_shape)
    # Load a new batch; the adversarial example starts at the original example.
    self.setup = [
        self.xs_var.assign(xs_ph_in),
        self.ys_var.assign(self.ys_ph),
        self.xs_adv_var.assign(xs_ph_in),
    ]

    eps = tf.expand_dims(self.eps_var, 1)
    alpha = tf.expand_dims(self.alpha_var, 1)

    if goal == 't' or goal == 'tm':
        # Targeted goals step against the gradient.
        grad = -self.grad
    elif goal == 'ut':
        grad = self.grad
    else:
        raise NotImplementedError

    if distance_metric == 'l_2':
        # Guard against a zero gradient before normalising.
        grad_norm = tf.maximum(1e-12, tf.norm(grad, axis=1))
        grad_unit = grad / tf.expand_dims(grad_norm, 1)
        delta = self.xs_adv_var - self.xs_var + alpha * grad_unit
        # Keep the accumulated noise within an l_2 ball of radius eps.
        xs_next = self.xs_var + tf.clip_by_norm(delta, eps, axes=[1])
    elif distance_metric == 'l_inf':
        lo, hi = self.xs_var - eps, self.xs_var + eps
        xs_next = self.xs_adv_var + alpha * tf.sign(grad)
        # Keep the accumulated noise within an l_inf ball of radius eps.
        xs_next = tf.clip_by_value(xs_next, lo, hi)
    else:
        raise NotImplementedError

    # Keep the example inside the model's valid range.
    xs_next = tf.clip_by_value(xs_next, self.model.x_min, self.model.x_max)
    self.step = self.xs_adv_var.assign(xs_next)

    self.goal = goal
def __init__(self, model, batch_size, goal, distance_metric, session,
             cw_loss_c=99999.0, confidence=0.0, learning_rate=0.01):
    '''
    Build the C&W attack graph.

    :param model: The model to attack. A `realsafe.model.ClassifierWithLogits` instance.
    :param batch_size: Batch size for the `batch_attack()` method.
    :param goal: Adversarial goals. All supported values are 't', 'tm', and 'ut'.
    :param distance_metric: Adversarial distance metric. Only 'l_2' is supported.
    :param session: The `tf.Session` to run the attack in. The `model` should be loaded into this session.
    :param cw_loss_c: The `c` parameter for `realsafe.loss.CWLoss`.
        NOTE(review): this constructor never references the value - confirm whether
        it should be forwarded to `CWLoss`.
    :param confidence: The minimum margin between the target logit and the second largest logit that we
        consider the example as adversarial.
    :param learning_rate: Learning rate for the `AdamOptimizer`.
    :raises NotImplementedError: If `goal` or `distance_metric` is not a supported value.
    '''
    self.model = model
    self.batch_size = batch_size
    self._session = session
    self.goal = goal
    self.distance_metric = distance_metric
    self.confidence = confidence

    # Examples are kept flattened as (batch_size, D).
    flat_shape = (self.batch_size, np.prod(self.model.x_shape))

    # Inputs of batch_attack().
    self.xs_ph = get_xs_ph(model, self.batch_size)
    self.ys_ph = get_ys_ph(model, self.batch_size)

    # Keep examples and labels in variables to reduce memory copies between
    # TensorFlow and Python.
    xs_var = tf.Variable(tf.zeros(shape=flat_shape, dtype=self.model.x_dtype))
    ys_var = tf.Variable(tf.zeros_like(self.ys_ph))

    # Per-example weight `c` of the score term.
    self.cs_ph = tf.placeholder(self.model.x_dtype, (self.batch_size, ))
    cs_var = tf.Variable(tf.zeros_like(self.cs_ph))

    # xs_adv = tanh(ws); the optimised variable is the offset d_ws in tanh space.
    d_ws = tf.Variable(tf.zeros(shape=flat_shape, dtype=self.model.x_dtype))
    ws = tf.atanh(self._scale_to_tanh(xs_var)) + d_ws
    self.xs_adv = self._scale_to_model(tf.tanh(ws))
    model_xs_shape = (self.batch_size, *self.model.x_shape)
    self.xs_adv_model = tf.reshape(self.xs_adv, model_xs_shape)

    # The C&W loss term.
    cw_loss = CWLoss(self.model)(self.xs_adv_model, ys_var)
    self.logits = self.model.logits(self.xs_adv_model)
    if self.goal in ('t', 'tm'):
        self.score = tf.maximum(0.0, cw_loss + confidence)
    elif self.goal == 'ut':
        self.score = tf.maximum(0.0, tf.negative(cw_loss) + confidence)
    else:
        raise NotImplementedError

    # The distance term (squared l_2).
    if self.distance_metric != 'l_2':
        raise NotImplementedError
    self.dists = tf.reduce_sum(tf.square(self.xs_adv - xs_var), axis=1)

    # Total objective, minimised with Adam over d_ws only.
    loss = self.dists + cs_var * self.score
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.optimizer_step = optimizer.minimize(loss, var_list=[d_ws])
    # Re-initialises Adam's own variables.
    self.setup_optimizer = tf.variables_initializer(optimizer.variables())

    self.setup_xs = xs_var.assign(tf.reshape(self.xs_ph, flat_shape))
    self.setup_ys = ys_var.assign(self.ys_ph)
    self.setup_cs = cs_var.assign(self.cs_ph)
    self.setup_d_ws = d_ws.assign(tf.zeros_like(d_ws))

    # Default values.
    self.iteration = 50
    self.search_steps = 2
    self.binsearch_steps = 10

    self.details = {}
    self.logger = None