def __call__(  # type: ignore
    self,
    model: Model,
    inputs: T,
    criterion: Any,
    *,
    epsilons: Union[Sequence[Union[float, None]], float, None],
    **kwargs: Any,
) -> Union[Tuple[List[T], List[T], T], Tuple[T, T, T]]:
    x, restore_type = ep.astensor_(inputs)
    del inputs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    was_iterable = True
    if not isinstance(epsilons, Iterable):
        epsilons = [epsilons]
        was_iterable = False

    N = len(x)
    K = len(epsilons)

    # None means: just minimize, no early stopping, no limit on the perturbation size
    if any(eps is None for eps in epsilons):
        early_stop = None
    else:
        early_stop = min(epsilons)

    # run the actual attack
    xp = self.run(model, x, criterion, early_stop=early_stop, **kwargs)

    xpcs = []
    success = []
    for epsilon in epsilons:
        if epsilon is None:
            xpc = xp
        else:
            xpc = self.distance.clip_perturbation(x, xp, epsilon)
        is_adv = is_adversarial(xpc)

        xpcs.append(xpc)
        success.append(is_adv)

    success_ = ep.stack(success)
    assert success_.shape == (K, N)

    xp_ = restore_type(xp)
    xpcs_ = [restore_type(xpc) for xpc in xpcs]

    if was_iterable:
        return [xp_] * K, xpcs_, restore_type(success_)
    else:
        assert len(xpcs_) == 1
        return xp_, xpcs_[0], restore_type(success_.squeeze(axis=0))
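
# Illustrative sketch (not part of the attack): the scalar-vs-sequence handling
# of `epsilons` above, reduced to plain Python. `normalize_epsilons` is a
# hypothetical helper introduced only for this example.
from typing import Iterable, List, Optional, Sequence, Tuple, Union

def normalize_epsilons(
    epsilons: Union[Sequence[Optional[float]], float, None],
) -> Tuple[List[Optional[float]], bool]:
    # wrap a scalar so downstream code can always iterate; remember whether
    # the caller passed a sequence so the result can be unwrapped again
    if isinstance(epsilons, Iterable):
        return list(epsilons), True
    return [epsilons], False

assert normalize_epsilons(0.3) == ([0.3], False)
assert normalize_epsilons([0.1, None]) == ([0.1, None], True)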
def approximate_gradients(
    self,
    is_adversarial: Callable[[ep.Tensor], ep.Tensor],
    x_advs: ep.Tensor,
    steps: int,
    delta: ep.Tensor,
) -> ep.Tensor:
    # (steps, bs, ...)
    noise_shape = tuple([steps] + list(x_advs.shape))
    if self.constraint == "l2":
        rv = ep.normal(x_advs, noise_shape)
    elif self.constraint == "linf":
        rv = ep.uniform(x_advs, low=-1, high=1, shape=noise_shape)

    # normalize each noise sample individually (keep the steps and batch dims)
    rv /= atleast_kd(ep.norms.l2(flatten(rv, keep=2), axis=-1), rv.ndim) + 1e-12

    scaled_rv = atleast_kd(ep.expand_dims(delta, 0), rv.ndim) * rv

    perturbed = ep.expand_dims(x_advs, 0) + scaled_rv
    perturbed = ep.clip(perturbed, 0, 1)

    rv = (perturbed - x_advs) / atleast_kd(ep.expand_dims(delta + 1e-8, 0), rv.ndim)

    multipliers_list: List[ep.Tensor] = []
    for step in range(steps):
        decision = is_adversarial(perturbed[step])
        multipliers_list.append(
            ep.where(
                decision,
                ep.ones(x_advs, (len(x_advs),)),
                -ep.ones(x_advs, (len(decision),)),
            )
        )
    # (steps, bs, ...)
    multipliers = ep.stack(multipliers_list, 0)

    vals = ep.where(
        ep.abs(ep.mean(multipliers, axis=0, keepdims=True)) == 1,
        multipliers,
        multipliers - ep.mean(multipliers, axis=0, keepdims=True),
    )
    grad = ep.mean(atleast_kd(vals, rv.ndim) * rv, axis=0)

    # normalize the gradient estimate per sample
    grad /= atleast_kd(ep.norms.l2(flatten(grad), axis=-1), grad.ndim) + 1e-12

    return grad
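
# NumPy sketch (assumed, not the library implementation) of the Monte-Carlo
# gradient estimate above: random unit directions are weighted by the +-1
# decisions of the model, and the mean decision is subtracted as a
# variance-reducing baseline unless all decisions agree. `phi` is a
# hypothetical decision oracle returning +1 (adversarial) or -1.
import numpy as np

def approx_gradient_sketch(phi, x, steps=100, delta=0.1, seed=0):
    rng = np.random.default_rng(seed)
    rv = rng.normal(size=(steps, *x.shape))
    norms = np.linalg.norm(rv.reshape(steps, -1), axis=1) + 1e-12
    rv /= norms.reshape(steps, *([1] * x.ndim))  # unit-norm directions
    decisions = np.array([phi(x + delta * v) for v in rv], dtype=np.float64)
    baseline = decisions.mean()
    vals = decisions if abs(baseline) == 1 else decisions - baseline
    grad = (vals.reshape(steps, *([1] * x.ndim)) * rv).mean(axis=0)
    return grad / (np.linalg.norm(grad) + 1e-12)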
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)

    min_, max_ = model.bounds

    logits = model(x)
    classes = logits.argsort(axis=-1).flip(axis=-1)
    if self.candidates is None:
        candidates = logits.shape[-1]  # pragma: no cover
    else:
        candidates = min(self.candidates, logits.shape[-1])
        if not candidates >= 2:
            raise ValueError(  # pragma: no cover
                f"expected the model output to have at least 2 classes, got {logits.shape[-1]}"
            )
        logging.info(f"Only testing the top-{candidates} classes")
        classes = classes[:, :candidates]

    N = len(x)
    rows = range(N)

    loss_fun = self._get_loss_fn(model, classes)
    loss_aux_and_grad = ep.value_and_grad_fn(x, loss_fun, has_aux=True)

    x0 = x
    p_total = ep.zeros_like(x)
    for _ in range(self.steps):
        # let's first get the logits using k = 1 to see if we are done
        diffs = [loss_aux_and_grad(x, 1)]
        _, (_, logits), _ = diffs[0]
        is_adv = criterion(x, logits)
        if is_adv.all():
            break

        # then run all the other k's as well
        # we could avoid repeated forward passes and only repeat
        # the backward pass, but this cannot currently be done in eagerpy
        diffs += [loss_aux_and_grad(x, k) for k in range(2, candidates)]

        # we don't need the logits
        diffs_ = [(losses, grad) for _, (losses, _), grad in diffs]
        losses = ep.stack([lo for lo, _ in diffs_], axis=1)
        grads = ep.stack([g for _, g in diffs_], axis=1)
        assert losses.shape == (N, candidates - 1)
        assert grads.shape == (N, candidates - 1) + x0.shape[1:]

        # calculate the distances
        distances = self.get_distances(losses, grads)
        assert distances.shape == (N, candidates - 1)

        # determine the best directions
        best = distances.argmin(axis=1)
        distances = distances[rows, best]
        losses = losses[rows, best]
        grads = grads[rows, best]
        assert distances.shape == (N,)
        assert losses.shape == (N,)
        assert grads.shape == x0.shape

        # apply perturbation
        distances = distances + 1e-4  # for numerical stability
        p_step = self.get_perturbations(distances, grads)
        assert p_step.shape == x0.shape

        p_total += p_step
        # don't do anything for those that are already adversarial
        x = ep.where(
            atleast_kd(is_adv, x.ndim), x, x0 + (1.0 + self.overshoot) * p_total
        )
        x = ep.clip(x, min_, max_)

    return restore_type(x)
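
# Hypothetical usage of a DeepFool-style attack through the Foolbox 3 API;
# `net`, `images`, and `labels` are placeholders the caller must provide.
import foolbox as fb

fmodel = fb.PyTorchModel(net, bounds=(0, 1))
attack = fb.attacks.L2DeepFoolAttack(steps=50, candidates=10)
raw = attack.run(fmodel, images, fb.criteria.Misclassification(labels))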
def test_list_stack(t: Tensor) -> None:
    t2 = ep.stack(list(t))
    assert t.shape == t2.shape
    assert (t == t2).all()
def test_stack(t1: Tensor, t2: Tensor, axis: int) -> Tensor:
    return ep.stack([t1, t2], axis=axis)
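
# NumPy-backed illustration (assumed setup) of the stacking behavior these two
# tests cover: iterating a tensor yields its rows, and ep.stack along a new
# axis mirrors np.stack.
import eagerpy as ep
import numpy as np

t = ep.astensor(np.arange(6, dtype=np.float32).reshape(2, 3))
assert ep.stack(list(t)).shape == (2, 3)            # restacking the rows restores the shape
assert ep.stack([t, t], axis=1).shape == (2, 2, 3)  # a new axis is inserted at position 1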
def run(
    self,
    model: Model,
    inputs: T,
    criterion: TargetedMisclassification,
    *,
    epsilon: float,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    N = len(x)

    if isinstance(criterion, TargetedMisclassification):
        classes = criterion.target_classes
    else:
        raise ValueError("unsupported criterion")

    if classes.shape != (N,):
        raise ValueError(
            f"expected target_classes to have shape ({N},), got {classes.shape}"
        )

    noise_shape: Union[Tuple[int, int, int, int], Tuple[int, ...]]
    channel_axis: Optional[int] = None
    if self.reduced_dims is not None:
        if x.ndim != 4:
            raise NotImplementedError(
                "only implemented for inputs with two spatial dimensions"
                " (and one channel and one batch dimension)"
            )

        if self.channel_axis is None:
            maybe_axis = get_channel_axis(model, x.ndim)
            if maybe_axis is None:
                raise ValueError(
                    "cannot infer the data_format from the model, please"
                    " specify channel_axis when initializing the attack"
                )
            else:
                channel_axis = maybe_axis
        else:
            channel_axis = self.channel_axis % x.ndim

        if channel_axis == 1:
            noise_shape = (x.shape[1], *self.reduced_dims)
        elif channel_axis == 3:
            noise_shape = (*self.reduced_dims, x.shape[3])
        else:
            raise ValueError(
                f"expected 'channel_axis' to be 1 or 3, got {channel_axis}"
            )
    else:
        noise_shape = x.shape[1:]  # pragma: no cover

    def is_adversarial(logits: ep.TensorType) -> ep.TensorType:
        return ep.argmax(logits, 1) == classes

    num_plateaus = ep.zeros(x, len(x))
    mutation_probability = (
        ep.ones_like(num_plateaus) * self.min_mutation_probability
    )
    mutation_range = ep.ones_like(num_plateaus) * self.min_mutation_range

    noise_pops = ep.uniform(
        x, (N, self.population, *noise_shape), -epsilon, epsilon
    )

    def calculate_fitness(logits: ep.TensorType) -> ep.TensorType:
        first = logits[range(N), classes]
        second = ep.log(ep.exp(logits).sum(1) - first)
        return first - second

    n_its_wo_change = ep.zeros(x, (N,))
    for step in range(self.steps):
        fitness_l, is_adv_l = [], []

        for i in range(self.population):
            it = self.apply_noise(x, noise_pops[:, i], epsilon, channel_axis)
            logits = model(it)
            f = calculate_fitness(logits)
            a = is_adversarial(logits)
            fitness_l.append(f)
            is_adv_l.append(a)

        fitness = ep.stack(fitness_l)
        is_adv = ep.stack(is_adv_l, 1)
        elite_idxs = ep.argmax(fitness, 0)

        elite_noise = noise_pops[range(N), elite_idxs]
        is_adv = is_adv[range(N), elite_idxs]

        # early stopping
        if is_adv.all():
            return restore_type(  # pragma: no cover
                self.apply_noise(x, elite_noise, epsilon, channel_axis)
            )

        probs = ep.softmax(fitness / self.sampling_temperature, 0)
        parents_idxs = np.stack(
            [
                self.choice(
                    self.population,
                    2 * self.population - 2,
                    replace=True,
                    p=probs[:, i],
                )
                for i in range(N)
            ],
            1,
        )

        mutations = [
            ep.uniform(
                x,
                noise_shape,
                -mutation_range[i].item() * epsilon,
                mutation_range[i].item() * epsilon,
            )
            for i in range(N)
        ]

        new_noise_pops = [elite_noise]
        for i in range(0, self.population - 1):
            parents_1 = noise_pops[range(N), parents_idxs[2 * i]]
            parents_2 = noise_pops[range(N), parents_idxs[2 * i + 1]]

            # calculate crossover
            p = probs[parents_idxs[2 * i], range(N)] / (
                probs[parents_idxs[2 * i], range(N)]
                + probs[parents_idxs[2 * i + 1], range(N)]
            )
            p = atleast_kd(p, x.ndim)
            p = ep.tile(p, (1, *noise_shape))

            crossover_mask = ep.uniform(p, p.shape, 0, 1) < p
            children = ep.where(crossover_mask, parents_1, parents_2)

            # calculate mutation
            mutation_mask = ep.uniform(children, children.shape)
            mutation_mask = mutation_mask <= atleast_kd(
                mutation_probability, children.ndim
            )
            children = ep.where(mutation_mask, children + mutations[i], children)

            # project back to epsilon range
            children = ep.clip(children, -epsilon, epsilon)

            new_noise_pops.append(children)

        noise_pops = ep.stack(new_noise_pops, 1)

        # increase num_plateaus if fitness does not improve
        # for 100 consecutive steps
        n_its_wo_change = ep.where(
            elite_idxs == 0, n_its_wo_change + 1, ep.zeros_like(n_its_wo_change)
        )
        num_plateaus = ep.where(
            n_its_wo_change >= 100, num_plateaus + 1, num_plateaus
        )
        n_its_wo_change = ep.where(
            n_its_wo_change >= 100,
            ep.zeros_like(n_its_wo_change),
            n_its_wo_change,
        )

        mutation_probability = ep.maximum(
            self.min_mutation_probability,
            0.5 * ep.exp(math.log(0.9) * ep.ones_like(num_plateaus) * num_plateaus),
        )
        mutation_range = ep.maximum(
            self.min_mutation_range,
            0.5 * ep.exp(math.log(0.9) * ep.ones_like(num_plateaus) * num_plateaus),
        )

    return restore_type(self.apply_noise(x, elite_noise, epsilon, channel_axis))
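
# NumPy sketch (simplified, hypothetical shapes) of the selection, crossover,
# and mutation step above for a single input: `pop` is (population, dim) and
# `fitness` is (population,). `next_generation` is an illustrative helper, not
# part of the attack.
import numpy as np

def next_generation(pop, fitness, epsilon, temperature=0.3, mutation_prob=0.1, seed=0):
    rng = np.random.default_rng(seed)
    probs = np.exp(fitness / temperature)
    probs /= probs.sum()
    children = [pop[np.argmax(fitness)]]  # elitism: the best member survives unchanged
    for _ in range(len(pop) - 1):
        i, j = rng.choice(len(pop), size=2, replace=True, p=probs)
        # per-coordinate crossover weighted by the parents' selection probabilities
        p = probs[i] / (probs[i] + probs[j])
        child = np.where(rng.uniform(size=pop.shape[1]) < p, pop[i], pop[j])
        # random mutation, then projection back into the epsilon ball
        mutation = rng.uniform(-epsilon, epsilon, size=pop.shape[1])
        mask = rng.uniform(size=pop.shape[1]) <= mutation_prob
        child = np.where(mask, child + mutation, child)
        children.append(np.clip(child, -epsilon, epsilon))
    return np.stack(children)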
def __call__(
    self,
    inputs,
    labels,
    *,
    p,
    candidates=10,
    overshoot=0.02,
    steps=50,
    loss="logits",
):
    """
    Parameters
    ----------
    p : int or float
        Lp-norm that should be minimized, must be 2 or np.inf.
    candidates : int
        Limit on the number of the most likely classes that should be
        considered. A small value is usually sufficient and much faster.
    overshoot : float
        How much to overshoot the decision boundary.
    steps : int
        Maximum number of steps to perform.
    """
    if not (1 <= p <= np.inf):
        raise ValueError
    if p not in [2, np.inf]:
        raise NotImplementedError

    min_, max_ = self.model.bounds()

    inputs = ep.astensor(inputs)
    labels = ep.astensor(labels)

    N = len(inputs)

    logits = self.model.forward(inputs)
    candidates = min(candidates, logits.shape[-1])

    classes = logits.argsort(axis=-1).flip(axis=-1)
    if candidates:
        assert candidates >= 2
        logging.info(f"Only testing the top-{candidates} classes")
        classes = classes[:, :candidates]

    i0 = classes[:, 0]
    rows = ep.arange(inputs, N)

    if loss == "logits":

        def loss_fun(x: ep.Tensor, k: int) -> ep.Tensor:
            logits = self.model.forward(x)
            ik = classes[:, k]
            l0 = logits[rows, i0]
            lk = logits[rows, ik]
            loss = lk - l0
            return loss.sum(), (loss, logits)

    elif loss == "crossentropy":

        def loss_fun(x: ep.Tensor, k: int) -> ep.Tensor:
            logits = self.model.forward(x)
            ik = classes[:, k]
            l0 = -ep.crossentropy(logits, i0)
            lk = -ep.crossentropy(logits, ik)
            loss = lk - l0
            return loss.sum(), (loss, logits)

    else:
        raise ValueError(
            f"expected loss to be 'logits' or 'crossentropy', got '{loss}'"
        )

    loss_aux_and_grad = ep.value_and_grad_fn(inputs, loss_fun, has_aux=True)

    x = x0 = inputs
    p_total = ep.zeros_like(x)
    for step in range(steps):
        # let's first get the logits using k = 1 to see if we are done
        diffs = [loss_aux_and_grad(x, 1)]
        _, (_, logits), _ = diffs[0]
        is_adv = logits.argmax(axis=-1) != labels
        if is_adv.all():
            break

        # then run all the other k's as well
        # we could avoid repeated forward passes and only repeat
        # the backward pass, but this cannot currently be done in eagerpy
        diffs += [loss_aux_and_grad(x, k) for k in range(2, candidates)]

        # we don't need the logits
        diffs_ = [(losses, grad) for _, (losses, _), grad in diffs]
        losses = ep.stack([lo for lo, _ in diffs_], axis=1)
        grads = ep.stack([g for _, g in diffs_], axis=1)
        assert losses.shape == (N, candidates - 1)
        assert grads.shape == (N, candidates - 1) + x0.shape[1:]

        # calculate the distances
        distances = self.get_distances(losses, grads)
        assert distances.shape == (N, candidates - 1)

        # determine the best directions
        best = distances.argmin(axis=1)
        distances = distances[rows, best]
        losses = losses[rows, best]
        grads = grads[rows, best]
        assert distances.shape == (N,)
        assert losses.shape == (N,)
        assert grads.shape == x0.shape

        # apply perturbation
        distances = distances + 1e-4  # for numerical stability
        p_step = self.get_perturbations(distances, grads)
        assert p_step.shape == x0.shape

        p_total += p_step
        # don't do anything for those that are already adversarial
        x = ep.where(
            atleast_kd(is_adv, x.ndim), x, x0 + (1.0 + overshoot) * p_total
        )
        x = ep.clip(x, min_, max_)

    return x.tensor
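
# NumPy sketch of L2 variants of the `get_distances` / `get_perturbations`
# helpers referenced above (assumed stand-ins, simplified to the
# already-selected best direction, i.e. losses of shape (N,) and grads shaped
# like the inputs): the distance to the linearized boundary is |f| / ||grad f||,
# and the step moves that far along the gradient direction.
import numpy as np

def l2_distances(losses, grads):
    flat = grads.reshape(len(grads), -1)
    return np.abs(losses) / (np.linalg.norm(flat, axis=1) + 1e-8)

def l2_perturbations(distances, grads):
    flat = grads.reshape(len(grads), -1)
    norms = np.linalg.norm(flat, axis=1) + 1e-8
    scale = (distances / norms).reshape(-1, *([1] * (grads.ndim - 1)))
    return scale * grads  # perturbation with L2 norm equal to `distances`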
def __call__(  # type: ignore
    self,
    model: Model,
    inputs: T,
    criterion: Any,
    *,
    epsilons: Union[Sequence[Union[float, None]], float, None],
    **kwargs: Any,
) -> Union[Tuple[List[T], List[T], T], Tuple[T, T, T]]:
    x, restore_type = ep.astensor_(inputs)
    del inputs

    verify_input_bounds(x, model)

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    was_iterable = True
    if not isinstance(epsilons, Iterable):
        epsilons = [epsilons]
        was_iterable = False

    N = len(x)
    K = len(epsilons)

    # None means: just minimize, no early stopping, no limit on the perturbation size
    if any(eps is None for eps in epsilons):
        # TODO: implement a binary search
        raise NotImplementedError(
            "FixedEpsilonAttack subclasses do not yet support None in epsilons"
        )
    real_epsilons = [eps for eps in epsilons if eps is not None]
    del epsilons

    xps = []
    xpcs = []
    success = []
    for epsilon in real_epsilons:
        xp = self.run(model, x, criterion, epsilon=epsilon, **kwargs)

        # clip to epsilon because we don't really know what the attack returns;
        # alternatively, we could check if the perturbation is at most epsilon,
        # but then we would need to handle numerical violations
        xpc = self.distance.clip_perturbation(x, xp, epsilon)
        is_adv = is_adversarial(xpc)

        xps.append(xp)
        xpcs.append(xpc)
        success.append(is_adv)

    success_ = ep.stack(success)
    assert success_.shape == (K, N)

    xps_ = [restore_type(xp) for xp in xps]
    xpcs_ = [restore_type(xpc) for xpc in xpcs]

    if was_iterable:
        return xps_, xpcs_, restore_type(success_)
    else:
        assert len(xps_) == 1
        assert len(xpcs_) == 1
        return xps_[0], xpcs_[0], restore_type(success_.squeeze(axis=0))
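
# Minimal sketch of an Linf flavor of `distance.clip_perturbation` (an assumed
# NumPy stand-in, not the library implementation): the perturbation is projected
# back into the epsilon ball around x, which is why `success` above is measured
# on the clipped inputs.
import numpy as np

def clip_perturbation_linf(x, xp, epsilon):
    return x + np.clip(xp - x, -epsilon, epsilon)

x = np.zeros(4)
xp = np.array([0.5, -0.2, 0.05, 0.0])
assert np.abs(clip_perturbation_linf(x, xp, 0.1) - x).max() <= 0.1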
def forward(self, x):
    # average the predictions of all ensemble members
    res = []
    for model in self.models:
        res.append(model(x))
    return ep.stack(res, 1).mean(1)
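
# Hypothetical usage with plain PyTorch modules, showing the same reduction:
# stack the member outputs along a new axis and average over it.
import torch

members = [torch.nn.Linear(8, 3) for _ in range(3)]
x = torch.randn(5, 8)
logits = torch.stack([m(x) for m in members], 1).mean(1)
assert logits.shape == (5, 3)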
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    starting_points: Optional[T] = None,
    **kwargs: Any,
) -> T:
    raise_if_kwargs(kwargs)
    originals, restore_type = ep.astensor_(inputs)
    del inputs, kwargs

    verify_input_bounds(originals, model)

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    if starting_points is None:
        init_attack: MinimizationAttack
        if self.init_attack is None:
            init_attack = LinearSearchBlendedUniformNoiseAttack(steps=50)
            logging.info(
                f"Neither starting_points nor init_attack given. Falling"
                f" back to {init_attack!r} for initialization."
            )
        else:
            init_attack = self.init_attack
        # TODO: use call and support all types of attacks (once early_stop is
        # possible in __call__)
        x_advs = init_attack.run(model, originals, criterion, early_stop=early_stop)
    else:
        x_advs = ep.astensor(starting_points)

    is_adv = is_adversarial(x_advs)
    if not is_adv.all():
        failed = is_adv.logical_not().float32().sum()
        if starting_points is None:
            raise ValueError(
                f"init_attack failed for {failed} of {len(is_adv)} inputs"
            )
        else:
            raise ValueError(
                f"{failed} of {len(is_adv)} starting_points are not adversarial"
            )
    del starting_points

    tb = TensorBoard(logdir=self.tensorboard)

    # Project the initialization to the boundary.
    x_advs = self._binary_search(is_adversarial, originals, x_advs)
    assert ep.all(is_adversarial(x_advs))

    distances = self.distance(originals, x_advs)

    for step in range(self.steps):
        delta = self.select_delta(originals, distances, step)

        # Choose number of gradient estimation steps.
        num_gradient_estimation_steps = int(
            min([self.initial_num_evals * math.sqrt(step + 1), self.max_num_evals])
        )

        gradients = self.approximate_gradients(
            is_adversarial, x_advs, num_gradient_estimation_steps, delta
        )

        if self.constraint == "linf":
            update = ep.sign(gradients)
        else:
            update = gradients

        if self.stepsize_search == "geometric_progression":
            # find step size
            epsilons = distances / math.sqrt(step + 1)

            while True:
                x_advs_proposals = ep.clip(
                    x_advs + atleast_kd(epsilons, x_advs.ndim) * update, 0, 1
                )
                success = is_adversarial(x_advs_proposals)
                epsilons = ep.where(success, epsilons, epsilons / 2.0)

                if ep.all(success):
                    break

            # Update the sample.
            x_advs = ep.clip(
                x_advs + atleast_kd(epsilons, update.ndim) * update, 0, 1
            )
            assert ep.all(is_adversarial(x_advs))

            # Binary search to return to the boundary.
            x_advs = self._binary_search(is_adversarial, originals, x_advs)
            assert ep.all(is_adversarial(x_advs))

        elif self.stepsize_search == "grid_search":
            # Grid search for stepsize.
            epsilons_grid = ep.expand_dims(
                ep.from_numpy(
                    distances,
                    np.logspace(-4, 0, num=20, endpoint=True, dtype=np.float32),
                ),
                1,
            ) * ep.expand_dims(distances, 0)

            proposals_list = []

            for epsilons in epsilons_grid:
                x_advs_proposals = (
                    x_advs + atleast_kd(epsilons, update.ndim) * update
                )
                x_advs_proposals = ep.clip(x_advs_proposals, 0, 1)

                mask = is_adversarial(x_advs_proposals)

                x_advs_proposals = self._binary_search(
                    is_adversarial, originals, x_advs_proposals
                )

                # only use new values where initial guess was already adversarial
                x_advs_proposals = ep.where(
                    atleast_kd(mask, x_advs.ndim), x_advs_proposals, x_advs
                )

                proposals_list.append(x_advs_proposals)

            proposals = ep.stack(proposals_list, 0)
            proposals_distances = self.distance(
                ep.expand_dims(originals, 0), proposals
            )
            minimal_idx = ep.argmin(proposals_distances, 0)

            x_advs = proposals[minimal_idx]

        distances = self.distance(originals, x_advs)

        # log stats
        tb.histogram("norms", distances, step)

    return restore_type(x_advs)
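
# NumPy sketch of the geometric-progression step-size search above, reduced to
# a single sample (assumed simplification): the step is halved until the
# proposal stays adversarial. `is_adv` is a hypothetical boolean oracle.
import numpy as np

def geometric_stepsize(is_adv, x_adv, update, epsilon):
    while not is_adv(np.clip(x_adv + epsilon * update, 0, 1)):
        epsilon /= 2.0
    return epsilon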