def project(self, x: ep.Tensor, x0: ep.Tensor, epsilon: ep.Tensor) -> ep.Tensor:
    """L0 projection: keep only the ``epsilon`` largest-magnitude
    components of ``x - x0`` per sample and zero out the rest."""
    perturbation = flatten(x - x0)
    magnitudes = abs(perturbation)
    k = epsilon.astype(int)
    batch = range(perturbation.shape[0])
    # per-sample index of the k-th largest magnitude (the cutoff value)
    cutoff_idx = ep.argsort(magnitudes, axis=-1)[batch, -k]
    # broadcast each sample's cutoff across all of its features
    cutoff = (ep.ones_like(perturbation).T * magnitudes[batch, cutoff_idx]).T
    sparse = ep.where(magnitudes >= cutoff, perturbation, 0)
    return x0 + sparse.reshape(x0.shape).astype(x0.dtype)
def mid_points(
    self,
    x0: ep.Tensor,
    x1: ep.Tensor,
    epsilons: ep.Tensor,
    bounds: Tuple[float, float],
):
    """Return a point between x0 and x1.

    epsilons == 0 returns x0 and epsilons == 1 returns x1; for this
    (L0) distance, epsilons is the fraction of features to keep.
    ``bounds`` is accepted for interface compatibility and unused here.
    """
    # total number of features per sample, as a float tensor
    total_features = flatten(ep.ones_like(x0)).bool().sum(axis=1).float32()
    return self.project(x1, x0, total_features * epsilons)
def select_delta(self, originals: ep.Tensor, distances: ep.Tensor, step: int) -> ep.Tensor:
    """Choose the per-sample perturbation size delta for this step.

    The first step uses a fixed fraction (0.1) of the distance scale;
    later steps scale by gamma and the input dimensionality d, with the
    exponent depending on whether the constraint is linf or not.
    """
    if step == 0:
        return 0.1 * ep.ones_like(distances)
    d = np.prod(originals.shape[1:])
    if self.constraint == "linf":
        theta = self.gamma / (d * d)
        return d * theta * distances
    theta = self.gamma / (d * np.sqrt(d))
    return np.sqrt(d) * theta * distances
def test_logical_and_manual(t: Tensor) -> None:
    """logical_and with an all-True mask must leave the condition unchanged."""
    all_true = ep.ones_like(t).bool()
    condition = t < 3
    assert (ep.logical_and(condition, all_true) == condition).all()
def test_ones_like(t: Tensor) -> Tensor:
    """Return ones_like(t); the surrounding framework checks the result."""
    return ep.ones_like(t)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: TargetedMisclassification,
    *,
    epsilon: float,
    **kwargs: Any,
) -> T:
    """Run the genetic attack: evolve a population of noise tensors
    within the epsilon ball until every input is adversarial or
    ``self.steps`` generations have passed.

    Parameters
    ----------
    model : the model to attack
    inputs : clean inputs
    criterion : must be a TargetedMisclassification
    epsilon : maximum absolute noise value (linf ball radius)

    Raises
    ------
    ValueError : unsupported criterion, bad target_classes shape, or
        an unexpected channel_axis
    NotImplementedError : reduced_dims given for non-4D inputs
    """
    raise_if_kwargs(kwargs)
    x, restore_type = ep.astensor_(inputs)
    del inputs, kwargs
    N = len(x)

    if isinstance(criterion, TargetedMisclassification):
        classes = criterion.target_classes
    else:
        raise ValueError("unsupported criterion")

    if classes.shape != (N, ):
        raise ValueError(
            f"expected target_classes to have shape ({N},), got {classes.shape}"
        )

    noise_shape: Union[Tuple[int, int, int, int], Tuple[int, ...]]
    channel_axis: Optional[int] = None
    if self.reduced_dims is not None:
        if x.ndim != 4:
            raise NotImplementedError(
                "only implemented for inputs with two spatial dimensions"
                " (and one channel and one batch dimension)")
        if self.channel_axis is None:
            maybe_axis = get_channel_axis(model, x.ndim)
            if maybe_axis is None:
                raise ValueError(
                    "cannot infer the data_format from the model, please"
                    " specify channel_axis when initializing the attack")
            else:
                channel_axis = maybe_axis
        else:
            channel_axis = self.channel_axis % x.ndim

        if channel_axis == 1:
            noise_shape = (x.shape[1], *self.reduced_dims)
        elif channel_axis == 3:
            noise_shape = (*self.reduced_dims, x.shape[3])
        else:
            # BUG FIX: this was a plain string, so {channel_axis} was
            # never interpolated into the message
            raise ValueError(
                f"expected 'channel_axis' to be 1 or 3, got {channel_axis}")
    else:
        noise_shape = x.shape[1:]  # pragma: no cover

    def is_adversarial(logits: ep.TensorType) -> ep.TensorType:
        return ep.argmax(logits, 1) == classes

    num_plateaus = ep.zeros(x, len(x))
    mutation_probability = (ep.ones_like(num_plateaus) *
                            self.min_mutation_probability)
    mutation_range = ep.ones_like(num_plateaus) * self.min_mutation_range

    noise_pops = ep.uniform(x, (N, self.population, *noise_shape), -epsilon,
                            epsilon)

    def calculate_fitness(logits: ep.TensorType) -> ep.TensorType:
        # margin between the target logit and log-sum-exp of the rest
        first = logits[range(N), classes]
        second = ep.log(ep.exp(logits).sum(1) - first)
        return first - second

    n_its_wo_change = ep.zeros(x, (N, ))
    for step in range(self.steps):
        # evaluate fitness and adversarial status of the whole population
        fitness_l, is_adv_l = [], []
        for i in range(self.population):
            it = self.apply_noise(x, noise_pops[:, i], epsilon, channel_axis)
            logits = model(it)
            f = calculate_fitness(logits)
            a = is_adversarial(logits)
            fitness_l.append(f)
            is_adv_l.append(a)

        fitness = ep.stack(fitness_l)
        is_adv = ep.stack(is_adv_l, 1)
        elite_idxs = ep.argmax(fitness, 0)

        elite_noise = noise_pops[range(N), elite_idxs]
        is_adv = is_adv[range(N), elite_idxs]

        # early stopping
        if is_adv.all():
            return restore_type(  # pragma: no cover
                self.apply_noise(x, elite_noise, epsilon, channel_axis))

        # sample parents proportionally to softmax fitness
        probs = ep.softmax(fitness / self.sampling_temperature, 0)
        parents_idxs = np.stack(
            [
                self.choice(
                    self.population,
                    2 * self.population - 2,
                    replace=True,
                    p=probs[:, i],
                ) for i in range(N)
            ],
            1,
        )
        mutations = [
            ep.uniform(
                x,
                noise_shape,
                -mutation_range[i].item() * epsilon,
                mutation_range[i].item() * epsilon,
            ) for i in range(N)
        ]

        # keep the elite unchanged, breed the rest of the population
        new_noise_pops = [elite_noise]
        for i in range(0, self.population - 1):
            parents_1 = noise_pops[range(N), parents_idxs[2 * i]]
            parents_2 = noise_pops[range(N), parents_idxs[2 * i + 1]]

            # calculate crossover: pick each feature from parent 1 with
            # probability proportional to its selection probability
            p = probs[parents_idxs[2 * i], range(N)] / (
                probs[parents_idxs[2 * i], range(N)] +
                probs[parents_idxs[2 * i + 1], range(N)])
            p = atleast_kd(p, x.ndim)
            p = ep.tile(p, (1, *noise_shape))

            crossover_mask = ep.uniform(p, p.shape, 0, 1) < p
            children = ep.where(crossover_mask, parents_1, parents_2)

            # calculate mutation
            mutation_mask = ep.uniform(children, children.shape)
            mutation_mask = mutation_mask <= atleast_kd(
                mutation_probability, children.ndim)
            children = ep.where(mutation_mask, children + mutations[i],
                                children)

            # project back to epsilon range
            children = ep.clip(children, -epsilon, epsilon)

            new_noise_pops.append(children)

        noise_pops = ep.stack(new_noise_pops, 1)

        # increase num_plateaus if fitness does not improve
        # for 100 consecutive steps
        n_its_wo_change = ep.where(elite_idxs == 0, n_its_wo_change + 1,
                                   ep.zeros_like(n_its_wo_change))
        num_plateaus = ep.where(n_its_wo_change >= 100, num_plateaus + 1,
                                num_plateaus)
        n_its_wo_change = ep.where(n_its_wo_change >= 100,
                                   ep.zeros_like(n_its_wo_change),
                                   n_its_wo_change)

        # anneal mutation parameters exponentially with plateau count,
        # but never below the configured minima
        mutation_probability = ep.maximum(
            self.min_mutation_probability,
            0.5 * ep.exp(
                math.log(0.9) * ep.ones_like(num_plateaus) * num_plateaus),
        )
        mutation_range = ep.maximum(
            self.min_mutation_range,
            0.5 * ep.exp(
                math.log(0.9) * ep.ones_like(num_plateaus) * num_plateaus),
        )

    return restore_type(
        self.apply_noise(x, elite_noise, epsilon, channel_axis))
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    starting_points: Optional[T] = None,
    **kwargs: Any,
) -> T:
    """Refine adversarials of a binarizing model: snap every pixel either
    back to its clean value or to the nearest value just across the
    binarization threshold, then verify adversarial status is unchanged."""
    raise_if_kwargs(kwargs)
    if starting_points is None:
        raise ValueError("BinarizationRefinementAttack requires starting_points")

    (o, x), restore_type = ep.astensors_(inputs, starting_points)
    del inputs, starting_points, kwargs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    # default threshold: midpoint of the model's input bounds
    threshold = self.threshold
    if threshold is None:
        min_, max_ = model.bounds
        threshold = (min_ + max_) / 2.0

    assert o.dtype == x.dtype
    nptype = o.reshape(-1)[0].numpy().dtype.type
    if nptype not in [np.float16, np.float32, np.float64]:
        raise ValueError(  # pragma: no cover
            f"expected dtype to be float16, float32 or float64, found '{nptype}'"
        )

    threshold = nptype(threshold)
    offset = nptype(1.0)

    # lower_ is the largest value of the lower interval, upper_ the
    # smallest value of the upper interval; they differ by one ulp
    if self.included_in == "lower":
        lower_ = threshold
        upper_ = np.nextafter(threshold, threshold + offset)
    elif self.included_in == "upper":
        lower_ = np.nextafter(threshold, threshold - offset)
        upper_ = threshold
    else:
        raise ValueError(
            f"expected included_in to be 'lower' or 'upper', found '{self.included_in}'"
        )
    assert lower_ < upper_

    # start from NaN so the assertion below catches unhandled pixels
    p = ep.full_like(o, ep.nan)
    lower = ep.ones_like(o) * lower_
    upper = ep.ones_like(o) * upper_

    # same side as the clean pixel -> restore clean value;
    # opposite side -> move just across the threshold
    for mask, value in (
        (ep.logical_and(o <= lower, x <= lower), o),
        (ep.logical_and(o <= lower, x >= upper), upper),
        (ep.logical_and(o >= upper, x <= lower), lower),
        (ep.logical_and(o >= upper, x >= upper), o),
    ):
        p = ep.where(mask, value, p)

    assert not ep.any(ep.isnan(p))

    is_adv1 = is_adversarial(x)
    is_adv2 = is_adversarial(p)
    if (is_adv1 != is_adv2).any():
        raise ValueError(
            "The specified threshold does not match what is done by the model."
        )
    return restore_type(p)
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Criterion, T],
    *,
    early_stop: Optional[float] = None,
    starting_points: Optional[T] = None,
    **kwargs: Any,
) -> T:
    """For models that preprocess their inputs by binarizing the
    inputs, this attack can improve adversarials found by other
    attacks. It does this by utilizing information about the
    binarization and mapping values to the corresponding value in
    the clean input or to the right side of the threshold.

    Parameters
    ----------
    threshold : float
        The treshold used by the models binarization. If none,
        defaults to (model.bounds()[1] - model.bounds()[0]) / 2.
    included_in : str
        Whether the threshold value itself belongs to the lower or
        upper interval.
    """
    raise_if_kwargs(kwargs)
    # this attack refines existing adversarials; starting_points is mandatory
    if starting_points is None:
        raise ValueError(
            "BinarizationRefinementAttack requires starting_points")
    # o = clean originals, x = adversarial starting points
    (o, x), restore_type = ep.astensors_(inputs, starting_points)
    del inputs, starting_points, kwargs

    criterion = get_criterion(criterion)
    is_adversarial = get_is_adversarial(criterion, model)

    # default threshold: midpoint of the model's input bounds
    if self.threshold is None:
        min_, max_ = model.bounds
        threshold = (min_ + max_) / 2.0
    else:
        threshold = self.threshold

    assert o.dtype == x.dtype

    # concrete numpy float type, needed for exact one-ulp arithmetic below
    nptype = o.reshape(-1)[0].numpy().dtype.type
    if nptype not in [np.float16, np.float32, np.float64]:
        raise ValueError(  # pragma: no cover
            f"expected dtype to be float16, float32 or float64, found '{nptype}'"
        )

    threshold = nptype(threshold)
    offset = nptype(1.0)

    # lower_ is the largest value in the lower interval, upper_ the
    # smallest value in the upper interval; they differ by one ulp
    if self.included_in == "lower":
        lower_ = threshold
        upper_ = np.nextafter(threshold, threshold + offset)
    elif self.included_in == "upper":
        lower_ = np.nextafter(threshold, threshold - offset)
        upper_ = threshold
    else:
        raise ValueError(
            f"expected included_in to be 'lower' or 'upper', found '{self.included_in}'"
        )
    assert lower_ < upper_

    # start from NaN so the assertion below catches any unhandled pixel
    p = ep.full_like(o, ep.nan)

    lower = ep.ones_like(o) * lower_
    upper = ep.ones_like(o) * upper_

    # same side as the clean pixel -> restore the clean value
    indices = ep.logical_and(o <= lower, x <= lower)
    p = ep.where(indices, o, p)

    # opposite side -> move just across the threshold
    indices = ep.logical_and(o <= lower, x >= upper)
    p = ep.where(indices, upper, p)

    indices = ep.logical_and(o >= upper, x <= lower)
    p = ep.where(indices, lower, p)

    indices = ep.logical_and(o >= upper, x >= upper)
    p = ep.where(indices, o, p)

    assert not ep.any(ep.isnan(p))

    # the refinement must not change the adversarial status; if it does,
    # the configured threshold disagrees with the model's binarization
    is_adv1 = is_adversarial(x)
    is_adv2 = is_adversarial(p)
    if (is_adv1 != is_adv2).any():
        raise ValueError(
            "The specified threshold does not match what is done by the model."
        )
    return restore_type(p)
def __call__(
    self,
    inputs,
    labels,
    *,
    adversarials,
    criterion,
    threshold=None,
    included_in="upper",
):
    """For models that preprocess their inputs by binarizing the
    inputs, this attack can improve adversarials found by other
    attacks. It does this by utilizing information about the
    binarization and mapping values to the corresponding value in
    the clean input or to the right side of the threshold.

    Parameters
    ----------
    threshold : float
        The treshold used by the models binarization. If none,
        defaults to (model.bounds()[1] - model.bounds()[0]) / 2.
    included_in : str
        Whether the threshold value itself belongs to the lower or
        upper interval.
    """
    originals = ep.astensor(inputs)
    labels = ep.astensor(labels)

    def is_adversarial(p: ep.Tensor) -> ep.Tensor:
        """For each input in x, returns true if it is an adversarial for
        the given model and criterion"""
        logits = ep.astensor(self.model.forward(p.tensor))
        return criterion(originals, labels, p, logits)

    # o = clean originals, x = adversarial starting points
    o = ep.astensor(inputs)
    x = ep.astensor(adversarials)

    # default threshold: midpoint of the model's input bounds
    min_, max_ = self.model.bounds()
    if threshold is None:
        threshold = (min_ + max_) / 2.0

    assert o.dtype == x.dtype

    # concrete numpy float type, needed for exact one-ulp arithmetic below
    dtype = o.dtype
    if dtype == o.backend.float16:
        nptype = np.float16
    elif dtype == o.backend.float32:
        nptype = np.float32
    elif dtype == o.backend.float64:
        nptype = np.float64
    else:
        # BUG FIX: missing f-prefix meant the literal text '{dtype}' was
        # emitted instead of the actual dtype
        raise ValueError(
            f"expected dtype to be float16, float32 or float64, found '{dtype}'"
        )

    threshold = nptype(threshold)
    offset = nptype(1.0)

    # lower is the largest value in the lower interval, upper the
    # smallest value in the upper interval; they differ by one ulp
    if included_in == "lower":
        lower = threshold
        upper = np.nextafter(threshold, threshold + offset)
    elif included_in == "upper":
        lower = np.nextafter(threshold, threshold - offset)
        upper = threshold
    else:
        # BUG FIX: missing f-prefix meant the literal text '{included_in}'
        # was emitted instead of the actual argument
        raise ValueError(
            f"expected included_in to be 'lower' or 'upper', found '{included_in}'"
        )

    assert lower < upper

    # start from NaN so the assertion below catches any unhandled pixel
    p = ep.full_like(o, ep.nan)

    lower = ep.ones_like(o) * lower
    upper = ep.ones_like(o) * upper

    # same side as the clean pixel -> restore the clean value
    indices = ep.logical_and(o <= lower, x <= lower)
    p = ep.where(indices, o, p)

    # opposite side -> move just across the threshold
    indices = ep.logical_and(o <= lower, x >= upper)
    p = ep.where(indices, upper, p)

    indices = ep.logical_and(o >= upper, x <= lower)
    p = ep.where(indices, lower, p)

    indices = ep.logical_and(o >= upper, x >= upper)
    p = ep.where(indices, o, p)

    assert not ep.any(ep.isnan(p))

    # the refinement must not change the adversarial status
    is_adv1 = is_adversarial(x)
    is_adv2 = is_adversarial(p)
    assert (
        is_adv1 == is_adv2
    ).all(), "The specified threshold does not match what is done by the model."

    return p.tensor
def run(
    self,
    model: Model,
    inputs: T,
    criterion: Union[Misclassification, TargetedMisclassification, T],
    *,
    starting_points: Optional[ep.Tensor] = None,
    early_stop: Optional[float] = None,
    **kwargs: Any,
) -> T:
    """Run a minimum-norm attack: follow the loss gradient while
    shrinking the per-sample norm budget epsilon whenever the current
    point is adversarial and growing it otherwise, tracking the
    smallest adversarial perturbation found.

    Parameters
    ----------
    model : the model to attack
    inputs : clean inputs x0
    criterion : Misclassification (untargeted) or
        TargetedMisclassification (targeted)
    starting_points : optional adversarial starting points; if absent
        and ``self.init_attack`` is set, that attack provides them
    early_stop : accepted for interface compatibility; not used below
    """
    raise_if_kwargs(kwargs)
    criterion_ = get_criterion(criterion)

    if isinstance(criterion_, Misclassification):
        targeted = False
        classes = criterion_.labels
    elif isinstance(criterion_, TargetedMisclassification):
        targeted = True
        classes = criterion_.target_classes
    else:
        raise ValueError("unsupported criterion")

    def loss_fn(
        inputs: ep.Tensor, labels: ep.Tensor
    ) -> Tuple[ep.Tensor, Tuple[ep.Tensor, ep.Tensor]]:
        # logit margin between the class to suppress and the class to
        # promote; NOTE: `rows` is a closure over the variable assigned
        # further below (it is bound before the first call)
        logits = model(inputs)

        if targeted:
            c_minimize = best_other_classes(logits, labels)
            c_maximize = labels  # target_classes
        else:
            c_minimize = labels  # labels
            c_maximize = best_other_classes(logits, labels)

        loss = logits[rows, c_minimize] - logits[rows, c_maximize]

        return -loss.sum(), (logits, loss)

    x, restore_type = ep.astensor_(inputs)
    del inputs, criterion, kwargs
    N = len(x)

    # start from initialization points/attack
    if starting_points is not None:
        x1 = starting_points
    else:
        if self.init_attack is not None:
            x1 = self.init_attack.run(model, x, criterion_)
        else:
            x1 = None

    # if initial points or initialization attacks are provided,
    # search for the boundary
    if x1 is not None:
        is_adv = get_is_adversarial(criterion_, model)
        assert is_adv(x1).all()
        # binary search along the path between x and x1 for the smallest
        # interpolation factor that is still adversarial
        lower_bound = ep.zeros(x, shape=(N, ))
        upper_bound = ep.ones(x, shape=(N, ))
        for _ in range(self.binary_search_steps):
            epsilons = (lower_bound + upper_bound) / 2
            mid_points = self.mid_points(x, x1, epsilons, model.bounds)
            is_advs = is_adv(mid_points)
            lower_bound = ep.where(is_advs, lower_bound, epsilons)
            upper_bound = ep.where(is_advs, epsilons, upper_bound)
        starting_points = self.mid_points(x, x1, upper_bound, model.bounds)
        delta = starting_points - x
    else:
        # start from x0
        delta = ep.zeros_like(x)

    if classes.shape != (N, ):
        name = "target_classes" if targeted else "labels"
        raise ValueError(
            f"expected {name} to have shape ({N},), got {classes.shape}")

    min_, max_ = model.bounds

    rows = range(N)
    grad_and_logits = ep.value_and_grad_fn(x, loss_fn, has_aux=True)

    # initial per-sample norm budget; for p == 0 the budget is a feature
    # count (taken from the starting delta when one exists)
    if self.p != 0:
        epsilon = ep.inf * ep.ones(x, len(x))
    else:
        epsilon = ep.ones(x, len(x)) if x1 is None \
            else ep.norms.l0(flatten(delta), axis=-1)

    # largest perturbation norm possible within the input bounds
    if self.p != 0:
        worst_norm = ep.norms.lp(flatten(ep.maximum(x - min_, max_ - x)),
                                 p=self.p,
                                 axis=-1)
    else:
        worst_norm = flatten(ep.ones_like(x)).bool().sum(axis=1).float32()

    best_lp = worst_norm
    best_delta = delta
    adv_found = ep.zeros(x, len(x)).bool()

    for i in range(self.steps):
        # perform cosine annealing of learning rates
        stepsize = (self.min_stepsize +
                    (self.max_stepsize - self.min_stepsize) *
                    (1 + math.cos(math.pi * i / self.steps)) / 2)
        gamma = (0.001 + (self.gamma - 0.001) *
                 (1 + math.cos(math.pi * (i / self.steps))) / 2)

        x_adv = x + delta

        loss, (logits, loss_batch), gradients = grad_and_logits(x_adv, classes)
        is_adversarial = criterion_(x_adv, logits)

        lp = ep.norms.lp(flatten(delta), p=self.p, axis=-1)
        is_smaller = lp <= best_lp
        is_both = ep.logical_and(is_adversarial, is_smaller)
        adv_found = ep.logical_or(adv_found, is_adversarial)
        # remember the smallest adversarial perturbation seen so far
        best_lp = ep.where(is_both, lp, best_lp)
        best_delta = ep.where(atleast_kd(is_both, x.ndim), delta, best_delta)

        # update epsilon
        if self.p != 0:
            # first-order estimate of the distance to the decision
            # boundary (dual norm of the gradient)
            distance_to_boundary = abs(loss_batch) / ep.norms.lp(
                flatten(gradients), p=self.dual, axis=-1)
            epsilon = ep.where(
                is_adversarial,
                ep.minimum(
                    epsilon * (1 - gamma),
                    ep.norms.lp(flatten(best_delta), p=self.p, axis=-1)),
                ep.where(
                    adv_found, epsilon * (1 + gamma),
                    ep.norms.lp(flatten(delta), p=self.p, axis=-1) +
                    distance_to_boundary))
        else:
            # p == 0: epsilon is an integer feature count; shrink/grow by
            # at least one feature per step and keep it non-negative
            epsilon = ep.where(
                is_adversarial,
                ep.minimum(
                    ep.minimum(epsilon - 1,
                               (epsilon * (1 - gamma)).astype(int).astype(
                                   epsilon.dtype)),
                    ep.norms.lp(flatten(best_delta), p=self.p, axis=-1)),
                ep.maximum(epsilon + 1,
                           (epsilon * (1 + gamma)).astype(int).astype(
                               epsilon.dtype)))
            epsilon = ep.maximum(0, epsilon).astype(epsilon.dtype)

        # clip epsilon
        epsilon = ep.minimum(epsilon, worst_norm)

        # computes normalized gradient update
        grad_ = self.normalize(gradients, x=x, bounds=model.bounds) * stepsize

        # do step
        delta = delta + grad_

        # project according to the given norm
        delta = self.project(x=x + delta, x0=x, epsilon=epsilon) - x

        # clip to valid bounds
        delta = ep.clip(x + delta, *model.bounds) - x

    x_adv = x + best_delta

    return restore_type(x_adv)