def __init__(self,
             predict_fn: Union[Callable, tf.keras.Model, 'keras.Model'],
             shape: Tuple[int, ...],
             distance_fn: str = 'l1',
             target_proba: float = 1.0,
             target_class: Union[str, int] = 'other',
             max_iter: int = 1000,
             early_stop: int = 50,
             lam_init: float = 1e-1,
             max_lam_steps: int = 10,
             tol: float = 0.05,
             learning_rate_init: float = 0.1,
             feature_range: Union[Tuple, str] = (-1e10, 1e10),
             eps: Union[float, np.ndarray] = 0.01,  # feature-wise epsilons
             init: str = 'identity',
             decay: bool = True,
             write_dir: str = None,
             debug: bool = False,
             sess: tf.Session = None) -> None:
    """
    Initialize counterfactual explanation method based on Wachter et al. (2017)

    Parameters
    ----------
    predict_fn
        Keras or TensorFlow model or any other model's prediction function returning class probabilities
    shape
        Shape of input data starting with batch size
    distance_fn
        Distance function to use in the loss term
    target_proba
        Target probability for the counterfactual to reach
    target_class
        Target class for the counterfactual to reach, one of 'other', 'same' or an integer denoting
        desired class membership for the counterfactual instance
    max_iter
        Maximum number of iterations to run the gradient descent for (inner loop)
    early_stop
        Number of steps after which to terminate gradient descent if all or none of found instances are solutions
    lam_init
        Initial regularization constant for the prediction part of the Wachter loss
    max_lam_steps
        Maximum number of times to adjust the regularization constant (outer loop) before terminating the search
    tol
        Tolerance for the counterfactual target probability
    learning_rate_init
        Initial learning rate for each outer loop of lambda
    feature_range
        Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
        numpy arrays with dimension (1 x nb of features) for feature-wise ranges
    eps
        Gradient step sizes used in calculating numerical gradients, defaults to a single value for all
        features, but can be passed an array for feature-wise step sizes
    init
        Initialization method for the search of counterfactuals, currently must be 'identity'
    decay
        Flag to decay learning rate to zero for each outer loop over lambda
    write_dir
        Directory to write TensorBoard files to
    debug
        Flag to write TensorBoard summaries for debugging
    sess
        Optional TensorFlow session that will be used if passed instead of creating or inferring one internally
    """
    super().__init__(meta=copy.deepcopy(DEFAULT_META_CF))
    # get params for storage in meta
    params = locals()
    remove = ['self', 'predict_fn', 'sess', '__class__']
    for key in remove:
        params.pop(key)
    self.meta['params'].update(params)

    self.data_shape = shape
    self.batch_size = shape[0]
    self.target_class = target_class

    # options for the optimizer
    self.max_iter = max_iter
    self.lam_init = lam_init
    self.tol = tol
    self.max_lam_steps = max_lam_steps
    self.early_stop = early_stop

    self.eps = eps
    self.init = init
    self.feature_range = feature_range
    self.target_proba_arr = target_proba * np.ones(self.batch_size)

    self.debug = debug

    # check if the passed object is a model and get session
    is_model, is_keras, model_sess = _check_keras_or_tf(predict_fn)
    self.meta['params'].update(is_model=is_model, is_keras=is_keras)

    # if session provided, use it
    if isinstance(sess, tf.Session):
        self.sess = sess
    else:
        self.sess = model_sess

    if is_model:  # Keras or TF model
        self.model = True
        self.predict_fn = predict_fn.predict  # type: ignore # array function
        self.predict_tn = predict_fn  # tensor function
    else:  # black-box model
        self.predict_fn = predict_fn
        self.predict_tn = None
        self.model = False
    self.n_classes = self.predict_fn(np.zeros(shape)).shape[1]

    # flag to keep track if explainer is fit or not
    self.fitted = False

    # set up graph session for optimization (counterfactual search)
    with tf.variable_scope('cf_search', reuse=tf.AUTO_REUSE):

        # define variables for original and candidate counterfactual instances, target labels and lambda
        self.orig = tf.get_variable('original', shape=shape, dtype=tf.float32)
        self.cf = tf.get_variable('counterfactual', shape=shape, dtype=tf.float32,
                                  constraint=lambda x: tf.clip_by_value(x, feature_range[0], feature_range[1]))
        # the following will be a 1-hot encoding of the target class (as predicted by the model)
        self.target = tf.get_variable('target', shape=(self.batch_size, self.n_classes), dtype=tf.float32)

        # constant target probability and global step variable
        self.target_proba = tf.constant(target_proba * np.ones(self.batch_size), dtype=tf.float32,
                                        name='target_proba')
        self.global_step = tf.Variable(0.0, trainable=False, name='global_step')

        # lambda hyperparameter - placeholder instead of variable as annealed in first epoch
        self.lam = tf.placeholder(tf.float32, shape=(self.batch_size,), name='lam')

        # define placeholders that will be assigned to relevant variables
        self.assign_orig = tf.placeholder(tf.float32, shape, name='assign_orig')
        self.assign_cf = tf.placeholder(tf.float32, shape, name='assign_cf')
        self.assign_target = tf.placeholder(tf.float32, shape=(self.batch_size, self.n_classes),
                                            name='assign_target')

        # L1 distance and MAD constants
        # TODO: MADs?
        ax_sum = list(np.arange(1, len(self.data_shape)))
        if distance_fn == 'l1':
            self.dist = tf.reduce_sum(tf.abs(self.cf - self.orig), axis=ax_sum, name='l1')
        else:
            logger.exception('Distance metric %s not supported', distance_fn)
            raise ValueError

        # distance loss
        self.loss_dist = self.lam * self.dist

        # prediction loss
        if not self.model:
            # will need to calculate gradients numerically
            self.loss_opt = self.loss_dist
        else:
            # autograd gradients throughout
            self.pred_proba = self.predict_tn(self.cf)

            # 3 cases for target_class
            if target_class == 'same':
                self.pred_proba_class = tf.reduce_max(self.target * self.pred_proba, 1)
            elif target_class == 'other':
                self.pred_proba_class = tf.reduce_max((1 - self.target) * self.pred_proba, 1)
            elif target_class in range(self.n_classes):
                # if class is specified, this is known in advance
                self.pred_proba_class = tf.reduce_max(tf.one_hot(target_class, self.n_classes,
                                                                 dtype=tf.float32) * self.pred_proba, 1)
            else:
                logger.exception('Target class %s unknown', target_class)
                raise ValueError

            self.loss_pred = tf.square(self.pred_proba_class - self.target_proba)

            self.loss_opt = self.loss_pred + self.loss_dist

        # optimizer
        if decay:
            self.learning_rate = tf.train.polynomial_decay(learning_rate_init, self.global_step,
                                                           self.max_iter, 0.0, power=1.0)
        else:
            self.learning_rate = tf.convert_to_tensor(learning_rate_init)

        # TODO optional argument to change type, learning rate scheduler
        opt = tf.train.AdamOptimizer(self.learning_rate)

        # first compute gradients, then apply them
        self.compute_grads = opt.compute_gradients(self.loss_opt, var_list=[self.cf])
        self.grad_ph = tf.placeholder(shape=shape, dtype=tf.float32, name='grad_cf')
        grad_and_var = [(self.grad_ph, self.cf)]
        self.apply_grads = opt.apply_gradients(grad_and_var, global_step=self.global_step)

    # variables to initialize
    self.setup = []  # type: list
    self.setup.append(self.orig.assign(self.assign_orig))
    self.setup.append(self.cf.assign(self.assign_cf))
    self.setup.append(self.target.assign(self.assign_target))

    self.tf_init = tf.variables_initializer(var_list=tf.global_variables(scope='cf_search'))

    # tensorboard
    if write_dir is not None:
        self.writer = tf.summary.FileWriter(write_dir, tf.get_default_graph())
        self.writer.add_graph(tf.get_default_graph())

    # return templates
    self.instance_dict = dict.fromkeys(['X', 'distance', 'lambda', 'index', 'class', 'proba', 'loss'])
    self.return_dict = copy.deepcopy(DEFAULT_DATA_CF)
    self.return_dict['all'] = {i: [] for i in range(self.max_lam_steps)}
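# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the class above). It assumes the
# __init__ above belongs to the counterfactual explainer class exported by this
# package (imported here as `Counterfactual` from `alibi.explainers`) and that
# an `explain` method exists; the model path and data shapes are hypothetical.
# ---------------------------------------------------------------------------
import numpy as np
import tensorflow as tf


def _counterfactual_usage_sketch():
    model = tf.keras.models.load_model('classifier.h5')  # hypothetical trained classifier
    x = np.zeros((1, 28, 28, 1), dtype=np.float32)       # single instance, batch size first

    from alibi.explainers import Counterfactual          # assumed import path/name

    cf = Counterfactual(model,
                        shape=x.shape,
                        target_proba=0.9,      # search stops once p(target) is within tol of 0.9
                        target_class='other',  # any class other than the original prediction
                        max_iter=1000,         # inner gradient-descent loop
                        lam_init=1e-1,         # initial weight of the distance loss term
                        max_lam_steps=10)      # outer loop adjusting lambda
    explanation = cf.explain(x)                # `explain` assumed from the library's public API
    return explanation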
def __init__(self,
             predict: Union[Callable, tf.keras.Model, 'keras.Model'],
             shape: tuple,
             kappa: float = 0.,
             beta: float = .1,
             feature_range: tuple = (-1e10, 1e10),
             gamma: float = 0.,
             ae_model: Union[tf.keras.Model, 'keras.Model'] = None,
             enc_model: Union[tf.keras.Model, 'keras.Model'] = None,
             theta: float = 0.,
             use_kdtree: bool = False,
             learning_rate_init: float = 1e-2,
             max_iterations: int = 1000,
             c_init: float = 10.,
             c_steps: int = 10,
             eps: tuple = (1e-3, 1e-3),
             clip: tuple = (-1000., 1000.),
             update_num_grad: int = 1,
             write_dir: str = None,
             sess: tf.compat.v1.Session = None) -> None:
    """
    Initialize prototypical counterfactual method.

    Parameters
    ----------
    predict
        Keras or TensorFlow model or any other model's prediction function returning class probabilities
    shape
        Shape of input data starting with batch size
    kappa
        Confidence parameter for the attack loss term
    beta
        Regularization constant for L1 loss term
    feature_range
        Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
        numpy arrays with dimension (1 x nb of features) for feature-wise ranges
    gamma
        Regularization constant for optional auto-encoder loss term
    ae_model
        Optional auto-encoder model used for loss regularization
    enc_model
        Optional encoder model used to guide instance perturbations towards a class prototype
    theta
        Constant for the prototype search loss term
    use_kdtree
        Whether to use k-d trees for the prototype loss term if no encoder is available
    learning_rate_init
        Initial learning rate of optimizer
    max_iterations
        Maximum number of iterations for finding a counterfactual
    c_init
        Initial value to scale the attack loss term
    c_steps
        Number of iterations to adjust the constant scaling the attack loss term
    eps
        If numerical gradients are used to compute dL/dx = (dL/dp) * (dp/dx), then eps[0] is used to
        calculate dL/dp and eps[1] is used for dp/dx. eps[0] and eps[1] can be a combination of float values
        and numpy arrays. For eps[0], the array dimension should be (1 x nb of prediction categories) and
        for eps[1] it should be (1 x nb of features)
    clip
        Tuple with min and max clip ranges for both the numerical gradients and the gradients
        obtained from the TensorFlow graph
    update_num_grad
        If numerical gradients are used, they will be updated every update_num_grad iterations
    write_dir
        Directory to write TensorBoard files to
    sess
        Optional TensorFlow session that will be used if passed instead of creating or inferring one internally
    """
    self.predict = predict

    # check whether the model, encoder and auto-encoder are Keras or TF models and get session
    is_model, is_model_keras, model_sess = _check_keras_or_tf(predict)
    is_ae, is_ae_keras, ae_sess = _check_keras_or_tf(ae_model)
    is_enc, is_enc_keras, enc_sess = _check_keras_or_tf(enc_model)
    # TODO: check ae, enc and model are all compatible

    # if session provided, use it
    if isinstance(sess, tf.compat.v1.Session):
        self.sess = sess
    else:
        self.sess = model_sess

    if is_model:  # Keras or TF model
        self.model = True
        self.classes = self.sess.run(self.predict(tf.convert_to_tensor(np.zeros(shape),
                                                                       dtype=tf.float32))).shape[1]
    else:  # black-box model
        self.model = False
        self.classes = self.predict(np.zeros(shape)).shape[1]

    if is_enc:
        self.enc_model = True
    else:
        self.enc_model = False

    if is_ae:
        self.ae_model = True
    else:
        self.ae_model = False

    if use_kdtree and self.enc_model:
        logger.warning('Both an encoder and k-d trees enabled. Using the encoder for the prototype loss term.')

    if use_kdtree or self.enc_model:
        self.enc_or_kdtree = True
    else:
        self.enc_or_kdtree = False

    self.shape = shape
    self.kappa = kappa
    self.beta = beta
    self.gamma = gamma
    self.theta = theta
    self.ae = ae_model
    self.enc = enc_model
    self.use_kdtree = use_kdtree
    self.batch_size = shape[0]
    self.max_iterations = max_iterations
    self.c_init = c_init
    self.c_steps = c_steps
    self.update_num_grad = update_num_grad
    self.eps = eps
    self.clip = clip
    self.write_dir = write_dir

    # define tf variables for original and perturbed instances, and target labels
    self.orig = tf.Variable(np.zeros(shape), dtype=tf.float32, name='orig')
    self.adv = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv')
    self.adv_s = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv_s')
    self.target = tf.Variable(np.zeros((self.batch_size, self.classes)), dtype=tf.float32, name='target')

    # variable for target class proto
    if self.enc_model:
        self.shape_enc = self.enc.predict(np.zeros(shape)).shape
    else:
        self.shape_enc = shape

    self.target_proto = tf.Variable(np.zeros(self.shape_enc), dtype=tf.float32, name='target_proto')

    # define tf variable for constant used in FISTA optimization
    self.const = tf.Variable(np.zeros(self.batch_size), dtype=tf.float32, name='const')
    self.global_step = tf.Variable(0.0, trainable=False, name='global_step')

    # define placeholders that will be assigned to relevant variables
    self.assign_orig = tf.placeholder(tf.float32, shape, name='assign_orig')
    self.assign_adv = tf.placeholder(tf.float32, shape, name='assign_adv')
    self.assign_adv_s = tf.placeholder(tf.float32, shape, name='assign_adv_s')
    self.assign_target = tf.placeholder(tf.float32, (self.batch_size, self.classes), name='assign_target')
    self.assign_const = tf.placeholder(tf.float32, [self.batch_size], name='assign_const')
    self.assign_target_proto = tf.placeholder(tf.float32, self.shape_enc, name='assign_target_proto')

    # define conditions and values for element-wise shrinkage thresholding
    with tf.name_scope('shrinkage_thresholding') as scope:
        cond = [tf.cast(tf.greater(tf.subtract(self.adv_s, self.orig), self.beta), tf.float32),
                tf.cast(tf.less_equal(tf.abs(tf.subtract(self.adv_s, self.orig)), self.beta), tf.float32),
                tf.cast(tf.less(tf.subtract(self.adv_s, self.orig), tf.negative(self.beta)), tf.float32)]
        upper = tf.minimum(tf.subtract(self.adv_s, self.beta), tf.cast(feature_range[1], tf.float32))
        lower = tf.maximum(tf.add(self.adv_s, self.beta), tf.cast(feature_range[0], tf.float32))
        self.assign_adv = tf.multiply(cond[0], upper) + tf.multiply(cond[1], self.orig) + tf.multiply(cond[2], lower)

    # perturbation update and vector projection on correct feature range set
    with tf.name_scope('perturbation_y') as scope:
        self.zt = tf.divide(self.global_step, self.global_step + tf.cast(3, tf.float32))
        self.assign_adv_s = self.assign_adv + tf.multiply(self.zt, self.assign_adv - self.adv)
        # map to feature space
        self.assign_adv_s = tf.minimum(self.assign_adv_s, tf.cast(feature_range[1], tf.float32))
        self.assign_adv_s = tf.maximum(self.assign_adv_s, tf.cast(feature_range[0], tf.float32))

    # assign counterfactual of step k+1 to k
    with tf.name_scope('update_adv') as scope:
        self.adv_updater = tf.assign(self.adv, self.assign_adv)
        self.adv_updater_s = tf.assign(self.adv_s, self.assign_adv_s)

    # from perturbed instance, derive deviation delta
    with tf.name_scope('update_delta') as scope:
        self.delta = self.orig - self.adv
        self.delta_s = self.orig - self.adv_s

    # define L1 and L2 loss terms; L1+L2 is later used as an optimization constraint for FISTA
    ax_sum = list(np.arange(1, len(shape)))
    with tf.name_scope('loss_l1_l2') as scope:
        self.l2 = tf.reduce_sum(tf.square(self.delta), axis=ax_sum)
        self.l2_s = tf.reduce_sum(tf.square(self.delta_s), axis=ax_sum)
        self.l1 = tf.reduce_sum(tf.abs(self.delta), axis=ax_sum)
        self.l1_s = tf.reduce_sum(tf.abs(self.delta_s), axis=ax_sum)
        self.l1_l2 = self.l2 + tf.multiply(self.l1, self.beta)
        self.l1_l2_s = self.l2_s + tf.multiply(self.l1_s, self.beta)
        # sum losses
        self.loss_l1 = tf.reduce_sum(self.l1)
        self.loss_l1_s = tf.reduce_sum(self.l1_s)
        self.loss_l2 = tf.reduce_sum(self.l2)
        self.loss_l2_s = tf.reduce_sum(self.l2_s)

    with tf.name_scope('loss_ae') as scope:
        # gamma * AE loss
        if self.ae_model:
            self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.adv) - self.adv))
            self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.adv_s) - self.adv_s))
        else:  # no auto-encoder available
            self.loss_ae = tf.constant(0.)
            self.loss_ae_s = tf.constant(0.)

    with tf.name_scope('loss_attack') as scope:
        if not self.model:
            self.loss_attack = tf.placeholder(tf.float32)
        elif self.c_init == 0. and self.c_steps == 1:  # prediction loss term not used
            # make predictions on perturbed instance
            self.pred_proba = self.predict(self.adv)
            self.pred_proba_s = self.predict(self.adv_s)
            self.loss_attack = tf.constant(0.)
            self.loss_attack_s = tf.constant(0.)
        else:
            # make predictions on perturbed instance
            self.pred_proba = self.predict(self.adv)
            self.pred_proba_s = self.predict(self.adv_s)

            # probability of target label prediction
            self.target_proba = tf.reduce_sum(self.target * self.pred_proba, 1)
            target_proba_s = tf.reduce_sum(self.target * self.pred_proba_s, 1)

            # max probability of non target label prediction
            self.nontarget_proba_max = tf.reduce_max((1 - self.target) * self.pred_proba - (self.target * 10000), 1)
            nontarget_proba_max_s = tf.reduce_max((1 - self.target) * self.pred_proba_s - (self.target * 10000), 1)

            # loss term f(x,d)
            loss_attack = tf.maximum(0.0, -self.nontarget_proba_max + self.target_proba + self.kappa)
            loss_attack_s = tf.maximum(0.0, -nontarget_proba_max_s + target_proba_s + self.kappa)

            # c * f(x,d)
            self.loss_attack = tf.reduce_sum(self.const * loss_attack)
            self.loss_attack_s = tf.reduce_sum(self.const * loss_attack_s)

    with tf.name_scope('loss_prototype') as scope:
        if self.enc_model:
            self.loss_proto = self.theta * tf.square(tf.norm(self.enc(self.adv) - self.target_proto))
            self.loss_proto_s = self.theta * tf.square(tf.norm(self.enc(self.adv_s) - self.target_proto))
        elif self.use_kdtree:
            self.loss_proto = self.theta * tf.square(tf.norm(self.adv - self.target_proto))
            self.loss_proto_s = self.theta * tf.square(tf.norm(self.adv_s - self.target_proto))
        else:  # no encoder available and no k-d trees used
            self.loss_proto = tf.constant(0.)
            self.loss_proto_s = tf.constant(0.)
    with tf.name_scope('loss_combined') as scope:
        # no need for L1 term in loss to optimize when using FISTA
        if self.model:
            self.loss_opt = self.loss_attack_s + self.loss_l2_s + self.loss_ae_s + self.loss_proto_s
        else:  # separate numerical computation of loss attack gradient
            self.loss_opt = self.loss_l2_s + self.loss_ae_s + self.loss_proto_s

        # add L1 term to overall loss; this is not the loss that will be directly optimized
        self.loss_total = (self.loss_attack + self.loss_l2 + self.loss_ae +
                           tf.multiply(self.beta, self.loss_l1) + self.loss_proto)

    with tf.name_scope('training') as scope:
        self.learning_rate = tf.train.polynomial_decay(learning_rate_init, self.global_step,
                                                       self.max_iterations, 0, power=0.5)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        start_vars = set(x.name for x in tf.global_variables())

        # first compute, then apply grads
        self.compute_grads = optimizer.compute_gradients(self.loss_opt, var_list=[self.adv_s])
        self.grad_ph = tf.placeholder(tf.float32, name='grad_adv_s')
        var = [tvar for tvar in tf.trainable_variables() if tvar.name.startswith('adv_s')][-1]  # get the last in
        # case explainer is re-initialized and a new graph is created
        grad_and_var = [(self.grad_ph, var)]
        self.apply_grads = optimizer.apply_gradients(grad_and_var, global_step=self.global_step)
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

    # variables to initialize
    self.setup = []  # type: list
    self.setup.append(self.orig.assign(self.assign_orig))
    self.setup.append(self.target.assign(self.assign_target))
    self.setup.append(self.const.assign(self.assign_const))
    self.setup.append(self.adv.assign(self.assign_adv))
    self.setup.append(self.adv_s.assign(self.assign_adv_s))
    self.setup.append(self.target_proto.assign(self.assign_target_proto))

    self.init = tf.variables_initializer(var_list=[self.global_step] + [self.adv_s] + [self.adv] + new_vars)

    if self.write_dir is not None:
        self.writer = tf.summary.FileWriter(write_dir, tf.get_default_graph())
        self.writer.add_graph(tf.get_default_graph())
    else:
        self.writer = None
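# ---------------------------------------------------------------------------
# Usage sketch (illustrative only). Assumes the __init__ above belongs to the
# prototypical counterfactual class (imported here as `CounterfactualProto`
# from `alibi.explainers`) with `fit`/`explain` methods; model paths and data
# are hypothetical placeholders.
# ---------------------------------------------------------------------------
import numpy as np
import tensorflow as tf


def _cfproto_usage_sketch():
    model = tf.keras.models.load_model('classifier.h5')  # hypothetical classifier
    enc = tf.keras.models.load_model('encoder.h5')       # hypothetical encoder for the prototype loss
    X_train = np.random.rand(100, 28, 28, 1).astype(np.float32)

    from alibi.explainers import CounterfactualProto     # assumed import path/name

    cf = CounterfactualProto(model,
                             shape=(1, 28, 28, 1),
                             beta=0.1,           # L1 weight in the elastic-net regularizer
                             enc_model=enc,      # guide perturbations towards a class prototype
                             theta=10.,          # weight of the prototype loss term
                             max_iterations=500,
                             c_init=1.,
                             c_steps=5)
    cf.fit(X_train)                              # assumed to build prototypes (or k-d trees)
    explanation = cf.explain(X_train[0:1])
    return explanation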
def __init__(self,
             predict: Union[Callable, tf.keras.Model, 'keras.Model'],
             mode: str,
             shape: tuple,
             kappa: float = 0.,
             beta: float = .1,
             feature_range: tuple = (-1e10, 1e10),
             gamma: float = 0.,
             ae_model: Union[tf.keras.Model, 'keras.Model'] = None,
             learning_rate_init: float = 1e-2,
             max_iterations: int = 1000,
             c_init: float = 10.,
             c_steps: int = 10,
             eps: tuple = (1e-3, 1e-3),
             clip: tuple = (-100., 100.),
             update_num_grad: int = 1,
             no_info_val: Union[float, np.ndarray] = None,
             write_dir: str = None,
             sess: tf.Session = None) -> None:
    """
    Initialize contrastive explanation method.
    Paper: https://arxiv.org/abs/1802.07623

    Parameters
    ----------
    predict
        Keras or TensorFlow model or any other model's prediction function returning class probabilities
    mode
        Find pertinent negatives ('PN') or pertinent positives ('PP')
    shape
        Shape of input data starting with batch size
    kappa
        Confidence parameter for the attack loss term
    beta
        Regularization constant for L1 loss term
    feature_range
        Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
        numpy arrays with dimension (1 x nb of features) for feature-wise ranges
    gamma
        Regularization constant for optional auto-encoder loss term
    ae_model
        Optional auto-encoder model used for loss regularization
    learning_rate_init
        Initial learning rate of optimizer
    max_iterations
        Maximum number of iterations for finding a PN or PP
    c_init
        Initial value to scale the attack loss term
    c_steps
        Number of iterations to adjust the constant scaling the attack loss term
    eps
        If numerical gradients are used to compute dL/dx = (dL/dp) * (dp/dx), then eps[0] is used to
        calculate dL/dp and eps[1] is used for dp/dx. eps[0] and eps[1] can be a combination of float values
        and numpy arrays. For eps[0], the array dimension should be (1 x nb of prediction categories) and
        for eps[1] it should be (1 x nb of features)
    clip
        Tuple with min and max clip ranges for both the numerical gradients and the gradients
        obtained from the TensorFlow graph
    update_num_grad
        If numerical gradients are used, they will be updated every update_num_grad iterations
    no_info_val
        Global or feature-wise value considered as containing no information
    write_dir
        Directory to write TensorBoard files to
    sess
        Optional TensorFlow session that will be used if passed instead of creating or inferring one internally
    """
    super().__init__(meta=copy.deepcopy(DEFAULT_META_CEM))
    # get params for storage in meta
    params = locals()
    remove = ['self', 'predict', 'ae_model', 'sess', '__class__']
    for key in remove:
        params.pop(key)
    self.meta['params'].update(params)

    self.predict = predict

    # check whether the model and the auto-encoder are Keras or TF models and get session
    is_model, is_model_keras, model_sess = _check_keras_or_tf(predict)
    is_ae, is_ae_keras, ae_sess = _check_keras_or_tf(ae_model)
    # TODO: check ae and model are compatible
    self.meta['params'].update(is_model=is_model, is_model_keras=is_model_keras,
                               is_ae=is_ae, is_ae_keras=is_ae_keras)

    # if session provided, use it
    if isinstance(sess, tf.Session):
        self.sess = sess
    else:
        self.sess = model_sess

    if is_model:  # Keras or TF model
        self.model = True
        classes = self.sess.run(self.predict(tf.convert_to_tensor(np.zeros(shape), dtype=tf.float32))).shape[1]
    else:  # black-box model
        self.model = False
        classes = self.predict(np.zeros(shape)).shape[1]

    self.mode = mode
    self.shape = shape
    self.kappa = kappa
    self.beta = beta
    self.gamma = gamma
    self.ae = ae_model
    self.batch_size = shape[0]
    self.max_iterations = max_iterations
    self.c_init = c_init
    self.c_steps = c_steps
    self.update_num_grad = update_num_grad
    self.eps = eps
    self.clip = clip
    self.write_dir = write_dir
    if isinstance(no_info_val, float):
        self.no_info_val = np.ones(shape) * no_info_val
    else:
        self.no_info_val = no_info_val  # values regarded as containing no information

    # PNs will deviate away from these values while PPs will gravitate towards them
    self.no_info = tf.Variable(np.zeros(shape), dtype=tf.float32, name='no_info')

    # define tf variables for original and perturbed instances, and target labels
    self.orig = tf.Variable(np.zeros(shape), dtype=tf.float32, name='orig')
    self.adv = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv')  # delta(k)
    self.adv_s = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv_s')  # y(k)
    self.target = tf.Variable(np.zeros((self.batch_size, classes)), dtype=tf.float32, name='target')

    # define tf variable for constant used in FISTA optimization
    self.const = tf.Variable(np.zeros(self.batch_size), dtype=tf.float32, name='const')
    self.global_step = tf.Variable(0.0, trainable=False, name='global_step')

    # define placeholders that will be assigned to relevant variables
    self.assign_orig = tf.placeholder(tf.float32, shape, name='assign_orig')
    self.assign_adv = tf.placeholder(tf.float32, shape, name='assign_adv')
    self.assign_adv_s = tf.placeholder(tf.float32, shape, name='assign_adv_s')
    self.assign_target = tf.placeholder(tf.float32, (self.batch_size, classes), name='assign_target')
    self.assign_const = tf.placeholder(tf.float32, [self.batch_size], name='assign_const')
    self.assign_no_info = tf.placeholder(tf.float32, shape, name='assign_no_info')

    # define conditions and values for element-wise shrinkage thresholding (eq.7)
    with tf.name_scope('shrinkage_thresholding') as scope:
        cond = [tf.cast(tf.greater(tf.subtract(self.adv_s, self.orig), self.beta), tf.float32),
                tf.cast(tf.less_equal(tf.abs(tf.subtract(self.adv_s, self.orig)), self.beta), tf.float32),
                tf.cast(tf.less(tf.subtract(self.adv_s, self.orig), tf.negative(self.beta)), tf.float32)]
        upper = tf.minimum(tf.subtract(self.adv_s, self.beta), tf.cast(feature_range[1], tf.float32))
        lower = tf.maximum(tf.add(self.adv_s, self.beta), tf.cast(feature_range[0], tf.float32))
        self.assign_adv = tf.multiply(cond[0], upper) + tf.multiply(cond[1], self.orig) + tf.multiply(cond[2], lower)

    # perturbation update for delta and vector projection on correct set depending on PP or PN (eq.5)
    # delta(k) = adv; delta(k+1) = assign_adv
    with tf.name_scope('perturbation_delta') as scope:
        proj_d = [tf.cast(tf.greater(tf.abs(tf.subtract(self.assign_adv, self.no_info)),
                                     tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32),
                  tf.cast(tf.less_equal(tf.abs(tf.subtract(self.assign_adv, self.no_info)),
                                        tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32)]
        if self.mode == "PP":
            self.assign_adv = tf.multiply(proj_d[1], self.assign_adv) + tf.multiply(proj_d[0], self.orig)
        elif self.mode == "PN":
            self.assign_adv = tf.multiply(proj_d[0], self.assign_adv) + tf.multiply(proj_d[1], self.orig)

    # perturbation update and vector projection on correct set for y: y(k+1) = assign_adv_s (eq.6)
    with tf.name_scope('perturbation_y') as scope:
        self.zt = tf.divide(self.global_step, self.global_step + tf.cast(3, tf.float32))  # k/(k+3) in (eq.6)
        self.assign_adv_s = self.assign_adv + tf.multiply(self.zt, self.assign_adv - self.adv)
        proj_d_s = [tf.cast(tf.greater(tf.abs(tf.subtract(self.assign_adv_s, self.no_info)),
                                       tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32),
                    tf.cast(tf.less_equal(tf.abs(tf.subtract(self.assign_adv_s, self.no_info)),
                                          tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32)]
        if self.mode == "PP":
            self.assign_adv_s = tf.multiply(proj_d_s[1], self.assign_adv_s) + tf.multiply(proj_d_s[0], self.orig)
        elif self.mode == "PN":
            self.assign_adv_s = tf.multiply(proj_d_s[0], self.assign_adv_s) + tf.multiply(proj_d_s[1], self.orig)

    # delta(k) <- delta(k+1); y(k) <- y(k+1)
    with tf.name_scope('update_adv') as scope:
        self.adv_updater = tf.assign(self.adv, self.assign_adv)
        self.adv_updater_s = tf.assign(self.adv_s, self.assign_adv_s)

    # from perturbed instance, derive deviation delta
    with tf.name_scope('update_delta') as scope:
        self.delta = self.orig - self.adv
        self.delta_s = self.orig - self.adv_s

    # define L1 and L2 loss terms; L1+L2 is later used as an optimization constraint for FISTA
    ax_sum = list(np.arange(1, len(shape)))
    with tf.name_scope('loss_l1_l2') as scope:
        self.l2 = tf.reduce_sum(tf.square(self.delta), axis=ax_sum)
        self.l2_s = tf.reduce_sum(tf.square(self.delta_s), axis=ax_sum)
        self.l1 = tf.reduce_sum(tf.abs(self.delta), axis=ax_sum)
        self.l1_s = tf.reduce_sum(tf.abs(self.delta_s), axis=ax_sum)
        self.l1_l2 = self.l2 + tf.multiply(self.l1, self.beta)
        self.l1_l2_s = self.l2_s + tf.multiply(self.l1_s, self.beta)
        # sum losses
        self.loss_l1 = tf.reduce_sum(self.l1)
        self.loss_l1_s = tf.reduce_sum(self.l1_s)
        self.loss_l2 = tf.reduce_sum(self.l2)
        self.loss_l2_s = tf.reduce_sum(self.l2_s)

    with tf.name_scope('loss_ae') as scope:
        # gamma * AE loss
        if self.mode == "PP" and callable(self.ae):
            self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.delta) - self.delta))
            self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.delta_s) - self.delta_s))
        elif self.mode == "PN" and callable(self.ae):
            self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.adv) - self.adv))
            self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.adv_s) - self.adv_s))
        else:  # no auto-encoder available
            self.loss_ae = tf.constant(0.)
            self.loss_ae_s = tf.constant(0.)
    with tf.name_scope('loss_attack') as scope:
        if not self.model:
            self.loss_attack = tf.placeholder(tf.float32)
        else:
            # make predictions on perturbed instance (PN) or delta (PP)
            if self.mode == "PP":
                self.pred_proba = self.predict(self.delta)
                self.pred_proba_s = self.predict(self.delta_s)
            elif self.mode == "PN":
                self.pred_proba = self.predict(self.adv)
                self.pred_proba_s = self.predict(self.adv_s)

            # probability of target label prediction
            self.target_proba = tf.reduce_sum(self.target * self.pred_proba, 1)
            target_proba_s = tf.reduce_sum(self.target * self.pred_proba_s, 1)

            # max probability of non target label prediction
            self.nontarget_proba_max = tf.reduce_max((1 - self.target) * self.pred_proba - (self.target * 10000), 1)
            nontarget_proba_max_s = tf.reduce_max((1 - self.target) * self.pred_proba_s - (self.target * 10000), 1)

            # loss term f(x,d) for PP (eq.4) and PN (eq.2)
            if self.mode == "PP":
                loss_attack = tf.maximum(0.0, self.nontarget_proba_max - self.target_proba + self.kappa)
                loss_attack_s = tf.maximum(0.0, nontarget_proba_max_s - target_proba_s + self.kappa)
            elif self.mode == "PN":
                loss_attack = tf.maximum(0.0, -self.nontarget_proba_max + self.target_proba + self.kappa)
                loss_attack_s = tf.maximum(0.0, -nontarget_proba_max_s + target_proba_s + self.kappa)

            # c * f(x,d)
            self.loss_attack = tf.reduce_sum(self.const * loss_attack)
            self.loss_attack_s = tf.reduce_sum(self.const * loss_attack_s)

    with tf.name_scope('loss_combined') as scope:
        # no need for L1 term in loss to optimize when using FISTA
        if self.model:
            self.loss_opt = self.loss_attack_s + self.loss_l2_s + self.loss_ae_s
        else:  # separate numerical computation of loss attack gradient
            self.loss_opt = self.loss_l2_s + self.loss_ae_s

        # add L1 term to overall loss; this is not the loss that will be directly optimized
        self.loss_total = self.loss_attack + self.loss_l2 + self.loss_ae + tf.multiply(self.beta, self.loss_l1)

    with tf.name_scope('training') as scope:
        self.learning_rate = tf.train.polynomial_decay(learning_rate_init, self.global_step,
                                                       self.max_iterations, 0, power=0.5)
        optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
        start_vars = set(x.name for x in tf.global_variables())

        # first compute, then apply grads
        self.compute_grads = optimizer.compute_gradients(self.loss_opt, var_list=[self.adv_s])
        self.grad_ph = tf.placeholder(tf.float32, name='grad_adv_s')
        var = [tvar for tvar in tf.trainable_variables() if tvar.name.startswith('adv_s')][-1]  # get the last in
        # case explainer is re-initialized and a new graph is created
        grad_and_var = [(self.grad_ph, var)]
        self.apply_grads = optimizer.apply_gradients(grad_and_var, global_step=self.global_step)
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

    # variables to initialize
    self.setup = []  # type: list
    self.setup.append(self.orig.assign(self.assign_orig))
    self.setup.append(self.target.assign(self.assign_target))
    self.setup.append(self.const.assign(self.assign_const))
    self.setup.append(self.adv.assign(self.assign_adv))
    self.setup.append(self.adv_s.assign(self.assign_adv_s))
    self.setup.append(self.no_info.assign(self.assign_no_info))

    self.init = tf.variables_initializer(var_list=[self.global_step] + [self.adv_s] + [self.adv] + new_vars)

    if self.write_dir is not None:
        writer = tf.summary.FileWriter(write_dir, tf.get_default_graph())
        writer.add_graph(tf.get_default_graph())
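# ---------------------------------------------------------------------------
# The 'shrinkage_thresholding' scope built in both graphs above implements the
# proximal (soft-thresholding) step of FISTA for the L1 term (eq.7). A minimal
# NumPy sketch of the same element-wise rule, for intuition only; the actual
# computation happens inside the TensorFlow graph:
# ---------------------------------------------------------------------------
import numpy as np


def shrink(adv_s: np.ndarray, orig: np.ndarray, beta: float,
           feat_min: float = -1e10, feat_max: float = 1e10) -> np.ndarray:
    """Soft-threshold the perturbation (adv_s - orig) element-wise by beta."""
    diff = adv_s - orig
    upper = np.minimum(adv_s - beta, feat_max)  # shrink down where diff > beta
    lower = np.maximum(adv_s + beta, feat_min)  # shrink up where diff < -beta
    return np.where(diff > beta, upper,
                    np.where(diff < -beta, lower, orig))  # |diff| <= beta snaps back to orig


# e.g. with beta=0.5: a perturbation of +0.3 is removed, +/-0.8 is shrunk to +/-0.3
print(shrink(np.array([0.3, 0.8, -0.8]), np.zeros(3), beta=0.5))  # -> [ 0.   0.3 -0.3]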
def test_blackbox_check_keras_or_tf_no_keras_import():
    with mock.patch.dict('sys.modules', {'keras': None}):
        is_model, is_keras, sess = _check_keras_or_tf(blackbox_model)
        assert not is_model
        assert not is_keras
def test_keras_bb_check_keras_or_tf():
    is_model, is_keras, sess = _check_keras_or_tf(blackbox_keras)
    assert not is_model
    assert not is_keras
def test_tf_check_keras_or_tf():
    is_model, is_keras, sess = _check_keras_or_tf(tf_model)
    assert is_model
    assert not is_keras
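# ---------------------------------------------------------------------------
# The fixtures referenced above (`blackbox_model`, `blackbox_keras`, `tf_model`)
# are defined elsewhere in the test module. A minimal sketch of what they might
# look like, assuming `_check_keras_or_tf` flags only actual Keras/TF model
# instances as models and treats plain prediction functions as black boxes:
# ---------------------------------------------------------------------------
import numpy as np
import tensorflow as tf


def blackbox_model(x: np.ndarray) -> np.ndarray:
    # plain prediction function -> expected is_model=False, is_keras=False
    return np.full((x.shape[0], 2), 0.5)


_net = tf.keras.Sequential([tf.keras.layers.Dense(2, activation='softmax', input_shape=(4,))])


def blackbox_keras(x: np.ndarray) -> np.ndarray:
    # a Keras model hidden behind a function is still a black box to the check
    return _net.predict(x)


tf_model = _net  # a tf.keras Model instance -> expected is_model=True, is_keras=False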