def __init__(self, model, targeted=False, step_size_iter=0.05,
             max_perturbation=0.3, n_iterations=10, norm_order=np.inf,
             rand_init=None, rand_minmax=0.3, clip_min=None, clip_max=None,
             sanity_checks=True):
    """Store the PGD hyper-parameters and build the cleverhans attack.

    The ProjectedGradientDescent object is constructed inside this
    wrapper's own graph and bound to its session.
    """
    super().__init__(model=model, clip_min=clip_min, clip_max=clip_max)
    self._targeted = targeted
    self._step_size_iter = step_size_iter
    self._max_perturbation = max_perturbation
    self._n_iterations = n_iterations
    self._norm_order = norm_order
    self._rand_init = rand_init
    self._rand_minmax = rand_minmax
    self._sanity_checks = sanity_checks
    # Collect the attack configuration once, then instantiate in-graph.
    attack_kwargs = dict(
        eps=self._max_perturbation,
        eps_iter=self._step_size_iter,
        nb_iter=self._n_iterations,
        ord=self._norm_order,
        rand_init=self._rand_init,
        clip_min=self._clip_min,
        clip_max=self._clip_max,
        sanity_checks=self._sanity_checks,
    )
    with self.graph.as_default():
        self._method = ProjectedGradientDescent(
            self._model, sess=self.session, **attack_kwargs)
def get_at_loss(sess, x, y, model, eps, eps_iter, iterations):
    """Build the adversarial-training loss: softmax cross-entropy on PGD
    adversarial examples.

    `eps` / `eps_iter` are given on a [0, 255] pixel scale and rescaled to
    the [0, 1] input range here. Returns (mean adversarial loss, adversarial
    logits).
    """
    attack = ProjectedGradientDescent(model, sess=sess)
    adv_x = attack.generate(
        x,
        ord=np.inf,
        y=y,
        eps=eps / 255,
        eps_iter=eps_iter / 255,
        nb_iter=iterations,
        rand_init=True,
        rand_minmax=eps / 255,
        clip_min=0.,
        clip_max=1.,
        sanity_checks=True,
    )
    adv_logits = model.get_logits(adv_x)

    # Log a couple of the adversarial training images.
    with tf.device('/gpu:0'), tf.name_scope('Adversarial-Image-Summaries'):
        tf.summary.image('adv-input', adv_x, max_outputs=2,
                         family='Adversarial-Training',
                         collections=['training'])

    adv_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=adv_logits, labels=y))
    return adv_loss, adv_logits
def __init__(self, dataset, model):
    """Wrap a PyTorch model as a cleverhans PGD attack in a private TF
    graph/session (GPU memory capped at 50%, growth enabled)."""
    super(PGDAdaptor, self).__init__(dataset, model)
    gpu_opts = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    self.config = tf.ConfigProto(gpu_options=gpu_opts)
    self.config.gpu_options.allow_growth = True
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph, config=self.config)
    input_shape = get_input_shape(dataset)
    with self.sess.graph.as_default(), self.sess.as_default():
        self.tf_model = convert_pytorch_model_to_tf(self.model)
        self.ch_model = CallableModelWrapper(self.tf_model,
                                             output_layer='logits')
        self.x_op = tf.placeholder(
            tf.float32,
            shape=(None, input_shape[0], input_shape[1], input_shape[2]))
        self.attk = ProjectedGradientDescent(self.ch_model, sess=self.sess)
        # Cache of adversarial-prediction ops, filled lazily by callers.
        self.adv_preds_ops = dict()
def __init__(self, sess, in_tensor_name, out_tensor_name, mean=None, std=None):
    """Set up a PGD attack over a frozen TF graph addressed by tensor names.

    `mean` / `std` are optional normalisation constants kept for later use
    by the caller.
    """
    wrapped = CleverHansWrapperWrapper(
        sess, in_tensor_name, out_tensor_name, mean, std)
    ch_model = CallableModelWrapper(wrapped, 'logits')
    graph = tf.get_default_graph()
    self.sess = sess
    self.attack = ProjectedGradientDescent(ch_model, sess=sess)
    self.output_ten = graph.get_tensor_by_name(out_tensor_name)
    self.input_ten = graph.get_tensor_by_name(in_tensor_name)
    self.output_shape = [d.value for d in self.output_ten.shape]
    self.mean = mean
    self.std = std
def attack_images(model, tfrecords_dirpath, attack_type='PGD',
                  attack_kwargs=default_attack_kwargs):
    '''Attack images (batch = 1 for now).

    Reads examples with label ``attack_kwargs['y']`` from the tfrecords in
    `tfrecords_dirpath`, runs a targeted attack towards
    ``attack_kwargs['y_target']``, and returns the stacked images for which
    the attack succeeded (model predicts the target class). Returns an empty
    list when no attack succeeded.

    Raises ValueError for an unknown `attack_type`.
    '''
    # Copy before mutating: the default is a shared module-level dict and the
    # previous in-place `del attack_kwargs['y']` corrupted it across calls.
    attack_kwargs = dict(attack_kwargs)
    true_label = attack_kwargs['y']
    attack_label = attack_kwargs['y_target']
    del attack_kwargs['y']

    # Define tfrecords input iterator
    tfrecord_filepaths = glob(os.path.join(tfrecords_dirpath, '*'))
    tf_dataset = tfutils.make_dataset(tfrecord_filepaths,
                                      batch_size=1,
                                      filter_label=true_label,
                                      preprocessing_fn=preprocess_input)
    iterator = tf_dataset.make_one_shot_iterator()
    x, y = iterator.get_next()

    attacked_imgs = []
    with tf.Session() as sess:
        # Set attack settings
        if attack_type == "PGD":
            attack = ProjectedGradientDescent(model, sess=sess)
        elif attack_type == "FGM":
            attack = FastGradientMethod(model, sess=sess)
        else:
            # Previously an unknown type fell through and crashed later with
            # a NameError on `attack`; fail fast with a clear message.
            raise ValueError('Unsupported attack_type: %r' % attack_type)

        target_one_hot_encoded = get_one_hot_encoded_targets(attack_label)
        attack_kwargs['y_target'] = target_one_hot_encoded

        # Run the session to generate attacked images
        x_adv = attack.generate(x, **attack_kwargs)
        pbar = tqdm(unit='imgs')
        try:
            while True:
                attacked_img = sess.run(x_adv)
                predicted_class = get_predictions(model, attacked_img)
                print(predicted_class, attack_label)
                # Keep only images where the targeted attack succeeded.
                if predicted_class == attack_label:
                    attacked_imgs.append(attacked_img)
                pbar.update()
        except tf.errors.OutOfRangeError:
            # Dataset exhausted: the one-shot iterator signals end-of-input.
            pass

    if len(attacked_imgs) > 0:
        attacked_imgs = np.vstack(attacked_imgs)
    return attacked_imgs
def _get_pert(self, X, Y, eps: float, model):
    """Return the PGD perturbation (adversarial - clean) for a batch.

    For small budgets (eps < 0.05) the per-iteration step size is set to
    `eps` itself — presumably so the default step does not overshoot the
    tiny ball (NOTE(review): confirm against the attack's default eps_iter).
    """
    x = tf.placeholder(tf.float32, shape=([None] + list(self.n_features)))
    y = tf.placeholder(tf.float32, shape=(None, self.n_classes))
    wrap = KerasModelWrapper(model)
    attack = ProjectedGradientDescent(wrap, ord=self.ord, sess=self.sess)
    extra = {} if eps >= 0.05 else {'eps_iter': eps}
    adv_x = tf.stop_gradient(attack.generate(x, y=y, eps=eps, **extra))
    delta = adv_x - x
    return delta.eval(feed_dict={x: X, y: Y}, session=self.sess)
def pgd_attack():
    """Evaluate the global pytorch `model` under a white-box PGD attack on
    `test_loader`, printing running and final adversarial accuracy."""
    # Use tf for evaluation on adversarial data
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    x_op = tf.placeholder(tf.float32, shape=(None, 3, 32, 32))
    y_op = tf.placeholder(tf.float32, shape=(None, 10))

    # Bridge the pytorch model into the TF graph for cleverhans.
    tf_model_fn = convert_pytorch_model_to_tf(model)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

    # Create an PGD attack
    pgd = ProjectedGradientDescent(cleverhans_model, sess=sess)
    adv_x_op = pgd.generate(x_op, eps=args.eps, eps_iter=args.ss,
                            nb_iter=args.ns, clip_min=0., clip_max=1.,
                            y=y_op)
    adv_preds_op = tf_model_fn(adv_x_op)

    # Evaluation against PGD attacks
    correct, total = 0, 0
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        adv_preds = sess.run(adv_preds_op, feed_dict={
            x_op: inputs,
            y_op: torch.nn.functional.one_hot(targets, 10)
        })
        correct += (np.argmax(adv_preds, axis=1) == targets.numpy()).sum()
        total += len(inputs)
        sys.stdout.write("\rWhite-box PGD attack... Acc: %.3f%% (%d/%d)" %
                         (100. * correct / total, correct, total))
        sys.stdout.flush()
    print('Accuracy under PGD attack: %.3f%%' % (100. * correct / total))
class PGDAttack(AdversarialAttack):
    """Projected Gradient Descent attack behind the project's
    AdversarialAttack interface."""

    def __init__(self, model, targeted=False, step_size_iter=0.05,
                 max_perturbation=0.3, n_iterations=10, norm_order=np.inf,
                 rand_init=None, rand_minmax=0.3, clip_min=None,
                 clip_max=None, sanity_checks=True):
        """Record the hyper-parameters and build the cleverhans attack
        inside this wrapper's graph, bound to its session."""
        super().__init__(model=model, clip_min=clip_min, clip_max=clip_max)
        self._targeted = targeted
        self._step_size_iter = step_size_iter
        self._max_perturbation = max_perturbation
        self._n_iterations = n_iterations
        self._norm_order = norm_order
        self._rand_init = rand_init
        self._rand_minmax = rand_minmax
        self._sanity_checks = sanity_checks
        with self.graph.as_default():
            self._method = ProjectedGradientDescent(
                self._model,
                sess=self.session,
                eps=self._max_perturbation,
                eps_iter=self._step_size_iter,
                nb_iter=self._n_iterations,
                ord=self._norm_order,
                rand_init=self._rand_init,
                clip_min=self._clip_min,
                clip_max=self._clip_max,
                sanity_checks=self._sanity_checks)

    def attack_method(self, labels):
        """Build the adversarial-example op for the clean inputs.

        `labels` may be None (attack infers labels itself), the true labels
        (untargeted), or the target labels when `targeted` was requested.
        """
        generate_kwargs = {'rand_minmax': self._rand_minmax}
        if labels is not None:
            key = 'y_target' if self._targeted else 'y'
            generate_kwargs[key] = labels
        return self._method.generate(x=self._x_clean, **generate_kwargs)
def _get_pert(self, X, Y, eps):
    """Return PGD perturbations (adversarial - clean) for inputs X, labels Y.

    Returns an all-zero array when ``eps == 0`` (no attack budget).
    """
    if eps == 0:
        return np.zeros_like(X)
    with self.sess.as_default():
        self.x = self.wrap.input
        # BUG FIX: the attack must be built from the model wrapper, not from
        # the input placeholder `self.x` that was passed here before.
        pgd = ProjectedGradientDescent(self.wrap, sess=self.sess)
        adv_x = pgd.generate(self.x, y=self.y, eps=eps, ord=self.ord,
                             eps_iter=0.01)
        adv_x = tf.stop_gradient(adv_x)
        pert_x = adv_x - self.x
        feed_dict = {self.x: X, self.y: Y}
        return pert_x.eval(feed_dict=feed_dict)
def evaluate_checkpoint(filename):
    """Restore a checkpoint and report test-set accuracy on adversarial
    examples built with the globally selected `attack_method`
    (BIM / FGM / PGD)."""
    if attack_method == 'BIM':
        attack = BasicIterativeMethod(model)
        params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.,
                  'nb_iter': 50, 'eps_iter': .01}
    elif attack_method == 'FGM':
        attack = FastGradientMethod(model)
        params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    elif attack_method == 'PGD':
        attack = ProjectedGradientDescent(model)
        params = {'eps': 0.09, 'clip_min': 0., 'clip_max': 1.,
                  'nb_iter': 40, 'eps_iter': .01}
    adv_x = attack.generate(x_image, **params)
    preds_adv = model.get_probs(adv_x)

    with tf.Session() as sess:
        # Restore the checkpoint
        saver = tf.train.Saver(var_list=model.all_variables)
        saver.restore(sess, filename)

        eval_par = {'batch_size': batch_size}
        start = time.time()
        acc = model_eval(sess, x_image, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print("Took", time.time() - start, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
def single_run_max_confidence_recipe(sess, model, x, y, nb_classes, eps,
                                     clip_min, clip_max, eps_iter, nb_iter,
                                     report_path, batch_size=BATCH_SIZE):
    """A reasonable attack bundling recipe for a max norm threat model and a
    defender that uses confidence thresholding.

    This recipe uses both uniform noise and randomly-initialized PGD targeted
    attacks. It runs each attack (noise, targeted PGD per class with
    `nb_iter` iterations, targeted PGD per class with 25X more iterations)
    exactly once, then stops. See `basic_max_confidence_recipe` for a
    version that runs indefinitely.

    References: https://openreview.net/forum?id=H1g0piA9tQ

    :param sess: tf.Session
    :param model: cleverhans.model.Model
    :param x: numpy array containing clean example inputs to attack
    :param y: numpy array containing true labels
    :param nb_classes: int, number of classes
    :param eps: float, maximum size of perturbation (measured by max norm)
    :param eps_iter: float, step size for one version of the PGD attacks
        (another version runs with a 25X smaller step size)
    :param nb_iter: int, number of iterations for the cheaper PGD attacks
        (another version runs with 25X more iterations)
    :param report_path: str, the path that the report will be saved to
    :param batch_size: int, total number of examples run simultaneously
    """
    noise_attack = Noise(model, sess)
    pgd_attack = ProjectedGradientDescent(model, sess)
    threat_params = {"eps": eps, "clip_min": clip_min, "clip_max": clip_max}
    noise_attack_config = AttackConfig(noise_attack, threat_params, "noise")

    pgd_params = copy.copy(threat_params)
    pgd_params["eps_iter"] = eps_iter
    pgd_params["nb_iter"] = nb_iter

    assert batch_size % num_devices == 0
    dev_batch_size = batch_size // num_devices
    ones = tf.ones(dev_batch_size, tf.int32)

    # One cheap and one 25X-more-expensive targeted PGD config per class.
    pgd_attack_configs = []
    expensive_pgd = []
    for cls in range(nb_classes):
        cls_params = copy.copy(pgd_params)
        cls_params['y_target'] = tf.to_float(tf.one_hot(ones * cls,
                                                        nb_classes))
        pgd_attack_configs.append(
            AttackConfig(pgd_attack, cls_params, "pgd_" + str(cls)))

        expensive_params = copy.copy(cls_params)
        expensive_params["eps_iter"] /= 25.
        expensive_params["nb_iter"] *= 25.
        expensive_pgd.append(
            AttackConfig(pgd_attack, expensive_params,
                         "expensive_pgd_" + str(cls)))

    attack_configs = [noise_attack_config] + pgd_attack_configs + expensive_pgd
    new_work_goal = {config: 1 for config in attack_configs}
    goals = [MaxConfidence(t=1., new_work_goal=new_work_goal)]
    bundle_attacks(sess, model, x, y, attack_configs, goals, report_path)
def build_attack(model, sess, eps=0.3, clip_min=0.0, clip_max=1.0):
    """Wrap a Keras model for cleverhans and build a PGD attack.

    Returns (attack object, adversarial accuracy metric, adversarial loss)
    for the given threat parameters.
    """
    # Wrap model with cleverhans and init the attack method
    wrapped_model = KerasModelWrapper(model)
    pgd = ProjectedGradientDescent(wrapped_model, sess=sess)

    # Build acc and loss
    pgd_params = {"eps": eps, "clip_min": clip_min, "clip_max": clip_max}
    adv_acc_metric = get_adversarial_acc_metric(model, pgd, pgd_params)
    adv_loss = get_adversarial_loss(model, pgd, pgd_params)
    return pgd, adv_acc_metric, adv_loss
def init_attack(model, attack_params_dict, sess):
    """Initialize an adversarial attack using the cleverhans toolbox.

    Parameters
    ----------
    model : Keras Model
        The model to attack
    attack_params_dict : dict
        Self-defined dictionary specifying the attack and its parameters
    sess : Session
        The current tf session

    Returns
    -------
    attack : cleverhans Attack
        The Attack object
    attack_params : dict
        Attack parameter values valid for generating adversarial examples
        with cleverhans
    batch_size : int or None
        Forced batch size (only SPSA requires one), else None

    Raises
    ------
    NotImplementedError
        For an unrecognized attack name.
    """
    # Wrapper for the Keras model
    model_wrap = KerasModelWrapper(model)

    name = attack_params_dict['attack']
    batch_size = None
    if name == 'fgsm':
        attack = FastGradientMethod(model_wrap, sess=sess)
        attack_params = {'eps': attack_params_dict['eps'],
                         'clip_min': 0., 'clip_max': 1.}
    elif name == 'spsa':
        attack = SPSA(model_wrap, sess=sess)
        attack_params = {'epsilon': attack_params_dict['eps'],
                         'num_steps': attack_params_dict['n_steps']}
        batch_size = 1  # SPSA only supports one example at a time
    elif name == 'deepfool':
        attack = DeepFool(model_wrap, sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    elif name == 'pgd':
        attack = ProjectedGradientDescent(model_wrap, sess=sess)
        attack_params = {'eps': attack_params_dict['eps'],
                         'eps_iter': attack_params_dict['eps_iter'],
                         'nb_iter': attack_params_dict['n_steps'],
                         'clip_min': 0., 'clip_max': 1.}
    elif name == 'carlini':
        attack = CarliniWagnerL2(model_wrap, sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    else:
        raise NotImplementedError()

    return attack, attack_params, batch_size
def eval_cleverhans():
    """Evaluate the global Keras `model` on adversarial examples built with
    the attack described by `attack_params_dict`; returns adversarial
    accuracy."""
    # Force inference mode while attacking; restored before returning.
    learning_phase = K.learning_phase()
    K.set_learning_phase(0)

    # Pre-process images
    images_tf = images.astype(K.floatx())
    images_tf /= 255.

    # Wrapper for the Keras model
    model_wrap = KerasModelWrapper(model)

    # Initialize attack
    attack_name = attack_params_dict['attack']
    if attack_name == 'fgsm':
        attack = FastGradientMethod(model_wrap, sess=K.get_session())
        attack_params = {'eps': attack_params_dict['eps'],
                         'clip_min': 0., 'clip_max': 1.}
    elif attack_name == 'deepfool':
        attack = DeepFool(model_wrap, sess=K.get_session())
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    elif attack_name == 'madry':
        attack = ProjectedGradientDescent(model_wrap, sess=K.get_session())
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    elif attack_name == 'carlini':
        attack = CarliniWagnerL2(model_wrap, sess=K.get_session())
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    else:
        raise NotImplementedError()

    # Define input TF placeholders
    x = tf.placeholder(K.floatx(), shape=(None,) + images.shape[1:])
    y = tf.placeholder(K.floatx(), shape=(None,) + (labels.shape[-1],))

    # Define adversarial predictions symbolically
    x_adv = tf.stop_gradient(attack.generate(x, **attack_params))
    predictions_adv = model(x_adv)

    # Evaluate the accuracy of the model on adversarial examples
    eval_par = {'batch_size': batch_size}
    acc_adv = model_eval(K.get_session(), x, y, predictions_adv, images_tf,
                         labels, args=eval_par)
    print('Aversarial accuracy against %s: %.4f\n' %
          (attack_params_dict['attack'], acc_adv))

    # Set original phase
    K.set_learning_phase(learning_phase)
    return acc_adv
def get_alp_loss(sess, x, y, logits, adv_logits, model, eps, eps_iter,
                 iterations):
    """Adversarial Logit Pairing loss: mean-squared error between clean and
    adversarial logits.

    If `adv_logits` is None the adversarial logits are produced here with a
    PGD attack (budgets given on a [0, 255] scale, rescaled to [0, 1]).
    """
    if adv_logits is None:
        attack = ProjectedGradientDescent(model, sess=sess)
        adv_x = attack.generate(
            x,
            ord=np.inf,
            y=y,
            eps=eps / 255,
            eps_iter=eps_iter / 255,
            nb_iter=iterations,
            rand_init=True,
            rand_minmax=eps / 255,
            clip_min=0.,
            clip_max=1.,
            sanity_checks=True,
        )
        adv_logits = model.get_logits(adv_x)
    return tf.losses.mean_squared_error(logits, adv_logits)
class PGDUtil(object):
    """Helper that mixes PGD adversarial examples into a training set."""

    def __init__(self, model, sess, log_file="log.txt"):
        self.pgd = ProjectedGradientDescent(model=model, sess=sess)
        self.log_file = log_file

    def create_adversaries(self, x_train, y_train, i, nb_of_adv=None):
        """Perturb `nb_of_adv` samples (all of them by default) and return
        the adversarial and clean samples stacked, with matching labels."""
        if nb_of_adv is None:
            nb_of_adv = len(x_train)
        split = divide_into_clean_and_adversarial_set(
            x_train, y_train, nb_of_adv, i)
        adv_train_x, clean_train_x, adv_train_y, clean_train_y = split
        adv_x = self.pgd.generate_np(adv_train_x)
        stacked_x = numpy.vstack((adv_x, clean_train_x))
        stacked_y = numpy.vstack((adv_train_y, clean_train_y))
        return stacked_x, stacked_y
def test_callable_no_softmax():
    """A callable without a softmax cannot expose logits: expect TypeError."""
    batch_size = 2
    nb_classes = 3

    def model(x):
        return tf.ones((batch_size, nb_classes)) / nb_classes

    sess = tf.Session()
    attack = ProjectedGradientDescent(model, sess=sess)
    x = tf.ones((batch_size, 3))
    # ProjectedGradientDescent currently treats a callable's output as probs
    # rather than logits. Since this callable uses no softmax, the logits
    # cannot be recovered, which must raise.
    assert_raises(TypeError, attack.generate, x)
def test_no_logits():
    """test_no_logits: Check that a model without logits causes an error"""
    batch_size = 2
    nb_classes = 3

    class NoLogitsModel(Model):
        """A model that neither defines logits nor makes it possible to find
        logits by inspecting the inputs to a softmax op."""

        def fprop(self, x, **kwargs):
            return {'probs': tf.ones((batch_size, nb_classes)) / nb_classes}

    sess = tf.Session()
    attack = ProjectedGradientDescent(NoLogitsModel(), sess=sess)
    x = tf.ones((batch_size, 3))
    assert_raises(NotImplementedError, attack.generate, x)
class PGDGenerator:
    """Generates (optionally targeted) PGD adversarial examples for a frozen
    TF graph addressed by input/output tensor names."""

    def __init__(self, sess, in_tensor_name, out_tensor_name, mean=None,
                 std=None):
        """Wrap the graph's model for cleverhans and cache the I/O tensors.

        `mean` / `std` are optional normalisation constants applied to
        generated examples before verification.
        """
        wrapped = CleverHansWrapperWrapper(
            sess, in_tensor_name, out_tensor_name, mean, std)
        ch_model = CallableModelWrapper(wrapped, 'logits')
        graph = tf.get_default_graph()
        self.sess = sess
        self.attack = ProjectedGradientDescent(ch_model, sess=sess)
        self.output_ten = graph.get_tensor_by_name(out_tensor_name)
        self.input_ten = graph.get_tensor_by_name(in_tensor_name)
        self.output_shape = [d.value for d in self.output_ten.shape]
        self.mean = mean
        self.std = std

    def generate(self, img, **kwargs):
        """Attack `img` and return the adversarial example, or None when
        `check` is enabled and the attack did not succeed.

        Extra kwargs understood here (consumed before the attack runs):
          y_target      -- class index or list of indices; one is sampled
                           and one-hot encoded for a targeted attack
          eps_iter_size -- fraction of `eps` used as the per-step size
                           (only applied when `eps` is set and `eps_iter`
                           is not)
          check         -- verify attack success (default True)
        """
        y_target = None
        if 'y_target' in kwargs:
            y_target = kwargs['y_target']
            if not isinstance(y_target, (np.ndarray, list)):
                y_target = [y_target]
            # Sample one target class and one-hot encode it.
            one_hot = np.zeros(self.output_shape)
            one_hot[0, y_target[np.random.randint(0, len(y_target))]] = 1
            kwargs['y_target'] = one_hot

        if ('eps_iter_size' in kwargs and 'eps_iter' not in kwargs
                and 'eps' in kwargs):
            kwargs['eps_iter'] = kwargs['eps'] * kwargs['eps_iter_size']
            del kwargs['eps_iter_size']

        check = kwargs.pop('check', True)

        adv_example = self.attack.generate_np(img, **kwargs)

        if self.std is not None:
            adv_example = adv_example.reshape(-1, 1)
            adv_example = (adv_example - self.mean) / self.std
            adv_example = adv_example.reshape(-1)

        predicted = np.argmax(self.sess.run(
            self.output_ten, feed_dict={self.input_ten: adv_example}))
        if not check:
            return adv_example
        if y_target is not None and predicted in y_target:
            return adv_example
        if y_target is None:
            clean_cl = np.argmax(self.sess.run(
                self.output_ten, feed_dict={self.input_ten: img}))
            if clean_cl != predicted:
                return adv_example
        return None
def train_child(t, p, m, load_dict=False):
    """Train a child model on adversarially-augmented data and log accuracy.

    `t`, `p`, `m` describe the attack sub-policies (operation type,
    application probability, magnitude). Trains `model` on attacked data and
    `raw_model` on clean data, evaluates both against noise-perturbed
    validation data, appends the four accuracies to a CSV and saves all
    state dicts.
    """
    raw_model = TestCNN().cuda(0)
    model = TestCNN().cuda(0)
    tf_model = convert_pytorch_model_to_tf(model)
    cleverhans_model = CallableModelWrapper(tf_model, output_layer='logits')
    session = tf.Session()
    x_op = tf.placeholder(tf.float32, shape=(None, 3, 32, 32))

    fgsm = FastGradientMethod(cleverhans_model, sess=session)
    pgd = ProjectedGradientDescent(cleverhans_model, sess=session)
    noise = Noise(cleverhans_model, sess=session)
    mim = MomentumIterativeMethod(cleverhans_model, sess=session)
    df = DeepFool(cleverhans_model, sess=session)

    tf_raw_model = convert_pytorch_model_to_tf(raw_model)
    cleverhans_raw_model = CallableModelWrapper(tf_raw_model,
                                                output_layer='logits')
    noise_raw = Noise(cleverhans_raw_model, sess=session)

    def fgsm_op(x, eps):
        att = fgsm.generate(x_op, eps=eps)
        return session.run(att, feed_dict={x_op: x})

    def pgd_op(x, eps):
        att = pgd.generate(x_op, eps=eps, eps_iter=eps * 0.2, nb_iter=3)
        return session.run(att, feed_dict={x_op: x})

    def noise_op(x, eps):
        att = noise.generate(x_op, eps=eps)
        return session.run(att, feed_dict={x_op: x})

    def noise_raw_op(x, eps):
        att = noise_raw.generate(x_op, eps=eps)
        return session.run(att, feed_dict={x_op: x})

    def df_op(x):
        att = df.generate(x_op, nb_candidate=10, max_iter=3)
        return session.run(att, feed_dict={x_op: x})

    def mim_op(x, eps):
        att = mim.generate(x_op, eps=eps, eps_iter=eps * 0.2)
        return session.run(att, feed_dict={x_op: x})

    def attack_train(x):
        """Apply the sampled sub-policies to a copy of x and return it."""
        attacks = [fgsm_op, pgd_op, mim_op]
        attacks_name = ['FGSM', 'PGD', 'MIM']
        eps = [[0.03, 0.3], [0.03, 0.3], [0.03, 0.3]]
        train_x_adv = x.copy()
        adv_type = np.random.randint(SUBPOLICY_COUNT, size=len(train_x_adv))
        for sub_idx, (ti, pi, mi) in enumerate(
                tqdm(zip(t, p, m), total=len(t), desc='Subpolicy: ',
                     leave=False)):
            sub_mask = adv_type == sub_idx
            adv_i = train_x_adv[sub_mask]
            for tj, pj, mj in tqdm(zip(ti, pi, mi), total=len(ti),
                                   desc='Operation: ', leave=False):
                tj, pj, mj = (*tj, *pj, *mj)
                # BUG FIX: use rand() (uniform on [0, 1)) so `pj` acts as an
                # application probability; randn() is a standard normal.
                op_mask = np.random.rand(len(adv_i)) < pj
                adv_j = adv_i[op_mask]
                magnitude = ((mj + 1) / MAGN_COUNT *
                             (eps[tj][1] - eps[tj][0]) + eps[tj][0])
                # BUG FIX: loop index renamed (it shadowed the outer `i`).
                for k in tqdm(range(0, len(adv_j), BATCH_SIZE),
                              desc=attacks_name[tj] + ': ', leave=False):
                    adv_j[k:k + BATCH_SIZE] = attacks[tj](
                        adv_j[k:k + BATCH_SIZE], magnitude)
                # BUG FIX: fancy indexing returns copies, so the perturbed
                # batches must be written back explicitly.
                adv_i[op_mask] = adv_j
            train_x_adv[sub_mask] = adv_i
        return train_x_adv

    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    raw_optimizer = optim.SGD(raw_model.parameters(), lr=1e-3)

    train_x_adv = attack_train(train_x)
    adv_trainset = torch.utils.data.TensorDataset(
        torch.tensor(train_x_adv, dtype=torch.float),
        torch.tensor(train_y, dtype=torch.long))
    # BUG FIX: wrap the adversarial dataset (previously wrapped `trainset`,
    # so the model never saw the attacked data).
    adv_trainloader = torch.utils.data.DataLoader(
        adv_trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

    if load_dict:
        model.load_state_dict(torch.load('black_eval_runs/model.pt'))
        optimizer.load_state_dict(torch.load('black_eval_runs/optimizer.pt'))
        raw_model.load_state_dict(torch.load('black_eval_runs/raw_model.pt'))
        raw_optimizer.load_state_dict(
            torch.load('black_eval_runs/raw_optimizer.pt'))

    # Train the adversarially-augmented model.
    model.train()
    batch_tqdm = tqdm(adv_trainloader, leave=False)
    for x, y in batch_tqdm:
        optimizer.zero_grad()
        output = model(x.cuda(0))
        loss = criterion(output, y.cuda(0))
        loss.backward()
        optimizer.step()
        acc = torch.sum(output.cpu().argmax(axis=1) == y) / y.size(0)
        batch_tqdm.set_description(f'adv {loss:.3f} {acc:.3f}')

    # Train the clean baseline model.
    batch_tqdm = tqdm(trainloader, leave=False)
    raw_model.train()
    for x, y in batch_tqdm:
        raw_optimizer.zero_grad()
        output = raw_model(x.cuda(0))
        loss = criterion(output, y.cuda(0))
        loss.backward()
        raw_optimizer.step()
        acc = torch.sum(output.cpu().argmax(axis=1) == y) / y.size(0)
        batch_tqdm.set_description(f'raw {loss:.3f} {acc:.3f}')

    with torch.no_grad():
        # Adversarially-trained model on clean validation data.
        model.eval()
        tot_acc = 0
        for x, y in tqdm(valloader, leave=False):
            output = model(x.cuda(0))
            tot_acc += float(torch.sum(output.cpu().argmax(axis=1) == y))
        adv_raw_acc = tot_acc / len(val_x)

        # Adversarially-trained model on noise-perturbed validation data.
        val_x_adv = np.zeros_like(val_x)
        for k in tqdm(range(0, len(val_x_adv), BATCH_SIZE), desc='Noise: ',
                      leave=False):
            val_x_adv[k:k + BATCH_SIZE] = noise_op(val_x[k:k + BATCH_SIZE],
                                                   0.3)
        adv_valset = torch.utils.data.TensorDataset(
            torch.tensor(val_x_adv, dtype=torch.float),
            torch.tensor(val_y, dtype=torch.long))
        adv_valloader = torch.utils.data.DataLoader(
            adv_valset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
        tot_acc = 0
        for x, y in tqdm(adv_valloader, leave=False):
            output = model(x.cuda(0))
            tot_acc += float(torch.sum(output.cpu().argmax(axis=1) == y))
        adv_adv_acc = tot_acc / len(val_x)

        # Baseline model on clean validation data.
        raw_model.eval()
        tot_acc = 0
        for x, y in tqdm(valloader, leave=False):
            output = raw_model(x.cuda(0))
            tot_acc += float(torch.sum(output.cpu().argmax(axis=1) == y))
        raw_raw_acc = tot_acc / len(val_x)

        # Baseline model on noise-perturbed validation data.
        val_x_adv = np.zeros_like(val_x)
        for k in tqdm(range(0, len(val_x_adv), BATCH_SIZE), desc='Noise: ',
                      leave=False):
            val_x_adv[k:k + BATCH_SIZE] = noise_raw_op(
                val_x[k:k + BATCH_SIZE], 0.3)
        adv_valset = torch.utils.data.TensorDataset(
            torch.tensor(val_x_adv, dtype=torch.float),
            torch.tensor(val_y, dtype=torch.long))
        adv_valloader = torch.utils.data.DataLoader(
            adv_valset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
        tot_acc = 0
        for x, y in tqdm(adv_valloader, leave=False):
            output = raw_model(x.cuda(0))
            tot_acc += float(torch.sum(output.cpu().argmax(axis=1) == y))
        raw_adv_acc = tot_acc / len(val_x)

    with open('black_eval_runs/acc.csv', 'a') as f:
        f.write(f'{adv_raw_acc},{adv_adv_acc},{raw_raw_acc},{raw_adv_acc}\n')
    print(
        f'adv {adv_raw_acc:.3f} -> {adv_adv_acc:.3f} | raw {raw_raw_acc:.3f} -> {raw_adv_acc:.3f}'
    )
    torch.save(model.state_dict(), 'black_eval_runs/model.pt')
    torch.save(optimizer.state_dict(), 'black_eval_runs/optimizer.pt')
    torch.save(raw_model.state_dict(), 'black_eval_runs/raw_model.pt')
    torch.save(raw_optimizer.state_dict(),
               'black_eval_runs/raw_optimizer.pt')
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

if __name__ == '__main__':
    model_keras = keras.models.load_model('../models_test/model_cifar_2.h5')
    batch_size = 512
    success = 0
    data_size = X_train.shape[0]
    adv_train = []
    time_st = time.time()

    # Build the session, wrapper and attack ONCE, outside the loop: the
    # previous version reconstructed them per batch, re-adding nodes to the
    # TF graph on every iteration.
    sess = keras.backend.get_session()
    model = KerasModelWrapper(model_keras)
    attack = ProjectedGradientDescent(model, sess=sess)
    param = dict(
        eps=10 / 255,
        eps_iter=10 / 255 / 40,
        nb_iter=40,
        rand_init=True,
    )

    for st in range(0, data_size, batch_size):
        # BUG FIX: `np.float` was removed from NumPy; use the builtin float.
        sample = np.array(
            X_train[st:st + batch_size].reshape(-1, 32 * 32 * 3) / 255,
            dtype=float)
        advs = attack.generate_np(sample, **param)
        adv_train.append(advs)
        # Count samples whose predicted class changed under attack.
        preb = model_keras.predict(advs).argmax(axis=1).reshape(
            (sample.shape[0], ))
        y_sample = model_keras.predict(sample).argmax(axis=1).reshape(
            (sample.shape[0], ))
        success += (preb != y_sample).sum()
        print((preb != y_sample).sum())
def impl(sess, model, dataset, factory, x_data, y_data,
         base_eps_iter=BASE_EPS_ITER, nb_iter=NB_ITER,
         batch_size=BATCH_SIZE):
    """The actual implementation of the evaluation.

    :param sess: tf.Session
    :param model: cleverhans.model.Model
    :param dataset: cleverhans.dataset.Dataset
    :param factory: the dataset factory corresponding to `dataset`
    :param x_data: numpy array of input examples
    :param y_data: numpy array of class labels
    :param base_eps_iter: step size for PGD if data were in [0, 1]
    :param nb_iter: number of PGD iterations
    :returns: dict mapping string adversarial example names to accuracies
    """
    center = dataset.kwargs['center']
    max_val = dataset.kwargs['max_val']
    value_range = max_val * (1. + center)
    min_value = 0. - center * max_val

    # Pick the canonical [0, 1]-scale budget for the dataset family.
    cls_name = str(factory.cls)
    if 'CIFAR' in cls_name:
        base_eps = 8. / 255.
        if base_eps_iter is None:
            base_eps_iter = 2. / 255.
    elif 'MNIST' in cls_name:
        base_eps = .3
        if base_eps_iter is None:
            base_eps_iter = .1
    else:
        raise NotImplementedError(cls_name)

    pgd_params = {
        'eps': base_eps * value_range,
        'eps_iter': base_eps_iter * value_range,
        'nb_iter': nb_iter,
        'clip_min': min_value,
        'clip_max': max_val
    }
    semantic = Semantic(model, center, max_val, sess)
    pgd = ProjectedGradientDescent(model, sess=sess)

    jobs = [('clean', None, None, None),
            ('Semantic', semantic, None, None),
            ('pgd', pgd, pgd_params, None)]
    out = {}
    for name, attack, attack_params, job_batch_size in jobs:
        if job_batch_size is None:
            job_batch_size = batch_size
        start = time.time()
        acc = accuracy(sess, model, x_data, y_data,
                       batch_size=job_batch_size, devices=devices,
                       attack=attack, attack_params=attack_params)
        elapsed = time.time() - start
        out[name] = acc
        print("Accuracy on " + name + " examples: ", acc)
        print("Evaluation took", elapsed, "seconds")
    return out
def main(argv):
    """Generate targeted PGD adversarial examples for every tfrecord image
    with label FLAGS.label and (unless debugging) store them in HDF5."""
    del argv
    if FLAGS.debug:
        logging.info('Running in debug mode!!!')
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    tfrecord_filepaths = glob(os.path.join(FLAGS.tfrecords_dir, '*'))
    tf_dataset = tfutils.make_dataset(tfrecord_filepaths,
                                      batch_size=1,
                                      filter_label=FLAGS.label,
                                      preprocessing_fn=preprocess_input)

    hdf5_dataset = None
    if not FLAGS.debug:
        hdf5_file = h5py.File(FLAGS.hdf5_data_path, 'a')
        hdf5_group = get_attack_group_name(O_ATTACK_NAME, FLAGS.label)
        hdf5_dataset = hdf5utils.create_image_dataset(
            hdf5_file,
            group=hdf5_group,
            attrs={
                'seed': FLAGS.seed,
                'eps': FLAGS.eps,
                'ord': FLAGS.ord,
                'eps_iter': FLAGS.eps_iter,
                'nb_iter': FLAGS.nb_iter,
                'target': FLAGS.target
            })

    model = InceptionV1Model()
    iterator = tf_dataset.make_one_shot_iterator()
    x, y = iterator.get_next()

    with tf.Session() as sess:
        attack = ProjectedGradientDescent(model, sess=sess)
        target_one_hot_encoded = get_one_hot_encoded_targets(FLAGS.target)
        norm = int(FLAGS.ord) if FLAGS.ord != 'inf' else np.inf
        x_adv = attack.generate(x,
                                eps=FLAGS.eps,
                                nb_iter=FLAGS.nb_iter,
                                eps_iter=FLAGS.eps_iter,
                                ord=norm,
                                y_target=target_one_hot_encoded)
        pbar = tqdm(unit='imgs')
        try:
            while True:
                attacked_imgs = sess.run(x_adv)
                if not FLAGS.debug:
                    hdf5utils.add_images_to_dataset(attacked_imgs,
                                                    hdf5_dataset)
                pbar.update()
        except tf.errors.OutOfRangeError:
            # One-shot iterator exhausted: all images processed.
            pass
print(results) print("results on target model: ") results = metrics(model_target, X_adv, X_test, y_test, indices) print(results) #####BIM print("BIM") bim_params = {'eps': 0.03, 'nb_iter': 300, 'eps_iter': 0.03/100, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1., 'rand_init': False } bim = ProjectedGradientDescent(wrap, sess=sess) X_adv = np.zeros((len(indices),32,32,3)) for i in range(0,len(indices),batch_attack): X_adv[i:i+batch_attack] = bim.generate_np(X_test[indices[i:(i+batch_attack)]], **bim_params) print("results on source model: ") results = metrics(model, X_adv, X_test, y_test, indices) print(results) print("results on target model: ") results = metrics(model_target, X_adv, X_test, y_test, indices) print(results) #####CWL2 print("CWL2") cwl2_params = {'binary_search_steps': 10, 'max_iterations': 100, 'learning_rate': 0.1,
def fixed_max_confidence_recipe(sess, model, x, y, nb_classes, eps,
                                clip_min, clip_max, eps_iter, nb_iter,
                                report_path, batch_size=BATCH_SIZE):
    """A reasonable attack bundling recipe for a max norm threat model and
    a defender that uses confidence thresholding.

    References: https://openreview.net/forum?id=H1g0piA9tQ

    This version runs each attack a fixed number of times.
    It is more exhaustive than `single_run_max_confidence_recipe` but because
    it uses a fixed budget rather than running indefinitely it is more
    appropriate for making fair comparisons between two models.

    :param sess: tf.Session
    :param model: cleverhans.model.Model
    :param x: numpy array containing clean example inputs to attack
    :param y: numpy array containing true labels
    :param nb_classes: int, number of classes
    :param eps: float, maximum size of perturbation (measured by max norm)
    :param clip_min: float, minimum value of valid input pixels
    :param clip_max: float, maximum value of valid input pixels
    :param eps_iter: float, step size for one version of PGD attacks
      (will also run another version with 25X smaller step size)
    :param nb_iter: int, number of iterations for one version of PGD attacks
      (will also run another version with 25X more iterations)
    :param report_path: str, the path that the report will be saved to.
    :param batch_size: int, the total number of examples to run simultaneously
    """
    noise_attack = Noise(model, sess)
    pgd_attack = ProjectedGradientDescent(model, sess)
    # Base threat-model parameters shared by every attack configuration.
    threat_params = {"eps": eps, "clip_min": clip_min, "clip_max": clip_max}
    noise_attack_config = AttackConfig(noise_attack, threat_params)
    # FIX: removed a dead `attack_configs = [noise_attack_config]` assignment
    # here — it was unconditionally overwritten below after the PGD configs
    # were built.
    pgd_attack_configs = []
    pgd_params = copy.copy(threat_params)
    pgd_params["eps_iter"] = eps_iter
    pgd_params["nb_iter"] = nb_iter
    assert batch_size % num_devices == 0
    dev_batch_size = batch_size // num_devices
    ones = tf.ones(dev_batch_size, tf.int32)
    expensive_pgd = []
    # One targeted PGD config per class, plus an "expensive" variant with a
    # 25x smaller step size and 25x more iterations.
    for cls in range(nb_classes):
        cls_params = copy.copy(pgd_params)
        cls_params['y_target'] = tf.to_float(tf.one_hot(ones * cls, nb_classes))
        cls_attack_config = AttackConfig(pgd_attack, cls_params,
                                         "pgd_" + str(cls))
        pgd_attack_configs.append(cls_attack_config)
        expensive_params = copy.copy(cls_params)
        expensive_params["eps_iter"] /= 25.
        expensive_params["nb_iter"] *= 25.
        expensive_config = AttackConfig(pgd_attack, expensive_params,
                                        "expensive_pgd_" + str(cls))
        expensive_pgd.append(expensive_config)
    attack_configs = [noise_attack_config] + pgd_attack_configs + expensive_pgd
    # Fixed per-config work budgets (number of runs per example).
    new_work_goal = {config: 5 for config in attack_configs}
    pgd_work_goal = {config: 5 for config in pgd_attack_configs}
    # TODO: lower priority: make sure bundler won't waste time running targeted
    # attacks on examples where the target class is the true class
    goals = [
        Misclassify(new_work_goal={noise_attack_config: 50}),
        Misclassify(new_work_goal=pgd_work_goal),
        MaxConfidence(t=0.5, new_work_goal=new_work_goal),
        MaxConfidence(t=0.75, new_work_goal=new_work_goal),
        MaxConfidence(t=0.875, new_work_goal=new_work_goal),
        MaxConfidence(t=0.9375, new_work_goal=new_work_goal),
        MaxConfidence(t=0.96875, new_work_goal=new_work_goal),
        MaxConfidence(t=0.984375, new_work_goal=new_work_goal),
        MaxConfidence(t=1., new_work_goal=new_work_goal)
    ]
    bundle_attacks(sess, model, x, y, attack_configs, goals, report_path)
def eval(sess, model_name, X_train, Y_train, X_test, Y_test, cnn=False,
         rbf=False, fgsm=False, jsma=False, df=False, bim=False):
    """
    Load model saved in model_name.h5 (or rbfmodels/model_name.h5 when
    rbf=True) and evaluate its accuracy on legitimate test samples and on
    adversarial samples crafted by the attacks selected via the boolean
    flags. Results are appended to mnist_results.txt. Use cnn=True if the
    model is CNN based (28x28x1 input instead of flat 784).

    Note: X_train/Y_train are accepted for interface compatibility but are
    not used. The session is closed before returning.
    """
    # `with` guarantees the results file is closed even if an attack or
    # model load raises (the original left it open on error).
    with open("mnist_results.txt", "w") as text_file:
        # load saved model
        print("Load model ... ")
        if rbf:
            loaded_model = load_model("rbfmodels/{}.h5".format(model_name),
                                      custom_objects={'RBFLayer': RBFLayer})
            text_file.write('Evaluating on rbfmodels/{}.h5\n\n'.format(model_name))
        else:
            loaded_model = load_model("models/{}.h5".format(model_name))
            text_file.write('Evaluating on models/{}.h5\n\n'.format(model_name))

        # Set placeholders: image tensor for CNNs, flat vector otherwise.
        if cnn:
            x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        else:
            x = tf.placeholder(tf.float32, shape=(None, 784))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        # Baseline accuracy on clean test data.
        predictions = loaded_model(x)
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args={"batch_size": 128})
        text_file.write('Test accuracy on legitimate test examples: {0}\n'.format(str(accuracy)))

        wrap = KerasModelWrapper(loaded_model)

        def _eval_adv(attack, attack_params, label):
            # Craft adversarial examples with `attack`, evaluate the model on
            # them, and append the accuracy under `label` to the results file.
            adv_x = attack.generate(x, **attack_params)
            adv_x = tf.stop_gradient(adv_x)  # do not backprop through the attack
            preds_adv = loaded_model(adv_x)
            adv_acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                                 args={"batch_size": 128})
            text_file.write('Test accuracy on {0} adversarial test examples: {1}\n'.format(label, str(adv_acc)))

        # Craft adversarial examples depending on the input parameters.
        if fgsm:
            _eval_adv(FastGradientMethod(wrap, sess=sess), {'eps': 0.3}, 'fgsm')
        if jsma:
            _eval_adv(SaliencyMapMethod(wrap, sess=sess),
                      {'theta': 2., 'gamma': 0.145,
                       'clip_min': 0., 'clip_max': 1.,
                       'y_target': None},
                      'jsma')
        if df:
            _eval_adv(DeepFool(wrap, sess=sess),
                      {'nb_candidate': 10, 'max_iter': 50}, 'df')
        if bim:
            # NOTE(review): "bim" uses ProjectedGradientDescent with default
            # parameters (random init), i.e. PGD rather than strict BIM —
            # preserved as-is to keep results comparable.
            _eval_adv(ProjectedGradientDescent(wrap, sess=sess),
                      {'eps': 0.3}, 'bim')

    print('Accuracy results outputted to mnist_results.txt')
    # Close TF session
    sess.close()
def gen_adv_data(model, x, y, method, dataset, batch=2048):
    """Generate adversarial examples for the whole array `x` in batches.

    :param model: Keras model to attack (wrapped with KerasModelWrapper).
    :param x: numpy array of inputs.
    :param y: numpy array of labels, sliced per batch into the attack params.
    :param method: attack name, 'CW' or 'PGD' (case-insensitive).
    :param dataset: 'cifar', 'mnist' or 'svhn' — selects PGD hyper-parameters.
    :param batch: number of examples attacked per call to generate_np.
    :return: numpy array of adversarial examples, same shape as `x`.

    Exits the process (status 1) on an unsupported attack or dataset; the
    original code raised a NameError for an unknown dataset instead.
    """
    sess = K.get_session()
    model_wrap = KerasModelWrapper(model)
    method = method.upper()

    def _attack_in_batches(attack, params, shrink_last_batch=False):
        # Run `attack.generate_np` over `x` in chunks of `batch`, writing
        # results into a preallocated array. `shrink_last_batch` updates
        # params['batch_size'] for the final partial batch (needed by CW,
        # which builds a graph sized to its batch_size parameter).
        data_num = x.shape[0]
        adv_x_all = np.zeros_like(x)
        begin, end = 0, batch
        while end < data_num:
            start_time = time.time()
            params['y'] = y[begin:end]
            adv_x_all[begin:end] = attack.generate_np(x[begin:end], **params)
            print(begin, end, "done")
            begin += batch
            end += batch
            end_time = time.time()
            print("time: ", end_time - start_time)
        # Process the remaining (partial) batch, if any.
        if begin < data_num:
            start_time = time.time()
            params['y'] = y[begin:]
            if shrink_last_batch:
                params['batch_size'] = data_num - begin
            adv_x_all[begin:] = attack.generate_np(x[begin:], **params)
            print(begin, data_num, "done")
            end_time = time.time()
            print("time: ", end_time - start_time)
        return adv_x_all

    if method == 'CW':
        params = {
            'binary_search_steps': 1,
            'y': y,
            'learning_rate': .1,
            'max_iterations': 50,
            'initial_const': 10,
            'batch_size': batch,
            # 'clip_min': -0.5,
            # 'clip_max': 0.5
        }
        attack = CarliniWagnerL2(model_wrap, sess=sess)
        return _attack_in_batches(attack, params, shrink_last_batch=True)
    elif method == 'PGD':
        if dataset == 'cifar':
            params = {
                'eps': 16. / 255.,
                'eps_iter': 2. / 255.,
                'nb_iter': 30.,
                # 'clip_min': -0.5,
                # 'clip_max': 0.5,
                'y': y
            }
        elif dataset == 'mnist':
            params = {
                'eps': .3,
                'eps_iter': .03,
                'nb_iter': 20.,
                'clip_min': -0.5,
                'clip_max': 0.5,
                'y': y
            }
        elif dataset == 'svhn':
            params = {
                'eps': 8. / 255.,
                'eps_iter': 0.01,
                'nb_iter': 30.,
                'clip_min': -0.5,
                'clip_max': 0.5,
                'y': y
            }
        else:
            # FIX: previously an unknown dataset fell through with `attack`
            # and `params` unbound, crashing with a NameError.
            print('Unsupported dataset')
            sys.exit(1)
        attack = ProjectedGradientDescent(model_wrap, sess=sess)
        return _attack_in_batches(attack, params)
    else:
        print('Unsupported attack')
        sys.exit(1)
def train(ARGS):
    """Build the training graph and train a model with optional adversarial
    training (AT), adversarial/clean logit pairing (ALP/CLP), logit squeezing
    (LSQ) and weight decay, selected via boolean ARGS flags and weighted by
    the corresponding ARGS.*_lambda coefficients.

    Relies on names from the enclosing module scope: input_shape, n_classes,
    trainloader, testloader, adv_testloader, Model, get_at_loss, get_alp_loss,
    get_clp_loss, get_lsq_loss, model_train, variable_summaries.
    """

    # Define helper function for evaluating on test data during training.
    # NOTE(review): shadows the builtin `eval`; kept because model_train is
    # called with evaluate=eval below.
    def eval(epoch):
        from train_utils import clean_eval
        test_accuracy, test_loss, _ = clean_eval(sess, x, y, is_training,
                                                 testloader, n_classes,
                                                 logits, preds)
        # Write tensorboard summary: clean test accuracy
        acc_summary = tf.Summary()
        acc_summary.value.add(tag='Evaluation/accuracy/test',
                              simple_value=test_accuracy)
        writer_test.add_summary(acc_summary, epoch)
        # Write tensorboard summary: clean test error (1 - accuracy)
        err_summary = tf.Summary()
        err_summary.value.add(tag='Evaluation/error/test',
                              simple_value=1.0 - test_accuracy)
        writer_test.add_summary(err_summary, epoch)
        # Write tensorboard summary: clean test loss
        loss_summary = tf.Summary()
        loss_summary.value.add(tag='Evaluation/loss/test',
                               simple_value=test_loss)
        writer_test.add_summary(loss_summary, epoch)

    # Define helper function for evaluating on adversarial test data during
    # training (accuracy/error/loss under the PGD adversary defined below).
    def adv_eval(epoch):
        from train_utils import adversarial_eval
        adv_accuracy, adv_loss = adversarial_eval(sess, x, y, is_training,
                                                  adv_testloader, n_classes,
                                                  preds, adv_preds,
                                                  eval_all=True)
        # Write tensorboard summary: adversarial test accuracy
        acc_summary = tf.Summary()
        acc_summary.value.add(tag='Evaluation/adversarial-accuracy/test',
                              simple_value=adv_accuracy)
        writer_test.add_summary(acc_summary, epoch)
        # Write tensorboard summary: adversarial test error
        err_summary = tf.Summary()
        err_summary.value.add(tag='Evaluation/adversarial-error/test',
                              simple_value=1.0 - adv_accuracy)
        writer_test.add_summary(err_summary, epoch)
        # Write tensorboard summary: adversarial test loss
        loss_summary = tf.Summary()
        loss_summary.value.add(tag='Evaluation/adversarial-loss/test',
                               simple_value=adv_loss)
        writer_test.add_summary(loss_summary, epoch)

    # Define computational graph
    with tf.Graph().as_default() as g:
        # Define placeholders
        with tf.device('/gpu:0'):
            with tf.name_scope('Placeholders'):
                x = tf.placeholder(dtype=tf.float32, shape=input_shape,
                                   name='inputs')
                # x_pair1/x_pair2 feed the clean logit pairing loss.
                x_pair1 = tf.placeholder(dtype=tf.float32, shape=input_shape,
                                         name='x-pair1')
                x_pair2 = tf.placeholder(dtype=tf.float32, shape=input_shape,
                                         name='x-pair2')
                y = tf.placeholder(dtype=tf.float32, shape=(None, n_classes),
                                   name='labels')
                is_training = tf.placeholder_with_default(True, shape=(),
                                                          name='is-training')

        # Define TF session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(graph=g, config=config)

        # Define model
        with tf.name_scope('Model'):
            with tf.device('/gpu:0'):
                model = Model(nb_classes=n_classes, input_shape=input_shape,
                              is_training=is_training)

                # Define forward-pass
                with tf.name_scope('Logits'):
                    logits = model.get_logits(x)
                with tf.name_scope('Probs'):
                    preds = tf.nn.softmax(logits)

                with tf.name_scope('Accuracy'):
                    ground_truth = tf.argmax(y, axis=1)
                    predicted_label = tf.argmax(preds, axis=1)
                    correct_prediction = tf.equal(predicted_label, ground_truth)
                    acc = tf.reduce_mean(tf.to_float(correct_prediction),
                                         name='accuracy')
                    tf.add_to_collection('accuracies', acc)
                    err = tf.identity(1.0 - acc, name='error')
                    tf.add_to_collection('accuracies', err)

                # Define losses — each term defaults to 0.0 and is only built
                # when its ARGS flag is set, so total_loss below stays valid.
                with tf.name_scope('Losses'):
                    ce_loss, wd_loss, clp_loss, lsq_loss, at_loss, alp_loss = \
                        0.0, 0.0, 0.0, 0.0, 0.0, 0.0
                    adv_logits = None
                    if ARGS.ct:
                        with tf.name_scope('Cross-Entropy-Loss'):
                            ce_loss = tf.reduce_mean(
                                tf.nn.softmax_cross_entropy_with_logits(
                                    logits=logits, labels=y),
                                name='cross-entropy-loss')
                            tf.add_to_collection('losses', ce_loss)
                    if ARGS.at:
                        # Adversarial training loss; also yields the
                        # adversarial logits reused by ALP below.
                        with tf.name_scope('Adversarial-Cross-Entropy-Loss'):
                            at_loss, adv_logits = get_at_loss(
                                sess, x, y, model, ARGS.eps, ARGS.eps_iter,
                                ARGS.nb_iter)
                            at_loss = tf.identity(at_loss, name='at-loss')
                            tf.add_to_collection('losses', at_loss)
                    with tf.name_scope('Regularizers'):
                        if ARGS.wd:
                            with tf.name_scope('Weight-Decay'):
                                for var in tf.trainable_variables():
                                    if 'beta' in var.op.name:
                                        # Do not regularize bias of batch
                                        # normalization
                                        continue
                                    # print('regularizing: ', var.op.name)
                                    wd_loss += tf.nn.l2_loss(var)
                                reg_loss = tf.identity(wd_loss, name='wd-loss')
                                tf.add_to_collection('losses', reg_loss)
                        if ARGS.alp:
                            with tf.name_scope('Adversarial-Logit-Pairing'):
                                alp_loss = get_alp_loss(
                                    sess, x, y, logits, adv_logits, model,
                                    ARGS.eps, ARGS.eps_iter, ARGS.nb_iter)
                                alp_loss = tf.identity(alp_loss,
                                                       name='alp-loss')
                                tf.add_to_collection('losses', alp_loss)
                        if ARGS.clp:
                            with tf.name_scope('Clean-Logit-Pairing'):
                                clp_loss = get_clp_loss(
                                    x_pair1, x_pair2, model)
                                clp_loss = tf.identity(clp_loss,
                                                       name='clp-loss')
                                tf.add_to_collection('losses', clp_loss)
                        if ARGS.lsq:
                            with tf.name_scope('Logit-Squeezing'):
                                lsq_loss = get_lsq_loss(x, model)
                                lsq_loss = tf.identity(lsq_loss,
                                                       name='lsq-loss')
                                tf.add_to_collection('losses', lsq_loss)
                    with tf.name_scope('Total-Loss'):
                        # Define objective function: weighted sum of all
                        # enabled loss terms.
                        total_loss = (ARGS.ct_lambda * ce_loss) + (
                            ARGS.at_lambda * at_loss) + (
                                ARGS.wd_lambda * wd_loss) + (
                                    ARGS.clp_lambda * clp_loss) + (
                                        ARGS.lsq_lambda * lsq_loss) + (
                                            ARGS.alp_lambda * alp_loss)
                        total_loss = tf.identity(total_loss, name='total-loss')
                        tf.add_to_collection('losses', total_loss)

                # Define PGD adversary used for adversarial evaluation
                # (eps values are given in pixel units and scaled to [0, 1]).
                with tf.name_scope('PGD-Attacker'):
                    pgd_params = {
                        'ord': np.inf,
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'eps_iter': ARGS.eps_iter / 255,
                        'nb_iter': ARGS.nb_iter,
                        'rand_init': True,
                        'rand_minmax': ARGS.eps / 255,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'sanity_checks': True
                    }
                    pgd = ProjectedGradientDescent(model, sess=sess)
                    adv_x = pgd.generate(x, **pgd_params)

                    with tf.name_scope('Logits'):
                        adv_logits = model.get_logits(adv_x)

                    with tf.name_scope('Probs'):
                        adv_preds = tf.nn.softmax(adv_logits)

        # Define optimizer
        with tf.device('/gpu:0'):
            with tf.name_scope('Optimizer'):
                # Define global step variable
                global_step = tf.get_variable(
                    name='global_step',
                    shape=[],  # scalar
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer(),
                    trainable=False)
                optimizer = tf.train.AdamOptimizer(learning_rate=ARGS.lr,
                                                   beta1=0.9,
                                                   beta2=0.999,
                                                   epsilon=1e-6,
                                                   use_locking=False,
                                                   name='Adam')
                trainable_vars = tf.trainable_variables()
                # this collection stores the moving_mean and moving_variance
                # ops for batch normalization
                update_bn_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                # Run BN statistic updates before every optimization step.
                with tf.control_dependencies(update_bn_ops):
                    grads_and_vars = optimizer.compute_gradients(
                        total_loss, trainable_vars)
                    train_step = optimizer.apply_gradients(
                        grads_and_vars, global_step=global_step)

        # Add Tensorboard summaries
        with tf.device('/gpu:0'):
            # Create file writers
            writer_train = tf.summary.FileWriter(ARGS.log_dir + '/train',
                                                 graph=g)
            writer_test = tf.summary.FileWriter(ARGS.log_dir + '/test')

            # Add summary for input images
            with tf.name_scope('Image-Summaries'):
                # Create image summary ops
                tf.summary.image('input', x, max_outputs=2,
                                 collections=['training'])

            # Add summaries for the training losses
            losses = tf.get_collection('losses')
            for entry in losses:
                tf.summary.scalar(entry.name, entry, collections=['training'])

            # Add summaries for the training accuracies
            accs = tf.get_collection('accuracies')
            for entry in accs:
                tf.summary.scalar(entry.name, entry, collections=['training'])

            # Add summaries for all trainable vars
            for var in trainable_vars:
                tf.summary.histogram(var.op.name, var,
                                     collections=['training'])
                var_norm = tf.norm(var, ord='euclidean')
                tf.summary.scalar(var.op.name + '/l2norm', var_norm,
                                  collections=['training'])

            # Add summaries for variable gradients
            for grad, var in grads_and_vars:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients', grad,
                                         collections=['training'])
                    grad_norm = tf.norm(grad, ord='euclidean')
                    tf.summary.scalar(var.op.name + '/gradients/l2norm',
                                      grad_norm, collections=['training'])

            # Add summaries for the logits and model predictions
            with tf.name_scope('Logits-Summaries'):
                variable_summaries(tf.identity(logits, name='logits'),
                                   name='logits',
                                   collections=['training', 'test'],
                                   histo=True)
            with tf.name_scope('Predictions-Summaries'):
                variable_summaries(tf.identity(preds, name='predictions'),
                                   name='predictions',
                                   collections=['training', 'test'],
                                   histo=True)

        # Initialize all variables
        with sess.as_default():
            tf.global_variables_initializer().run()

        # Collect training params
        train_params = {
            'epochs': ARGS.epochs,
            'eval_step': ARGS.eval_step,
            'adv_eval_step': ARGS.adv_eval_step,
            'n_classes': n_classes,
            'clp': ARGS.clp
        }

        # Start training loop
        model_train(sess, x, y, x_pair1, x_pair2, is_training, trainloader,
                    train_step, args=train_params, evaluate=eval,
                    adv_evaluate=adv_eval, writer_train=writer_train)

        # Save the trained model
        if ARGS.save:
            save_path = os.path.join(ARGS.save_dir, ARGS.filename)
            saver = tf.train.Saver(var_list=tf.global_variables())
            saver.save(sess, save_path)
            print("Saved model at {:s}".format(str(ARGS.save_dir)))
def attack(self, path, session):
    """Run a per-class PGD attack on the context set of few-shot tasks.

    Loads model weights from `path`, then for each test item and task:
    attacks one context image per class through a cleverhans PGD wrapper,
    logs the per-class accuracy drop, and finally evaluates the model with
    ALL attacked context images swapped in.

    :param path: filesystem path to a saved state_dict for self.model.
    :param session: tf.Session used by the cleverhans attack graph.
    """
    print_and_log(self.logfile, "")  # add a blank line
    print_and_log(self.logfile, 'Attacking model {0:}: '.format(path))
    self.model = self.init_model()
    self.model.load_state_dict(torch.load(path))
    pgd_parameters = self.pgd_params()

    # These are rebound per task/class below; model_wrapper closes over them.
    class_index = 0
    context_images, target_images, context_labels, target_labels, \
        context_images_np = None, None, None, None, None

    def model_wrapper(context_point_x):
        # Splice the (adversarial) context point back into the context set at
        # `class_index`, then run the few-shot model on the target images.
        context_images_attack = torch.cat([
            context_images[0:class_index], context_point_x,
            context_images[class_index + 1:]
        ], dim=0)
        target_logits = self.model(context_images_attack, context_labels,
                                   target_images)
        return target_logits[0]

    tf_model_conv = convert_pytorch_model_to_tf(model_wrapper,
                                                out_dims=self.args.way)
    tf_model = cleverhans.model.CallableModelWrapper(tf_model_conv, 'logits')
    pgd = ProjectedGradientDescent(tf_model, sess=session,
                                   dtypestr='float32')

    for item in self.test_set:
        for t in range(self.args.attack_tasks):
            task_dict = self.dataset.get_test_task(item, session)
            context_images, target_images, context_labels, target_labels, \
                context_images_np = self.prepare_task(task_dict, shuffle=False)

            # Detach shares storage with the original tensor, which isn't
            # what we want — clone instead.
            context_images_attack_all = context_images.clone()

            for c in torch.unique(context_labels):
                # Adversarial input context image: attack the first context
                # example of class c.
                class_index = extract_class_indices(context_labels,
                                                    c)[0].item()
                context_x = np.expand_dims(context_images_np[class_index], 0)

                # Input to the model wrapper is automatically converted to a
                # Torch tensor for us.
                x = tf.placeholder(tf.float32, shape=context_x.shape)
                adv_x_op = pgd.generate(x, **pgd_parameters)
                preds_adv_op = tf_model.get_logits(adv_x_op)

                feed_dict = {x: context_x}
                adv_x, preds_adv = session.run((adv_x_op, preds_adv_op),
                                               feed_dict=feed_dict)

                context_images_attack_all[class_index] = torch.from_numpy(
                    adv_x)
                # NOTE(review): these overwrite the same two files each
                # iteration — only the last attacked image survives on disk.
                save_image(adv_x,
                           os.path.join(self.checkpoint_dir, 'adv.png'))
                save_image(context_x,
                           os.path.join(self.checkpoint_dir, 'in.png'))

                # Accuracy with this single context point attacked.
                acc_after = torch.mean(
                    torch.eq(
                        target_labels,
                        torch.argmax(torch.from_numpy(preds_adv).to(
                            self.device), dim=-1)).float()).item()

                with torch.no_grad():
                    logits = self.model(context_images, context_labels,
                                        target_images)
                    acc_before = torch.mean(
                        torch.eq(target_labels,
                                 torch.argmax(logits,
                                              dim=-1)).float()).item()
                    del logits

                diff = acc_before - acc_after
                print_and_log(
                    self.logfile,
                    "Task = {}, Class = {} \t Diff = {}".format(t, c, diff))
                # FIX: previously logged acc_after under the "before" label.
                print_and_log(self.logfile,
                              "Accuracy before {}".format(acc_before))

            # Evaluate with every class's context image attacked at once.
            logits = self.model(context_images_attack_all, context_labels,
                                target_images)
            acc_all_attack = torch.mean(
                torch.eq(target_labels,
                         torch.argmax(logits, dim=-1)).float()).item()
            print_and_log(self.logfile,
                          "Accuracy after {}".format(acc_all_attack))
def setUp(self):
    """Create a fresh PGD attack instance against the fixture's model/session."""
    super(TestProjectedGradientDescent, self).setUp()
    # The attack under test; self.model and self.sess come from the base
    # test fixture's setUp.
    self.attack = ProjectedGradientDescent(self.model, sess=self.sess)