def main(_): """Run the sample attack""" # Images for inception classifier are normalized to be in [-1, 1] interval, # eps is a difference between pixels so it should be in [0, 2] interval. # Renormalizing epsilon from [0, 255] to [0, 2]. eps = 2.0 * FLAGS.max_epsilon / 255.0 batch_shape = [FLAGS.batch_size, FLAGS.image_height, FLAGS.image_width, 3] nb_classes = 1001 tf.logging.set_verbosity(tf.logging.INFO) with tf.Graph().as_default(): # Prepare graph x_input = tf.placeholder(tf.float32, shape=batch_shape) model = InceptionModel(nb_classes) fgsm = FastGradientMethod(model) x_adv = fgsm.generate(x_input, eps=eps, clip_min=-1., clip_max=1.) # Run computation saver = tf.train.Saver(slim.get_model_variables()) session_creator = tf.train.ChiefSessionCreator( scaffold=tf.train.Scaffold(saver=saver), checkpoint_filename_with_path=FLAGS.checkpoint_path, master=FLAGS.master) with tf.train.MonitoredSession( session_creator=session_creator) as sess: for filenames, images in load_images(FLAGS.input_dir, batch_shape): adv_images = sess.run(x_adv, feed_dict={x_input: images}) save_images(adv_images, filenames, FLAGS.output_dir)
def main(argv): checkpoint = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) if checkpoint is None: raise ValueError("Couldn't find latest checkpoint in " + FLAGS.checkpoint_dir) train_start = 0 train_end = 60000 test_start = 0 test_end = 10000 X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) assert Y_train.shape[1] == 10 # NOTE: for compatibility with Madry Lab downloadable checkpoints, # we cannot enclose this in a scope or do anything else that would # change the automatic naming of the variables. model = MadryMNIST() x_input = tf.placeholder(tf.float32, shape=[None, 784]) x_image = tf.placeholder(tf.float32, shape=[None, 28, 28, 1]) y = tf.placeholder(tf.float32, shape=[None, 10]) if FLAGS.attack_type == 'fgsm': fgsm = FastGradientMethod(model) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x = fgsm.generate(x_image, **fgsm_params) elif FLAGS.attack_type == 'bim': bim = BasicIterativeMethod(model) bim_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1., 'nb_iter': 50, 'eps_iter': .01} adv_x = bim.generate(x_image, **bim_params) else: raise ValueError(FLAGS.attack_type) preds_adv = model.get_probs(adv_x) saver = tf.train.Saver() with tf.Session() as sess: # Restore the checkpoint saver.restore(sess, checkpoint) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': FLAGS.batch_size} t1 = time.time() acc = model_eval( sess, x_image, y, preds_adv, X_test, Y_test, args=eval_par) t2 = time.time() print("Took", t2 - t1, "seconds") print('Test accuracy on adversarial examples: %0.4f\n' % acc)
def get_logits_over_interval(sess, model, x_data, fgsm_params, min_epsilon=-10., max_epsilon=10., num_points=21): """Get logits when the input is perturbed in an interval in adv direction. Args: sess: Tf session model: Model for which we wish to get logits. x_data: Numpy array corresponding to single data. point of shape [height, width, channels]. fgsm_params: Parameters for generating adversarial examples. min_epsilon: Minimum value of epsilon over the interval. max_epsilon: Maximum value of epsilon over the interval. num_points: Number of points used to interpolate. Returns: Numpy array containing logits. Raises: ValueError if min_epsilon is larger than max_epsilon. """ # Get the height, width and number of channels height = x_data.shape[0] width = x_data.shape[1] channels = x_data.shape[2] x_data = np.expand_dims(x_data, axis=0) import tensorflow as tf from src.FGSM.cleverhans.cleverhans.attacks import FastGradientMethod # Define the data placeholder x = tf.placeholder(dtype=tf.float32, shape=[1, height, width, channels], name='x') # Define adv_x fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) if min_epsilon > max_epsilon: raise ValueError('Minimum epsilon is less than maximum epsilon') eta = tf.nn.l2_normalize(adv_x - x, dim=0) epsilon = tf.reshape( tf.lin_space(float(min_epsilon), float(max_epsilon), num_points), (num_points, 1, 1, 1)) lin_batch = x + epsilon * eta logits = model.get_logits(lin_batch) with sess.as_default(): log_prob_adv_array = sess.run(logits, feed_dict={x: x_data}) return log_prob_adv_array
def test_feature_pairing(self): sess = tf.Session() fgsm = FastGradientMethod(self.model, sess=sess) def attack(x): return fgsm.generate(x) loss = FeaturePairing(self.model, weight=0.1, attack=attack) l = loss.fprop(self.x, self.y) vl1 = sess.run(l, feed_dict={self.x: self.vx, self.y: self.vy}) vl2 = sess.run(l, feed_dict={self.x: self.vx, self.y: self.vy}) self.assertClose(vl1, sum([4.296023369, 2.963884830]) / 2., atol=1e-6) self.assertClose(vl2, sum([4.296023369, 2.963884830]) / 2., atol=1e-6) loss = FeaturePairing(self.model, weight=10., attack=attack) l = loss.fprop(self.x, self.y) vl1 = sess.run(l, feed_dict={self.x: self.vx, self.y: self.vy}) vl2 = sess.run(l, feed_dict={self.x: self.vx, self.y: self.vy}) self.assertClose(vl1, sum([4.333082676, 3.00094414]) / 2., atol=1e-6) self.assertClose(vl2, sum([4.333082676, 3.00094414]) / 2., atol=1e-6)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR, filename=FILENAME, load_model=LOAD_MODEL, testing=False, label_smoothing=0.1): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param train_dir: Directory storing the saved model :param filename: Filename to save model under :param load_model: True for load, False for not load :param testing: if true, test error is calculated :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ tf.keras.backend.set_learning_phase(0) # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) if keras.backend.image_data_format() != 'channels_last': raise NotImplementedError( "this tutorial requires keras to be configured to channels_last format" ) # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define TF model graph model = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) preds = model(x) print("Defined TensorFlow model graph.") def evaluate(): # Evaluate the accuracy of the MNIST model on legitimate test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params) report.clean_train_clean_eval = acc # assert X_test.shape[0] == test_end - test_start, X_test.shape print('Test accuracy on legitimate examples: %0.4f' % acc) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate, 'train_dir': train_dir, 'filename': filename } rng = np.random.RandomState([2017, 8, 30]) if not os.path.exists(train_dir): os.mkdir(train_dir) ckpt = tf.train.get_checkpoint_state(train_dir) print(train_dir, ckpt) ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path wrap = KerasModelWrapper(model) if load_model and ckpt_path: saver = tf.train.Saver() print(ckpt_path) saver.restore(sess, ckpt_path) print("Model loaded from: {}".format(ckpt_path)) evaluate() else: print("Model was not loaded, training from scratch.") loss = CrossEntropy(wrap, smoothing=label_smoothing) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng) # Calculate training error if testing: eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params) report.train_clean_train_clean_eval = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x = fgsm.generate(x, **fgsm_params) # Consider the attack to be constant adv_x = tf.stop_gradient(adv_x) preds_adv = model(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par) print('Test accuracy on adversarial examples: %0.4f\n' % acc) report.clean_train_adv_eval = acc # Calculating train error if testing: eval_par = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_adv, x_train, y_train, args=eval_par) report.train_clean_train_adv_eval = acc print("Repeating the process, using adversarial training") # Redefine TF model graph model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) wrap_2 = KerasModelWrapper(model_2) preds_2 = model_2(x) fgsm2 = FastGradientMethod(wrap_2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) preds_2_adv = model_2(attack(x)) loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack) def evaluate_2(): # Accuracy of adversarially trained model on legitimate test inputs eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, x_test, y_test, args=eval_params) print('Test accuracy on legitimate examples: %0.4f' % accuracy) report.adv_train_clean_eval = accuracy # Accuracy of the adversarially trained model on adversarial examples accuracy = model_eval(sess, x, y, preds_2_adv, x_test, y_test, args=eval_params) print('Test accuracy on adversarial examples: %0.4f' % accuracy) report.adv_train_adv_eval = accuracy # Perform and evaluate adversarial training train(sess, loss_2, x_train, y_train, evaluate=evaluate_2, args=train_params, rng=rng) # Calculate training errors if testing: eval_params = {'batch_size': batch_size} accuracy = model_eval(sess, x, y, preds_2, x_train, y_train, args=eval_params) report.train_adv_train_clean_eval = accuracy accuracy = model_eval(sess, x, y, preds_2_adv, x_train, y_train, args=eval_params) report.train_adv_train_adv_eval = accuracy return report
def mnist_tutorial(nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, train_end=-1, test_end=-1, learning_rate=LEARNING_RATE): """ MNIST cleverhans tutorial :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :return: an AccuracyReport object """ # Train a pytorch MNIST model torch_model = PytorchMnistModel() if torch.cuda.is_available(): torch_model = torch_model.cuda() report = AccuracyReport() train_loader = torch.utils.data.DataLoader(datasets.MNIST( 'data', train=True, download=True, transform=transforms.ToTensor()), batch_size=batch_size, shuffle=True) test_loader = torch.utils.data.DataLoader(datasets.MNIST( 'data', train=False, transform=transforms.ToTensor()), batch_size=batch_size) # Truncate the datasets so that our test run more quickly train_loader.dataset.train_data = train_loader.dataset.train_data[: train_end] test_loader.dataset.test_data = test_loader.dataset.test_data[:test_end] # Train our model optimizer = optim.Adam(torch_model.parameters(), lr=learning_rate) train_loss = [] total = 0 correct = 0 step = 0 for _epoch in range(nb_epochs): for xs, ys in train_loader: xs, ys = Variable(xs), Variable(ys) if torch.cuda.is_available(): xs, ys = xs.cuda(), ys.cuda() optimizer.zero_grad() preds = torch_model(xs) loss = F.nll_loss(preds, ys) loss.backward() # calc gradients train_loss.append(loss.data.item()) optimizer.step() # update gradients preds_np = preds.cpu().detach().numpy() correct += (np.argmax(preds_np, axis=1) == ys.cpu().detach().numpy()).sum() total += train_loader.batch_size step += 1 if total % 1000 == 0: acc = float(correct) / total print('[%s] Training accuracy: %.2f%%' % (step, acc * 100)) total = 0 correct = 0 # Evaluate on clean data total = 0 correct = 0 for xs, ys in test_loader: xs, ys = Variable(xs), Variable(ys) if torch.cuda.is_available(): xs, ys = xs.cuda(), ys.cuda() preds = torch_model(xs) preds_np = preds.cpu().detach().numpy() correct += (np.argmax(preds_np, axis=1) == ys.cpu().detach().numpy()).sum() total += len(xs) acc = float(correct) / total report.clean_train_clean_eval = acc print('[%s] Clean accuracy: %.2f%%' % (step, acc * 100)) # We use tf for evaluation on adversarial data sess = tf.Session() x_op = tf.placeholder(tf.float32, shape=( None, 1, 28, 28, )) # Convert pytorch model to a tf_model and wrap it in cleverhans tf_model_fn = convert_pytorch_model_to_tf(torch_model) cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits') # Create an FGSM attack fgsm_op = FastGradientMethod(cleverhans_model, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_x_op = fgsm_op.generate(x_op, **fgsm_params) adv_preds_op = tf_model_fn(adv_x_op) # Run an evaluation of our model against fgsm total = 0 correct = 0 for xs, ys in test_loader: adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs}) correct += (np.argmax(adv_preds, axis=1) == ys.cpu().detach().numpy()).sum() total += test_loader.batch_size acc = float(correct) / total print('Adv accuracy: {:.3f}'.format(acc * 100)) report.clean_train_adv_eval = acc return report
def evaluate_model(filepath, train_start=0, train_end=60000, test_start=0, test_end=10000, batch_size=128, testing=False, num_threads=None): """ Run evaluation on a saved model :param filepath: path to model to evaluate :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param batch_size: size of evaluation batches """ # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.INFO) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) eval_params = {'batch_size': batch_size} fgsm_params = { 'eps': 0.3, 'clip_min': 0., 'clip_max': 1. } def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) with sess.as_default(): model = load(filepath) assert len(model.get_params()) > 0 # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_logits(adv_x) preds = model.get_logits(x) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds, x_test, y_test, 'train_clean_train_clean_eval', False) do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, testing=False, label_smoothing=0.1): """ MNIST CleverHans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param testing: if true, training error is calculated :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Force TensorFlow to use single thread to improve reproducibility config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) if keras.backend.image_data_format() != 'channels_last': raise NotImplementedError( "this tutorial requires keras to be configured to channels_last format" ) # Create TF session and set as Keras backend session sess = tf.Session(config=config) keras.backend.set_session(sess) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Obtain Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Label smoothing y_train -= label_smoothing * (y_train - 1. / nb_classes) # Define Keras model model = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) print("Defined Keras model.") # To be able to call the model in the custom loss, we need to call it once # before, see https://github.com/tensorflow/tensorflow/issues/23769 model(model.input) # Initialize the Fast Gradient Sign Method (FGSM) attack object wrap = KerasModelWrapper(model) fgsm = FastGradientMethod(wrap, sess=sess) fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} adv_acc_metric = get_adversarial_acc_metric(model, fgsm, fgsm_params) model.compile(optimizer=keras.optimizers.Adam(learning_rate), loss='categorical_crossentropy', metrics=['accuracy', adv_acc_metric]) # Train an MNIST model model.fit(x_train, y_train, batch_size=batch_size, epochs=nb_epochs, validation_data=(x_test, y_test), verbose=2) # Evaluate the accuracy on legitimate and adversarial test examples _, acc, adv_acc = model.evaluate(x_test, y_test, batch_size=batch_size, verbose=0) report.clean_train_clean_eval = acc report.clean_train_adv_eval = adv_acc print('Test accuracy on legitimate examples: %0.4f' % acc) print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc) # Calculate training error if testing: _, train_acc, train_adv_acc = model.evaluate(x_train, y_train, batch_size=batch_size, verbose=0) report.train_clean_train_clean_eval = train_acc report.train_clean_train_adv_eval = train_adv_acc print("Repeating the process, using adversarial training") # Redefine Keras model model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols, channels=nchannels, nb_filters=64, nb_classes=nb_classes) model_2(model_2.input) wrap_2 = KerasModelWrapper(model_2) fgsm_2 = FastGradientMethod(wrap_2, sess=sess) # Use a loss function based on legitimate and adversarial examples adv_loss_2 = get_adversarial_loss(model_2, fgsm_2, fgsm_params) adv_acc_metric_2 = get_adversarial_acc_metric(model_2, fgsm_2, fgsm_params) model_2.compile(optimizer=keras.optimizers.Adam(learning_rate), loss=adv_loss_2, metrics=['accuracy', adv_acc_metric_2]) # Train an MNIST model model_2.fit(x_train, y_train, batch_size=batch_size, epochs=nb_epochs, validation_data=(x_test, y_test), verbose=2) # Evaluate the accuracy on legitimate and adversarial test examples _, acc, adv_acc = model_2.evaluate(x_test, y_test, batch_size=batch_size, verbose=0) report.adv_train_clean_eval = acc report.adv_train_adv_eval = adv_acc print('Test accuracy on legitimate examples: %0.4f' % acc) print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc) # Calculate training error if testing: _, train_acc, train_adv_acc = model_2.evaluate(x_train, y_train, batch_size=batch_size, verbose=0) report.train_adv_train_clean_eval = train_acc report.train_adv_train_adv_eval = train_adv_acc return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ MNIST cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get MNIST test data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): """ Run the evaluation and print the results. """ acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = make_basic_picklable_cnn() # Tag the model so that when it is saved to disk, future scripts will # be able to tell what data it was trained on model.dataset_factory = mnist.get_factory() preds = model.get_logits(x) assert len(model.get_params()) > 0 loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): """ Run evaluation for the naively trained model on clean examples. """ do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) with sess.as_default(): save("clean_model.joblib", model) print("Now that the model has been saved, you can evaluate it in a" " separate process using `evaluate_pickled_model.py`. " "You should get exactly the same result for both clean and " "adversarial accuracy as you get within this program.") # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_logits(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval') print('Repeating the process, using adversarial training') # Create a new model and train it to be robust to FastGradientMethod model2 = make_basic_picklable_cnn() # Tag the model so that when it is saved to disk, future scripts will # be able to tell what data it was trained on model2.dataset_factory = mnist.get_factory() fgsm2 = FastGradientMethod(model2, sess=sess) def attack(x): """Return an adversarial example near clean example `x`""" return fgsm2.generate(x, **fgsm_params) loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack) preds2 = model2.get_logits(x) adv_x2 = attack(x) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate_adv(): """ Evaluate the adversarially trained model. """ # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True) # Perform and evaluate adversarial training train(sess, loss2, x_train, y_train, evaluate=evaluate_adv, args=train_params, rng=rng, var_list=model2.get_params()) with sess.as_default(): save("adv_model.joblib", model2) print( "Now that the model has been saved, you can evaluate it in a " "separate process using " "`python evaluate_pickled_model.py adv_model.joblib`. " "You should get exactly the same result for both clean and " "adversarial accuracy as you get within this program." " You can also move beyond the tutorials directory and run the " " real `compute_accuracy.py` script (make sure cleverhans/scripts " "is in your PATH) to see that this FGSM-trained " "model is actually not very robust---it's just a model that trains " " quickly so the tutorial does not take a long time") # Calculate training errors if testing: do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval') do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval') return report
def dknn_tutorial(): # Get MNIST data. mnist = MNIST() x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Use Image Parameters. img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] with tf.Session() as sess: with tf.variable_scope('dknn'): # Define input TF placeholder. x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Define a model. model = make_basic_picklable_cnn() preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=0.) # Define the test set accuracy evaluation. def evaluate(): acc = model_eval(sess, x, y, preds, x_test, y_test, args={'batch_size': FLAGS.batch_size}) print('Test accuracy on test examples: %0.4f' % acc) # Train the model train_params = { 'nb_epochs': FLAGS.nb_epochs, 'batch_size': FLAGS.batch_size, 'learning_rate': FLAGS.lr } train(sess, loss, x_train, y_train, evaluate=evaluate, args=train_params, var_list=model.get_params()) # Define callable that returns a dictionary of all activations for a dataset def get_activations(data): data_activations = {} for layer in layers: layer_sym = tf.layers.flatten(model.get_layer(x, layer)) data_activations[layer] = batch_eval( sess, [x], [layer_sym], [data], args={'batch_size': FLAGS.batch_size})[0] return data_activations # Use a holdout of the test set to simulate calibration data for the DkNN. train_data = x_train train_labels = np.argmax(y_train, axis=1) cali_data = x_test[:FLAGS.nb_cali] y_cali = y_test[:FLAGS.nb_cali] cali_labels = np.argmax(y_cali, axis=1) test_data = x_test[FLAGS.nb_cali:] y_test = y_test[FLAGS.nb_cali:] # Extract representations for the training and calibration data at each layer of interest to the DkNN. layers = ['ReLU1', 'ReLU3', 'ReLU5', 'logits'] # Wrap the model into a DkNNModel dknn = DkNNModel(FLAGS.neighbors, layers, get_activations, train_data, train_labels, nb_classes, scope='dknn') dknn.calibrate(cali_data, cali_labels) # Generate adversarial examples fgsm = FastGradientMethod(model, sess=sess) attack_params = {'eps': .25, 'clip_min': 0., 'clip_max': 1.} adv = sess.run(fgsm.generate(x, **attack_params), feed_dict={x: test_data}) # Test the DkNN on clean test data and FGSM test data for data_in, fname in zip([test_data, adv], ['test', 'adv']): dknn_preds = dknn.fprop_np(data_in) print(dknn_preds.shape) print( np.mean( np.argmax(dknn_preds, axis=1) == np.argmax(y_test, axis=1))) plot_reliability_diagram(dknn_preds, np.argmax(y_test, axis=1), '/tmp/dknn_' + fname + '.pdf') return True
def setUp(self): super(TestFastGradientMethod, self).setUp() self.attack = FastGradientMethod(self.model, sess=self.sess)
def test_generate_respects_dtype(self): self.attack = FastGradientMethod(self.model, sess=self.sess, dtypestr='float64') x = tf.placeholder(dtype=tf.float64, shape=(100, 2)) x_adv = self.attack.generate(x) self.assertEqual(x_adv.dtype, tf.float64)
class CommonAttackProperties(CleverHansTest): """ Abstract base class shared by the tessts for many attacks that want to check the same properties. """ def setUp(self): # Inheritance doesn't really work with tests. # nosetests always wants to run this class because it is a # CleverHansTest subclass, but this class is meant to just # be abstract. # Before this class was the tests for FastGradientMethod but # people kept inheriting from it for other attacks so it was # impossible to write tests specifically for FastGradientMethod. # pylint: disable=unidiomatic-typecheck if type(self) is CommonAttackProperties: raise SkipTest() super(CommonAttackProperties, self).setUp() self.sess = tf.Session() self.model = SimpleModel() def generate_adversarial_examples_np(self, ord, eps, **kwargs): x_val = np.random.rand(100, 2) x_val = np.array(x_val, dtype=np.float32) x_adv = self.attack.generate_np(x_val, eps=eps, ord=ord, clip_min=-5, clip_max=5, **kwargs) if ord == np.inf: delta = np.max(np.abs(x_adv - x_val), axis=1) elif ord == 1: delta = np.sum(np.abs(x_adv - x_val), axis=1) elif ord == 2: delta = np.sum(np.square(x_adv - x_val), axis=1) ** .5 return x_val, x_adv, delta def help_generate_np_gives_adversarial_example(self, ord, eps=.5, **kwargs): x_val, x_adv, delta = self.generate_adversarial_examples_np(ord, eps, **kwargs) self.assertLess(np.max(np.abs(delta-eps)), 1e-3) orig_labs = np.argmax(self.sess.run(self.model.get_logits(x_val)), axis=1) new_labs = np.argmax(self.sess.run(self.model.get_logits(x_adv)), axis=1) self.assertLess(np.max(np.mean(orig_labs == new_labs)), .5) def test_invalid_input(self): x_val = -np.ones((2, 2), dtype='float32') with self.assertRaises(tf.errors.InvalidArgumentError) as context: self.attack.generate_np(x_val, eps=1., clip_min=0., clip_max=1.) self.assertTrue(context.exception) def test_generate_np_gives_adversarial_example_linfinity(self): self.help_generate_np_gives_adversarial_example(np.infty) def test_generate_np_gives_adversarial_example_l1(self): self.help_generate_np_gives_adversarial_example(1) def test_generate_np_gives_adversarial_example_l2(self): self.help_generate_np_gives_adversarial_example(2) def test_generate_respects_dtype(self): self.attack = FastGradientMethod(self.model, sess=self.sess, dtypestr='float64') x = tf.placeholder(dtype=tf.float64, shape=(100, 2)) x_adv = self.attack.generate(x) self.assertEqual(x_adv.dtype, tf.float64) def test_targeted_generate_np_gives_adversarial_example(self): random_labs = np.random.random_integers(0, 1, 100) random_labs_one_hot = np.zeros((100, 2)) random_labs_one_hot[np.arange(100), random_labs] = 1 try: _, x_adv, delta = self.generate_adversarial_examples_np( eps=.5, ord=np.inf, y_target=random_labs_one_hot) except NotImplementedError: raise SkipTest() self.assertLessEqual(np.max(delta), 0.5001) new_labs = np.argmax(self.sess.run(self.model.get_logits(x_adv)), axis=1) self.assertTrue(np.mean(random_labs == new_labs) > 0.7) def test_generate_np_can_be_called_with_different_eps(self): x_val = np.random.rand(100, 2) x_val = np.array(x_val, dtype=np.float32) for eps in [0.1, 0.2, 0.3, 0.4]: x_adv = self.attack.generate_np(x_val, eps=eps, ord=np.inf, clip_min=-5.0, clip_max=5.0) delta = np.max(np.abs(x_adv - x_val), axis=1) self.assertLessEqual(np.max(delta), eps+1e-4) def test_generate_can_be_called_with_different_eps(self): # It is crtical that this test uses generate and not generate_np. # All the other tests use generate_np. Even though generate_np calls # generate, it does so in a very standardized way, e.g. with eps # always converted to a tensorflow placeholder, so the other tests # based on generate_np do not exercise the generate API very well. x_val = np.random.rand(100, 2) x_val = np.array(x_val, dtype=np.float32) x = tf.placeholder(tf.float32, x_val.shape) for eps in [0.1, 0.2, 0.3, 0.4]: x_adv = self.attack.generate(x, eps=eps, ord=np.inf, clip_min=-5.0, clip_max=5.0) x_adv = self.sess.run(x_adv, feed_dict={x: x_val}) delta = np.max(np.abs(x_adv - x_val), axis=1) self.assertLessEqual(np.max(delta), eps + 1e-4) def test_generate_np_clip_works_as_expected(self): x_val = np.random.rand(100, 2) x_val = np.array(x_val, dtype=np.float32) x_adv = self.attack.generate_np(x_val, eps=0.5, ord=np.inf, clip_min=-0.2, clip_max=0.1, sanity_checks=False) self.assertClose(np.min(x_adv), -0.2) self.assertClose(np.max(x_adv), 0.1)
# Load pairs of faces and their labels in one-hot encoding faces1, faces2, labels = set_loader.load_testset(1000) # Create victims' embeddings using Facenet itself graph = tf.get_default_graph() phase_train_placeholder = graph.get_tensor_by_name("phase_train:0") feed_dict = {model.face_input: faces2, phase_train_placeholder: False} victims_embeddings = sess.run( model.embedding_output, feed_dict=feed_dict) # Define FGSM for the model steps = 1 eps = 0.01 alpha = eps / steps fgsm = FastGradientMethod(model) fgsm_params = {'eps': alpha, 'clip_min': 0., 'clip_max': 1.} adv_x = fgsm.generate(model.face_input, **fgsm_params) # Run FGSM adv = faces1 for i in range(steps): print("FGSM step " + str(i + 1)) feed_dict = {model.face_input: adv, model.victim_embedding_input: victims_embeddings, phase_train_placeholder: False} adv = sess.run(adv_x, feed_dict=feed_dict) # Test accuracy of the model
def mnist_blackbox(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_classes=NB_CLASSES, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, nb_epochs=NB_EPOCHS, holdout=HOLDOUT, data_aug=DATA_AUG, nb_epochs_s=NB_EPOCHS_S, lmbda=LMBDA, aug_batch_size=AUG_BATCH_SIZE): """ MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697 :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :return: a dictionary with: * black-box model accuracy on test set * substitute model accuracy on test set * black-box model accuracy on adversarial examples transferred from the substitute model """ # Set logging level to see debug information set_log_level(logging.DEBUG) # Dictionary used to keep track and return key accuracies accuracies = {} # Perform tutorial setup assert setup_tutorial() # Create TF session sess = tf.Session() # Get MNIST data mnist = MNIST(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) x_train, y_train = mnist.get_set('train') x_test, y_test = mnist.get_set('test') # Initialize substitute training set reserved for adversary x_sub = x_test[:holdout] y_sub = np.argmax(y_test[:holdout], axis=1) # Redefine test set as remaining samples unavailable to adversaries x_test = x_test[holdout:] y_test = y_test[holdout:] # Obtain Image parameters img_rows, img_cols, nchannels = x_train.shape[1:4] nb_classes = y_train.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Seed random number generator so tutorial is reproducible rng = np.random.RandomState([2017, 8, 30]) # Simulate the black-box model locally # You could replace this by a remote labeling API for instance print("Preparing the black-box model.") prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test, nb_epochs, batch_size, learning_rate, rng, nb_classes, img_rows, img_cols, nchannels) model, bbox_preds, accuracies['bbox'] = prep_bbox_out # Train substitute using method from https://arxiv.org/abs/1602.02697 print("Training the substitute model.") train_sub_out = train_sub(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes, nb_epochs_s, batch_size, learning_rate, data_aug, lmbda, aug_batch_size, rng, img_rows, img_cols, nchannels) model_sub, preds_sub = train_sub_out # Evaluate the substitute model on clean test examples eval_params = {'batch_size': batch_size} acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params) accuracies['sub'] = acc # Initialize the Fast Gradient Sign Method (FGSM) attack object. fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.} fgsm = FastGradientMethod(model_sub, sess=sess) # Craft adversarial examples using the substitute eval_params = {'batch_size': batch_size} x_adv_sub = fgsm.generate(x, **fgsm_par) # Evaluate the accuracy of the "black-box" model on adversarial examples accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub), x_test, y_test, args=eval_params) print('Test accuracy of oracle on adversarial examples generated ' 'using the substitute: ' + str(accuracy)) accuracies['bbox_on_sub_adv_ex'] = accuracy return accuracies
def attack(model, session, a): fgsm = FastGradientMethod(model, sess=session) image = a.original_image[np.newaxis] return fgsm.generate_np(image)
def cifar10_tutorial(train_start=0, train_end=60000, test_start=0, test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN, testing=False, backprop_through_attack=BACKPROP_THROUGH_ATTACK, nb_filters=NB_FILTERS, num_threads=None, label_smoothing=0.1): """ CIFAR10 cleverhans tutorial :param train_start: index of first training set example :param train_end: index of last training set example :param test_start: index of first test set example :param test_end: index of last test set example :param nb_epochs: number of epochs to train model :param batch_size: size of training batches :param learning_rate: learning rate for training :param clean_train: perform normal training on clean examples only before performing adversarial training. :param testing: if true, complete an AccuracyReport for unit tests to verify that performance is adequate :param backprop_through_attack: If True, backprop through adversarial example construction process during adversarial training. :param label_smoothing: float, amount of label smoothing for cross entropy :return: an AccuracyReport object """ # Object used to keep track of (and return) key accuracies report = AccuracyReport() # Set TF random seed to improve reproducibility tf.set_random_seed(1234) # Set logging level to see debug information set_log_level(logging.DEBUG) # Create TF session if num_threads: config_args = dict(intra_op_parallelism_threads=1) else: config_args = {} sess = tf.Session(config=tf.ConfigProto(**config_args)) # Get CIFAR10 data data = CIFAR10(train_start=train_start, train_end=train_end, test_start=test_start, test_end=test_end) dataset_size = data.x_train.shape[0] dataset_train = data.to_tensorflow()[0] dataset_train = dataset_train.map( lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4) dataset_train = dataset_train.batch(batch_size) dataset_train = dataset_train.prefetch(16) x_train, y_train = data.get_set('train') x_test, y_test = data.get_set('test') # Use Image Parameters img_rows, img_cols, nchannels = x_test.shape[1:4] nb_classes = y_test.shape[1] # Define input TF placeholder x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels)) y = tf.placeholder(tf.float32, shape=(None, nb_classes)) # Train an MNIST model train_params = { 'nb_epochs': nb_epochs, 'batch_size': batch_size, 'learning_rate': learning_rate } eval_params = {'batch_size': batch_size} fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.} rng = np.random.RandomState([2017, 8, 30]) def do_eval(preds, x_set, y_set, report_key, is_adv=None): acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params) setattr(report, report_key, acc) if is_adv is None: report_text = None elif is_adv: report_text = 'adversarial' else: report_text = 'legitimate' if report_text: print('Test accuracy on %s examples: %0.4f' % (report_text, acc)) if clean_train: model = ModelAllConvolutional('model1', nb_classes, nb_filters, input_shape=[32, 32, 3]) preds = model.get_logits(x) loss = CrossEntropy(model, smoothing=label_smoothing) def evaluate(): do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False) train(sess, loss, None, None, dataset_train=dataset_train, dataset_size=dataset_size, evaluate=evaluate, args=train_params, rng=rng, var_list=model.get_params()) # Calculate training error if testing: do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval') # Initialize the Fast Gradient Sign Method (FGSM) attack object and # graph fgsm = FastGradientMethod(model, sess=sess) adv_x = fgsm.generate(x, **fgsm_params) preds_adv = model.get_logits(adv_x) # Evaluate the accuracy of the MNIST model on adversarial examples do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True) # Calculate training error if testing: do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval') print('Repeating the process, using adversarial training') # Create a new model and train it to be robust to FastGradientMethod model2 = ModelAllConvolutional('model2', nb_classes, nb_filters, input_shape=[32, 32, 3]) fgsm2 = FastGradientMethod(model2, sess=sess) def attack(x): return fgsm2.generate(x, **fgsm_params) loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack) preds2 = model2.get_logits(x) adv_x2 = attack(x) if not backprop_through_attack: # For the fgsm attack used in this tutorial, the attack has zero # gradient so enabling this flag does not change the gradient. # For some other attacks, enabling this flag increases the cost of # training, but gives the defender the ability to anticipate how # the atacker will change their strategy in response to updates to # the defender's parameters. adv_x2 = tf.stop_gradient(adv_x2) preds2_adv = model2.get_logits(adv_x2) def evaluate2(): # Accuracy of adversarially trained model on legitimate test inputs do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False) # Accuracy of the adversarially trained model on adversarial examples do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True) # Perform and evaluate adversarial training train(sess, loss2, None, None, dataset_train=dataset_train, dataset_size=dataset_size, evaluate=evaluate2, args=train_params, rng=rng, var_list=model2.get_params()) # Calculate training errors if testing: do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval') do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval') return report
def main(argv): model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) if model_file is None: print('No model found') sys.exit() cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir) nb_classes = 10 X_test = cifar.eval_data.xs Y_test = to_categorical(cifar.eval_data.ys, nb_classes) assert Y_test.shape[1] == 10. set_log_level(logging.DEBUG) with tf.Session() as sess: x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3)) y = tf.placeholder(tf.float32, shape=(None, 10)) from src.FGSM.cleverhans.cleverhans.model_zoo.madry_lab_challenges.cifar10_model import make_wresnet model = make_wresnet() saver = tf.train.Saver() # Restore the checkpoint saver.restore(sess, model_file) nb_samples = FLAGS.nb_samples attack_params = { 'batch_size': FLAGS.batch_size, 'clip_min': 0., 'clip_max': 255. } if FLAGS.attack_type == 'cwl2': from src.FGSM.cleverhans.cleverhans.attacks import CarliniWagnerL2 attacker = CarliniWagnerL2(model, sess=sess) attack_params.update({ 'binary_search_steps': 1, 'max_iterations': 100, 'learning_rate': 0.1, 'initial_const': 10, 'batch_size': 10 }) else: # eps and eps_iter in range 0-255 attack_params.update({'eps': 8, 'ord': np.inf}) if FLAGS.attack_type == 'fgsm': from src.FGSM.cleverhans.cleverhans.attacks import FastGradientMethod attacker = FastGradientMethod(model, sess=sess) elif FLAGS.attack_type == 'pgd': attack_params.update({'eps_iter': 2, 'nb_iter': 20}) from src.FGSM.cleverhans.cleverhans.attacks import MadryEtAl attacker = MadryEtAl(model, sess=sess) eval_par = {'batch_size': FLAGS.batch_size} if FLAGS.sweep: max_eps = 16 epsilons = np.linspace(1, max_eps, max_eps) for e in epsilons: t1 = time.time() attack_params.update({'eps': e}) x_adv = attacker.generate(x, **attack_params) preds_adv = model.get_probs(x_adv) acc = model_eval(sess, x, y, preds_adv, X_test[:nb_samples], Y_test[:nb_samples], args=eval_par) print('Epsilon %.2f, accuracy on adversarial' % e, 'examples %0.4f\n' % acc) t2 = time.time() else: t1 = time.time() x_adv = attacker.generate(x, **attack_params) preds_adv = model.get_probs(x_adv) acc = model_eval(sess, x, y, preds_adv, X_test[:nb_samples], Y_test[:nb_samples], args=eval_par) t2 = time.time() print('Test accuracy on adversarial examples %0.4f\n' % acc) print("Took", t2 - t1, "seconds")