def model_test(attack):
    """
    Runs `model_testing` twice on the defended model: once with the plain MNIST arrays, then once
    with the training arrays extended by the stored samples of the given attack.

    Parameters
    ----------
    attack: str
        The augmented dataset used (either "jsma", "wjsma" or "tjsma").
    """

    dataset = MNIST(train_start=TRAIN_START, train_end=TRAIN_END, test_start=TEST_START, test_end=TEST_END)
    x_train, y_train = dataset.get_set('train')
    x_test, y_test = dataset.get_set('test')

    # First pass: untouched MNIST arrays
    print("ORIGINAL MNIST TEST")
    joblib_name = "mnist_defense_" + attack + ".joblib"
    model_testing(joblib_name, x_train, y_train, x_test, y_test)

    # Load at most AUGMENT_SIZE stored samples and their labels
    augmented_prefix = "defense/augmented/" + attack
    extra_x = np.load(augmented_prefix + "_x.npy")[:AUGMENT_SIZE]
    extra_y = np.load(augmented_prefix + "_y.npy")[:AUGMENT_SIZE]

    # The stored samples get a trailing axis of size 1 before being appended to the training images
    extra_x = extra_x.reshape(extra_x.shape + (1,))
    x_train = np.concatenate((x_train, extra_x), axis=0).astype(np.float32)
    y_train = np.concatenate((y_train, extra_y), axis=0).astype(np.float32)

    # Second pass: same model file and test arrays, extended training arrays
    print("====================")
    print("AUGMENTED MNIST TEST")
    model_testing("mnist_defense_" + attack + ".joblib", x_train, y_train, x_test, y_test)
def load_mnist(self):
    """Load the full MNIST train/test sets and store them, with their dimensions, on the instance."""
    dataset = MNIST(train_start=0, train_end=60000, test_start=0, test_end=10000)
    self._x_train, self._y_train = dataset.get_set('train')
    self._x_test, self._y_test = dataset.get_set('test')

    # Image geometry comes from axes 1..3 of the training images,
    # the class count from the second axis of the training labels
    sample_shape = self._x_train.shape[1:4]
    self._img_rows, self._img_cols, self._nchannels = sample_shape
    self._nb_classes = self._y_train.shape[1]

    # Summary of what was loaded
    summary = (
        f"len(train): {len(self._x_train)} / {len(self._y_train)}",
        f"len(test): {len(self._x_test)} / {len(self._y_test)}",
        f"img_rows x img_cols x nchannels: {self._img_rows} x {self._img_cols} x {self._nchannels}",
        f"nb_classes: {self._nb_classes}",
    )
    for line in summary:
        print(line)
def get_data(self, train_start, train_end, test_start, test_end):
    """Load the requested MNIST slices and store the arrays and their dimensions on the instance.

    :param train_start: forwarded to ``MNIST`` as ``train_start``
    :param train_end: forwarded to ``MNIST`` as ``train_end``
    :param test_start: forwarded to ``MNIST`` as ``test_start``
    :param test_end: forwarded to ``MNIST`` as ``test_end``
    """
    bounds = {
        'train_start': train_start,
        'train_end': train_end,
        'test_start': test_start,
        'test_end': test_end,
    }
    dataset = MNIST(**bounds)

    self.x_train, self.y_train = dataset.get_set('train')
    self.x_test, self.y_test = dataset.get_set('test')

    # Geometry is read from the training images, class count from the training labels
    image_dims = self.x_train.shape[1:4]
    self.img_rows, self.img_cols, self.nchannels = image_dims
    self.nb_classes = self.y_train.shape[1]
def get_data(self, train_start, train_end, test_start, test_end, s0):
    """Load the requested MNIST slices, then carve the first ``s0`` test examples into a separate subset.

    :param train_start: forwarded to ``MNIST`` as ``train_start``
    :param train_end: forwarded to ``MNIST`` as ``train_end``
    :param test_start: forwarded to ``MNIST`` as ``test_start``
    :param test_end: forwarded to ``MNIST`` as ``test_end``
    :param s0: number of leading test examples moved into ``x_sub`` / ``y_sub``
    """
    dataset = MNIST(train_start=train_start, train_end=train_end,
                    test_start=test_start, test_end=test_end)
    self.x_train, self.y_train = dataset.get_set('train')
    full_x_test, full_y_test = dataset.get_set('test')

    # Geometry is read from the training images, class count from the training labels
    self.img_rows, self.img_cols, self.nchannels = self.x_train.shape[1:4]
    self.nb_classes = self.y_train.shape[1]

    # Leading s0 test examples: images kept as-is, labels reduced to argmax along axis 1
    self.x_sub = full_x_test[:s0]
    self.y_sub = np.argmax(full_y_test[:s0], axis=1)

    # Whatever remains stays available as the test set
    self.x_test = full_x_test[s0:]
    self.y_test = full_y_test[s0:]
def model_test(file_name=FILE_NAME):
    """
    Passes the full MNIST train and test sets, together with the given file name, to `model_testing`.

    Parameters
    ----------
    file_name: str, optional
        The name of the joblib file.
    """

    dataset = MNIST(train_start=0, train_end=60000, test_start=0, test_end=10000)
    train_pair = dataset.get_set('train')
    test_pair = dataset.get_set('test')

    # Each pair is (inputs, labels); they are forwarded in train-then-test order
    model_testing(file_name, *train_pair, *test_pair)
def __test():
    """
    Train a basic CNN on MNIST (or reload it when a saved ``.meta`` file exists) and print its
    accuracy on the clean test set.

    The trained session is saved under ``model_path`` after training so that later runs can
    reload it instead of training again.
    """
    # Dataset bounds live in locals so the sanity check at the end compares against
    # exactly what was requested from MNIST (the check used names that were never
    # defined inside this function).
    train_start, train_end = 0, 60000
    test_start, test_end = 0, 10000

    tf.set_random_seed(1234)
    sess = tf.Session()
    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    # Train an MNIST model
    train_params = {
        'nb_epochs': NB_EPOCHS,
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'filename': os.path.split(MODEL_PATH)[-1]
    }
    rng = np.random.RandomState([2017, 8, 30])

    # NOTE(review): `model_path` (lowercase) is not defined in this function while the
    # training filename above is derived from `MODEL_PATH`; confirm both names exist
    # at module level and refer to the same location.
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': BATCH_SIZE}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)

    # Sanity check: the test set has exactly the number of examples requested above
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
def ATTACK(attack, dataset, first_index, settype, last_index, batch_size):
    """
    Loads the samples of the requested dataset and forwards them to `generate_attacks`.

    Parameters
    ----------
    attack: str
        The type of used attack (either "jsma", "wjsma" or "tjsma").
    dataset: str
        'mnist' selects the MNIST branch; any other value selects the CIFAR branch.
    first_index:
        The index of the first image attacked.
    settype: str
        The type of set used (either "train" or "test"). Only read by the MNIST branch; the CIFAR
        branch always takes `test_data` / `test_labels`.
    last_index: int
        The index of the last image attacked.
    batch_size: int
        The size of the image batches.
    """

    # The same gamma is used whichever dataset is selected
    gamma = 0.155

    if dataset != 'mnist':
        # Every value other than 'mnist' falls through to CIFAR
        from setup_cifar import CIFAR

        cifar = CIFAR()
        x_set, y_set = cifar.test_data, cifar.test_labels
        print(x_set.shape)
        print(y_set)
        file_path = "./Least_pixel_attack/models/cifar"
    else:
        from cleverhans.dataset import MNIST

        mnist = MNIST(train_start=0, train_end=60000, test_start=0, test_end=10000)
        x_set, y_set = mnist.get_set(settype)
        print(x_set.shape)
        # NOTE(review): absolute path, unlike the relative CIFAR path above - confirm it is intended
        file_path = "/models/mnist"

    generate_attacks(
        save_path="./Least_pixel_attack/models/data",
        file_path=file_path,
        dataset=dataset,
        x_set=x_set,
        y_set=y_set,
        attack=attack,
        gamma=gamma,
        first_index=first_index,
        last_index=last_index,
        batch_size=batch_size,
    )
def train_ae(num_epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, testing=False, learning_rate=LEARNING_RATE):
    """Fit the DenoisingAutoencoder on MNIST images and save it as ``autoencoder.hdf5`` under MODEL_PATH.

    The images serve as both inputs and targets (mean squared error loss), and the test
    images are used as validation data.

    :param num_epochs: number of epochs passed to ``model.fit``
    :param batch_size: batch size used for fitting and for the final prediction
    :param testing: not referenced in this function body
    :param learning_rate: learning rate handed to the Adam optimizer
    """
    # Session allowing one GPU and one CPU device, registered as the Keras backend session
    session_config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 1})
    sess = tf.Session(config=session_config)
    keras.backend.set_session(sess)

    # MNIST with the dataset class' default bounds
    dataset = MNIST()
    x_train, y_train = dataset.get_set("train")
    x_test, y_test = dataset.get_set("test")

    # Image geometry and class count (the class count is not used below)
    input_shape = x_train.shape[1:4]
    n_rows, n_cols, n_channels = input_shape
    n_classes = y_train.shape[1]

    # Build and compile the autoencoder
    autoencoder = DenoisingAutoencoder((n_rows, n_cols, n_channels))
    optimizer = keras.optimizers.Adam(learning_rate)
    autoencoder.compile(optimizer=optimizer, loss='mse')

    # Reconstruction training: targets are the inputs themselves
    autoencoder.fit(
        x_train,
        x_train,
        batch_size=batch_size,
        epochs=num_epochs,
        validation_data=(x_test, x_test),
        verbose=1,
    )

    # Reconstruct the test images (the result is not used afterwards)
    x_test_recon = autoencoder.predict(x_test, batch_size=batch_size, verbose=0)

    # Save model locally, optimizer state included
    target_file = f"{MODEL_PATH}/autoencoder.hdf5"
    keras.models.save_model(autoencoder, target_file, overwrite=True, include_optimizer=True)
def model_train(file_name=FILE_NAME):
    """
    Builds the LeNet-5 network and hands it, with the full MNIST sets, to `model_training`.

    Parameters
    ----------
    file_name: str, optional
        The name of the joblib file.
    """

    def conv_stage(filters):
        # 5x5 convolution -> ReLU -> 2x2 max pooling, all with "VALID" padding
        return [
            Conv2D(filters, (5, 5), (1, 1), "VALID"),
            ReLU(),
            MaxPooling2D((2, 2), (2, 2), "VALID"),
        ]

    # Two convolutional stages followed by the dense head
    layers = conv_stage(20) + conv_stage(50)
    layers += [Flatten(), Linear(500), ReLU(), Linear(10), Softmax()]

    network = MLP(layers, (None, 28, 28, 1))

    dataset = MNIST(train_start=0, train_end=60000, test_start=0, test_end=10000)
    x_train, y_train = dataset.get_set('train')
    x_test, y_test = dataset.get_set('test')

    model_training(
        network, file_name, x_train, y_train, x_test, y_test,
        nb_epochs=20, batch_size=128, learning_rate=0.001,
    )
def model_train(attack):
    """
    Builds the LeNet-5 network and hands it to `model_training` with the MNIST training set extended by
    the stored samples of the given attack.

    Parameters
    ----------
    attack: str
        The augmented dataset used (either "jsma", "wjsma" or "tjsma").
    """

    def conv_stage(filters):
        # 5x5 convolution -> ReLU -> 2x2 max pooling, all with "VALID" padding
        return [
            Conv2D(filters, (5, 5), (1, 1), "VALID"),
            ReLU(),
            MaxPooling2D((2, 2), (2, 2), "VALID"),
        ]

    # Two convolutional stages followed by the dense head
    layers = conv_stage(20) + conv_stage(50)
    layers += [Flatten(), Linear(500), ReLU(), Linear(10), Softmax()]

    network = MLP(layers, (None, 28, 28, 1))

    dataset = MNIST(train_start=TRAIN_START, train_end=TRAIN_END, test_start=TEST_START, test_end=TEST_END)
    x_train, y_train = dataset.get_set('train')
    x_test, y_test = dataset.get_set('test')

    # Load at most AUGMENT_SIZE stored samples and their labels
    augmented_prefix = "defense/augmented/" + attack
    extra_x = np.load(augmented_prefix + "_x.npy")[:AUGMENT_SIZE]
    extra_y = np.load(augmented_prefix + "_y.npy")[:AUGMENT_SIZE]

    # The stored samples get a trailing axis of size 1 before being appended to the training images
    extra_x = extra_x.reshape(extra_x.shape + (1,))
    x_train = np.concatenate((x_train, extra_x), axis=0).astype(np.float32)
    y_train = np.concatenate((y_train, extra_y), axis=0).astype(np.float32)

    model_training(
        network, "mnist_defense_" + attack + ".joblib", x_train, y_train, x_test, y_test,
        nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
    )
def save_images(model, attack, set_type, first_index, last_index):
    """
    Loads the dataset matching the model and forwards its samples to `generate_attacks`.

    Parameters
    ----------
    model: str
        The name of the model used.
    attack: str
        The type of used attack (either "jsma", "wjsma" or "tjsma").
    set_type: str
        The type of set used (either "train" or "test").
    first_index:
        The index of the first image attacked.
    last_index: int
        The index of the last image attacked.

    Raises
    ------
    ValueError
        If the model belongs neither to MNIST_SETS nor to CIFAR10_SETS.
    """

    # Reject unknown models before touching any dataset
    if model not in MNIST_SETS and model not in CIFAR10_SETS:
        raise ValueError("Invalid model: " + model)

    if model in MNIST_SETS:
        from cleverhans.dataset import MNIST

        dataset = MNIST(train_start=0, train_end=60000, test_start=0, test_end=10000)
        x_set, y_set = dataset.get_set(set_type)
        gamma = 0.155
    else:
        from cleverhans.dataset import CIFAR10

        dataset = CIFAR10(train_start=0, train_end=50000, test_start=0, test_end=10000)
        x_set, y_set = dataset.get_set(set_type)
        # CIFAR10 labels are reshaped to (number of samples, 10)
        y_set = y_set.reshape((y_set.shape[0], 10))
        gamma = 0.039

    output_dir = "attack/" + model + "/" + attack + "_" + set_type
    joblib_path = "models/joblibs/" + model + ".joblib"

    generate_attacks(
        save_path=output_dir,
        file_path=joblib_path,
        x_set=x_set,
        y_set=y_set,
        attack=attack,
        gamma=gamma,
        first_index=first_index,
        last_index=last_index,
    )
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
from cleverhans.dataset import MNIST
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

# Load MNIST: both the train and the test range are limited to 0..10000
mnist = MNIST(train_start=0, train_end=10000, test_start=0, test_end=10000)
x_train, y_train = mnist.get_set('train')
x_test, y_test = mnist.get_set('test')

# Labels of the first 350 training examples
y = y_train[:350]
print(y.shape)

# Create one output directory per digit (0-9) under the selected folder
folder = 'test'
path = "/home/dinhtv/code/adversarial-images/igsm/%s/%d"
for i in range(0, 10):
    # exist_ok=True lets the script be re-run without failing on directories
    # that a previous run already created
    os.makedirs(path % (folder, i), exist_ok=True)

# Placeholder list with 100 empty slots
label = [None] * 100
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=NB_FILTERS, num_threads=None,
                   attack_string=None):
    """
    MNIST cleverhans tutorial.

    Optionally trains a model on clean examples and evaluates it on clean and
    adversarial test examples, then always trains a second model with the
    attack passed to `train` and evaluates it the same way.

    :param train_start: index of first training set example.
    :param train_end: index of last training set example.
    :param test_start: index of first test set example.
    :param test_end: index of last test set example.
    :param nb_epochs: number of epochs to train model.
    :param batch_size: size of training batches.
    :param learning_rate: learning rate for training.
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate.
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
                                    (Not referenced in this function body.)
    :param nb_filters: number of filters in the CNN used for training.
    :param num_threads: number of threads used for running the process.
                        (Not referenced in this function body.)
    :param attack_string: attack name for crafting adversarial attacks and
                          adversarial training, in string format.
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    X_train, Y_train = mnist.get_set('train')
    X_test, Y_test = mnist.get_set('test')

    # Use label smoothing: training labels are clipped into
    # [label_smooth / 9, 1 - label_smooth]; the assert guards the 10-class
    # assumption behind the division by 9
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }

    # Initialize the attack object
    attack_class = attack_selection(attack_string)
    attack_params = {'eps': 0.3, 'clip_min': 0.,
                     'clip_max': 1.}

    # The same random state object is shared by both training runs below
    rng = np.random.RandomState([2018, 6, 18])
    if clean_train:
        model = ModelBasicCNNTFE(nb_filters=nb_filters)

        def evaluate_clean():
            """
            Evaluate the accuracy of the MNIST model on legitimate test
            examples
            """
            eval_params = {'batch_size': batch_size}
            acc = model_eval(model, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        train(model, X_train, Y_train, evaluate=evaluate_clean,
              args=train_params, rng=rng, var_list=model.get_params())

        if testing:
            # Calculate training error
            eval_params = {'batch_size': batch_size}
            acc = model_eval(model, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        attack = attack_class(model)
        acc = model_eval(
            model, X_test, Y_test, args=eval_par,
            attack=attack, attack_args=attack_params)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(
                model, X_train, Y_train, args=eval_par,
                attack=attack, attack_args=attack_params)
            print('Train accuracy on adversarial examples: %0.4f\n' % acc)
            report.train_clean_train_adv_eval = acc

        # Drop the attack bound to the clean model before building the next one
        attack = None
        print("Repeating the process, using adversarial training")

    # Second model: trained with the attack passed to `train`
    model_adv_train = ModelBasicCNNTFE(nb_filters=nb_filters)
    attack = attack_class(model_adv_train)

    def evaluate_adv():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(
            model_adv_train, X_test, Y_test,
            args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(
            model_adv_train, X_test, Y_test,
            args=eval_params, attack=attack,
            attack_args=attack_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(model_adv_train, X_train, Y_train, evaluate=evaluate_adv,
          args=train_params, rng=rng,
          var_list=model_adv_train.get_params(),
          attack=attack, attack_args=attack_params)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(
            model_adv_train, X_train, Y_train, args=eval_params,
            attack=None, attack_args=None)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(
            model_adv_train, X_train, Y_train, args=eval_params,
            attack=attack, attack_args=attack_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL,
                   testing=False, label_smoothing=0.1):
    """
    MNIST CleverHans tutorial: train (or restore) a Keras CNN, then evaluate it
    against a targeted L-BFGS attack whose target class is 1.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, training-set accuracies are also recorded
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    :raises RuntimeError: if `backend` has no `tf` attribute
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # only use No.0 GPU
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    # Look for an existing checkpoint in train_dir
    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng)
        saver = tf.train.Saver(max_to_keep=1)
        # The checkpoint step is the number of epochs actually requested by
        # the caller rather than the module-level default
        saver.save(sess, '{}/mnist.ckpt'.format(train_dir),
                   global_step=nb_epochs)
        print("model has been saved")

    # Calculate training error
    if testing:
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_train, y_train, args=eval_params)
        report.train_clean_train_clean_eval = acc

    # Initialize the L-BFGS attack object and graph
    lbfgs = LBFGS(wrap, sess=sess)

    # targeted attack, targeted class is 1
    # NOTE(review): the attack batch is fixed at 128 while evaluation uses
    # `batch_size`; confirm the two are meant to match.
    y_target = np.ones(128)
    y_target = keras.utils.to_categorical(y_target, num_classes=10)
    y_target = tf.Variable(y_target)

    # Initialize only the freshly created target variable. Running the global
    # initializer here would also re-initialize the model weights that were
    # just trained or restored, so the attack would be evaluated against an
    # untrained network.
    sess.run(y_target.initializer)

    lbfgs_params = {'y_target': y_target, 'batch_size': 128}
    adv_x = lbfgs.generate(x, **lbfgs_params)

    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Evaluate the accuracy of the MNIST model on adversarial examples
    eval_par = {'batch_size': batch_size}
    start_time = time.time()
    acc = model_eval(sess, x, y, preds_adv, x_test, y_test, args=eval_par)
    print('Test accuracy on adversarial examples: %0.4f\n' % acc)
    end_time = time.time()
    print("L-BFGS attack time is {}".format(end_time - start_time))
    report.clean_train_adv_eval = acc

    # Calculating train error
    if testing:
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, x_train,
                         y_train, args=eval_par)
        report.train_clean_train_adv_eval = acc

    gc.collect()

    return report
def SNNL_example(train_start=0, train_end=60000, test_start=0,
                 test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                 learning_rate=LEARNING_RATE,
                 nb_filters=NB_FILTERS,
                 SNNL_factor=SNNL_FACTOR,
                 output_dir=OUTPUT_DIR):
    """
    A simple model trained to minimize Cross Entropy and Maximize Soft Nearest
    Neighbor Loss at each internal layer. This outputs a TSNE of the sign of
    the adversarial gradients of a trained model. A model with a negative
    SNNL_factor will show little or no class clusters, while a model with a
    0 SNNL_factor will have class clusters in the adversarial gradient
    direction.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches; also the number of test
                       examples embedded in the TSNE plot
    :param learning_rate: learning rate for training
    :param nb_filters: number of filters passed to the CNN
    :param SNNL_factor: multiplier for Soft Nearest Neighbor Loss; a falsy
                        value trains with plain cross entropy
    :param output_dir: prefix the plot file name is appended to
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key):
        # Evaluate `preds` on the given set and store the accuracy on the report
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        print('Test accuracy on legitimate examples: %0.4f' % (acc))

    model = ModelBasicCNN('model', nb_classes, nb_filters)
    preds = model.get_logits(x)
    cross_entropy_loss = CrossEntropy(model)
    if not SNNL_factor:
        loss = cross_entropy_loss
    else:
        loss = SNNLCrossEntropy(model, factor=SNNL_factor,
                                optimize_temperature=False)

    def evaluate():
        do_eval(preds, x_test, y_test, 'clean_train_clean_eval')

    train(sess, loss, x_train, y_train, evaluate=evaluate,
          args=train_params, rng=rng, var_list=model.get_params())

    do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

    def imscatter(points, images, ax=None, zoom=1, cmap="hot"):
        # Draw images[i] (first channel, min-max rescaled) at points[i]
        if ax is None:
            ax = plt.gca()
        artists = []
        if not isinstance(cmap, list):
            cmap = [cmap] * len(points)
        for i, (x0, y0) in enumerate(points):
            image = images[i]
            lowest = np.min(image)
            transformed = (image - lowest) / (np.max(image) - lowest)
            im = OffsetImage(transformed[:, :, 0], zoom=zoom, cmap=cmap[i])
            ab = AnnotationBbox(im, (x0, y0), xycoords='data', frameon=False)
            artists.append(ax.add_artist(ab))
        ax.update_datalim(np.column_stack(np.transpose(points)))
        ax.autoscale()
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        return artists

    # Sign of the cross-entropy gradient w.r.t. the input, for one test batch
    adv_grads = tf.sign(tf.gradients(cross_entropy_loss.fprop(x, y), x))
    feed_dict = {x: x_test[:batch_size], y: y_test[:batch_size]}
    adv_grads_val = sess.run(adv_grads, feed_dict=feed_dict)
    adv_grads_val = np.reshape(adv_grads_val,
                               (batch_size, img_rows * img_cols))

    X_embedded = TSNE(n_components=2, verbose=0).fit_transform(adv_grads_val)
    plt.figure(num=None, figsize=(50, 50), dpi=40, facecolor='w',
               edgecolor='k')
    # The title reports the factor this call actually trained with (the same
    # value used in the file name below), not a global flag
    plt.title(
        "TSNE of Sign of Adv Gradients, SNNLCrossEntropy Model, factor:" +
        str(SNNL_factor), fontsize=42)
    imscatter(X_embedded, x_test[:batch_size], zoom=2, cmap="Purples")
    plt.savefig(output_dir + 'adversarial_gradients_SNNL_factor_' +
                str(SNNL_factor) + '.png')

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL,
                   testing=False, label_smoothing=0.1,
                   adversarial_training=ADVERSARIAL_TRAINING,
                   attacking=ATTACKING, origin_method=ORIGIN_METHOD,
                   save_model=SAVE_MODEL, model_type=MODEL_TYPE):
    """
    MNIST CleverHans tutorial: train (or restore) a Keras model with an
    adversarial loss built from `origin_method`, then optionally evaluate it
    against the fgsm, bim and mifgsm attacks.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model; the model type and
                      origin method are appended as sub-directories
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: not referenced in this function body
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :param adversarial_training: if true, evaluate the final model against
                                 fgsm, bim and mifgsm in turn
    :param attacking: not read by this function (the evaluation loop always
                      covers all three attacks)
    :param origin_method: attack used for the training loss, one of 'fgsm',
                          'bim' or 'mifgsm'
    :param save_model: if true, save a checkpoint and an h5 file after training
    :param model_type: 'a', 'b' or 'c', selecting modelA, modelB or modelC
    :return: an AccuracyReport object
    :raises RuntimeError: if `backend` has no `tf` attribute
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    os.environ["CUDA_VISIBLE_DEVICES"] = '0'  # only use No.0 GPU
    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    if model_type == 'a':
        the_model = modelA
    elif model_type == 'b':
        the_model = modelB
    elif model_type == 'c':
        the_model = modelC
    else:
        exit('the model type must be a or b or c.')
    model = the_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    wrap = KerasModelWrapper(model)
    preds = model(x)

    def _build_attack(method_name):
        # Create the attack object and its parameters for one attack name
        if method_name == 'fgsm':
            attack_obj = FastGradientMethod(wrap, sess=sess)
            params = {'eps': 0.2,
                      'clip_min': 0.,
                      'clip_max': 1.}
        elif method_name == 'bim':
            attack_obj = BasicIterativeMethod(wrap, sess=sess)
            params = {'eps': 0.2,
                      'eps_iter': 0.06,
                      'nb_iter': 10,
                      'clip_min': 0.,
                      'clip_max': 1.}
        elif method_name == 'mifgsm':
            attack_obj = MomentumIterativeMethod(wrap, sess=sess)
            params = {'eps': 0.2,
                      'eps_iter': 0.08,
                      'nb_iter': 10,
                      'decay_factor': 0.4,
                      'clip_min': 0.,
                      'clip_max': 1.}
        else:
            exit("the attack method must be fgsm,bim,mifgsm")
        return attack_obj, params

    # Attack used for the adversarial training loss and for evaluate2
    att_method, att_method_params = _build_attack(origin_method)
    print(att_method_params)
    adv_x = att_method.generate(x, **att_method_params)
    # Consider the attack to be constant
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    def attack(x):
        return att_method.generate(x, **att_method_params)

    def evaluate2():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('AT Test accuracy on legitimate examples: %0.4f' % acc)

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_adv, x_test,
                              y_test, args=eval_params)
        print('AT Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Train an MNIST model
    # NOTE(review): train_params captures train_dir before the model type and
    # origin method are appended below - confirm that is intended.
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    train_dir = train_dir + '/' + model_type + '/' + origin_method
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)

    # Look for an existing checkpoint in the extended train_dir
    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate2()
    else:
        print("Model was not loaded, training from scratch.")
        loss2 = CrossEntropy(wrap, smoothing=label_smoothing, attack=attack)
        train(sess, loss2, x_train, y_train, evaluate=evaluate2,
              args=train_params, rng=rng)
        if save_model:
            saver = tf.train.Saver(max_to_keep=1)
            # The checkpoint step is the number of epochs actually requested
            # by the caller rather than the module-level default
            saver.save(sess, '{}/{}.ckpt'.format(train_dir, origin_method),
                       global_step=nb_epochs)
            keras.models.save_model(
                model, '{}/{}_mnist.h5'.format(train_dir, origin_method))
            print("model has been saved")

    # >>> other method >>>
    if adversarial_training:
        # Evaluate the final model against each attack in turn. Dedicated
        # local names keep the training attack captured by `attack` and
        # `evaluate2` untouched.
        for attack_name in ['fgsm', 'bim', 'mifgsm']:
            eval_attack, eval_attack_params = _build_attack(attack_name)

            # Evaluate the accuracy of the MNIST model on adversarial examples
            print(eval_attack_params)
            eval_adv_x = eval_attack.generate(x, **eval_attack_params)
            # Consider the attack to be constant
            eval_adv_x = tf.stop_gradient(eval_adv_x)
            eval_preds_adv = model(eval_adv_x)

            eval_par = {'batch_size': batch_size}
            start_time = time.time()
            acc = model_eval(sess, x, y, eval_preds_adv, x_test, y_test,
                             args=eval_par)
            print('Test accuracy on adversarial examples: %0.4f' % acc)
            end_time = time.time()
            print("{} attack time is {}\n".format(attack_name,
                                                  end_time - start_time))
            # Overwritten on each iteration: the last attack's accuracy remains
            report.clean_train_adv_eval = acc

    gc.collect()

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, clean_train=CLEAN_TRAIN,
                   testing=False,
                   backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                   nb_filters=NB_FILTERS, num_threads=None,
                   label_smoothing=0.1):
    """
    Train a basic CNN on MNIST and dump FGSM adversarial images to disk.

    When ``clean_train`` is true the model is trained on clean examples,
    then every test and training image is perturbed with FGSM and written
    as a PNG under ``images/fgsm_adv/{test,train}/``. Each file is named
    ``<label>_<uuid4>.png``.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, train on clean examples and generate the
                        adversarial images; if false nothing is trained
                        or written and an empty report is returned
    :param testing: if true, also evaluate on the training set and store
                    the result under ``train_clean_train_clean_eval``
    :param backprop_through_attack: accepted for interface compatibility;
                                    not used by this function
    :param nb_filters: filter count passed to ``ModelBasicCNN``
    :param num_threads: if truthy, the session is created with
                        ``intra_op_parallelism_threads=1`` (the value
                        itself is not used)
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {'allow_soft_placement': True}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    # Root directory for the generated adversarial images
    out_dir = 'images/fgsm_adv/'

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        """Evaluate ``preds`` on a data set and record it in the report."""
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    def dump_adversarial_images(attack, images, labels, split):
        """Perturb each image with ``attack`` and save it under ``split``."""
        target_dir = os.path.join(out_dir, split)
        for index, (image, one_hot) in enumerate(zip(images, labels)):
            print(split + ' ' + str(index))
            label = np.argmax(one_hot)
            adv = attack.generate_np(
                image.reshape((1, img_rows, img_cols, nchannels)),
                **fgsm_params)
            # Scale [0, 1] floats to 8-bit pixels; the 2-D reshape assumes
            # a single channel, as in MNIST.
            raw_data = (adv.reshape((img_rows, img_cols)) * 255).astype(
                'uint8')
            # NOTE(review): mode 'P' is kept from the original code; 'L'
            # (grayscale) may have been intended -- confirm.
            im = Image.fromarray(raw_data, mode='P')
            im.save(os.path.join(
                target_dir, str(label) + '_' + str(uuid.uuid4()) + '.png'))

    if clean_train:
        model = ModelBasicCNN('model1', nb_classes, nb_filters)
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object
        fgsm = FastGradientMethod(model, sess=sess)

        # Create every output directory up front; exist_ok avoids the
        # check-then-create race of os.path.exists + os.mkdir.
        for split in ('train', 'test'):
            os.makedirs(os.path.join(out_dir, split), exist_ok=True)

        # Generate fgsm adversarial examples and save to disk
        dump_adversarial_images(fgsm, x_test, y_test, 'test')
        dump_adversarial_images(fgsm, x_train, y_train, 'train')

    return report
def evaluate_model(filepath, train_start=0, train_end=60000, test_start=0,
                   test_end=10000, batch_size=128, testing=False,
                   num_threads=None):
    """
    Run evaluation on a saved model

    Loads the model stored at ``filepath`` and prints its accuracy on the
    MNIST test set, both on clean inputs and on FGSM adversarial inputs
    (eps=0.3, clipped to [0, 1]). Nothing is returned.

    :param filepath: path to model to evaluate
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param batch_size: size of evaluation batches
    :param testing: accepted for interface compatibility; not used
    :param num_threads: if truthy, the session is created with
                        ``intra_op_parallelism_threads=1`` (the value
                        itself is not used)
    """
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.INFO)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # The session is owned by this function, so release it even when
    # loading or evaluation fails.
    try:
        # Get MNIST test data (the training split is only used for shapes)
        mnist = MNIST(train_start=train_start, train_end=train_end,
                      test_start=test_start, test_end=test_end)
        x_train, y_train = mnist.get_set('train')
        x_test, y_test = mnist.get_set('test')

        # Use Image Parameters
        img_rows, img_cols, nchannels = x_train.shape[1:4]
        nb_classes = y_train.shape[1]

        # Define input TF placeholder
        x = tf.placeholder(tf.float32,
                           shape=(None, img_rows, img_cols, nchannels))
        y = tf.placeholder(tf.float32, shape=(None, nb_classes))

        eval_params = {'batch_size': batch_size}
        fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}

        def do_eval(preds, x_set, y_set, report_key, is_adv=None):
            """Print the accuracy of ``preds``; ``report_key`` is unused."""
            acc = model_eval(sess, x, y, preds, x_set, y_set,
                             args=eval_params)
            if is_adv is None:
                report_text = None
            elif is_adv:
                report_text = 'adversarial'
            else:
                report_text = 'legitimate'
            if report_text:
                print('Test accuracy on %s examples: %0.4f'
                      % (report_text, acc))

        with sess.as_default():
            model = load(filepath)
        assert len(model.get_params()) > 0

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)
        preds = model.get_logits(x)

        # Evaluate the accuracy of the MNIST model on clean and adversarial
        # examples
        do_eval(preds, x_test, y_test, 'train_clean_train_clean_eval', False)
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)
    finally:
        sess.close()
def mnist_tutorial_fgsm(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE,
                        attack_iterations=ATTACK_ITERATIONS,
                        model_path=MODEL_PATH,
                        targeted=TARGETED,
                        noise_output=NOISE_OUTPUT):
    """
    MNIST tutorial for Fast Gradient Method's attack

    Trains (or reloads) a basic CNN, crafts FGSM adversarial examples
    against it, then trains a second CNN with FGSM adversarial training
    and evaluates it on clean test data and on the crafted examples.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) save a grid image of adversarial examples;
                        requires ``source_samples`` to equal the class count
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: accepted for interface compatibility; FGSM
                              is single-step and does not use it
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :param noise_output: if true, the grid shows the perturbation
                         (adversarial minus original) instead of the
                         adversarial image
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Fast Gradient Sign Method
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a FGSM attack object
    fgsm = FastGradientMethod(model, sess=sess)

    # Pick the source images: one per class when visualizing, otherwise the
    # first ``source_samples`` test images.
    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
        sources = x_test[idxs]
        true_labels = y_test[idxs]
    else:
        sources = x_test[:source_samples]
        true_labels = y_test[:source_samples]

    if targeted:
        # Repeat every source once per class and pair it with each one-hot
        # target label.
        adv_inputs = np.array(
            [[instance] * nb_classes for instance in sources],
            dtype=np.float32)
        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        one_hot = np.eye(nb_classes)
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels)
    else:
        adv_inputs = sources
        adv_ys = None
        grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)

    if viz_enabled:
        # Initialize our array for grid visualization
        grid_viz_data = np.zeros(grid_shape, dtype='f')

    # NOTE(review): the target labels in adv_ys are only used for scoring;
    # they are not passed to the attack (no 'y_target'), so the perturbation
    # itself is untargeted even when ``targeted`` is true -- confirm intent.
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv = fgsm.generate_np(adv_inputs, **fgsm_params)

    adv_eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}

    def adv_success_rate(logits):
        """Fraction of crafted examples that fool the model behind logits."""
        if targeted:
            # Success means being classified as the target label.
            return model_eval(sess, x, y, logits, adv, adv_ys,
                              args=adv_eval_params)
        # Success means no longer being classified as the true label.
        return 1 - model_eval(sess, x, y, logits, adv, true_labels,
                              args=adv_eval_params)

    adv_accuracy = adv_success_rate(preds)

    if viz_enabled:
        # In targeted mode each source occupies nb_classes consecutive rows
        # of ``adv``; in untargeted mode there is one row per source, so a
        # stride of nb_classes would index past the end of the array.
        stride = nb_classes if targeted else 1
        for i in range(nb_classes):
            k = i * stride
            if noise_output:
                image = adv[k] - adv_inputs[k]
            else:
                image = adv[k]
            grid_viz_data[i, 0] = image

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(np.sum((adv - adv_inputs)**2,
                                       axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    ###########################################################################
    # Adversarial Training
    ###########################################################################
    model2 = ModelBasicCNN('model2', nb_classes, nb_filters)
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack_fgsm(x):
        # Attack the batch handed in by the loss, not the fixed NumPy array
        # of source samples crafted above.
        return fgsm2.generate(x, **fgsm_params)

    preds2 = model2.get_logits(x)
    loss2 = CrossEntropy(model2, smoothing=0.1, attack=attack_fgsm)
    train(sess, loss2, x_train, y_train, args=train_params, rng=rng)

    # Accuracy of the adversarially trained model on the clean test set.
    # The printed label below is kept unchanged from the original output.
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds2, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on adversarial fgsm test examples: {0}'.format(accuracy))
    report.adv_train_clean_eval = accuracy
    print("Defined TensorFlow model graph.")

    # Score the adversarially trained model on the examples crafted above.
    # They were generated against model1, so this is a transfer setting.
    adv_accuracy = adv_success_rate(preds2)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.adv_train_adv_eval = 1. - adv_accuracy

    # The crafted examples are unchanged, so the distortion is the same as
    # reported above; it is printed again to keep the console output format.
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    def save_visual(data, path):
        """
        Modified version of cleverhans.plot.pyplot
        """
        figure = plt.figure()
        # figure.canvas.set_window_title('Cleverhans: Grid Visualization')

        # Add the images to the plot
        num_cols = data.shape[0]
        num_rows = data.shape[1]
        num_channels = data.shape[4]
        for row in range(num_rows):
            for col in range(num_cols):
                figure.add_subplot(num_rows, num_cols,
                                   (col + 1) + (row * num_cols))
                plt.axis('off')

                if num_channels == 1:
                    plt.imshow(data[col, row, :, :, 0], cmap='gray')
                else:
                    plt.imshow(data[col, row, :, :, :])

        # savefig does not create missing directories
        out_dir = os.path.dirname(path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        # Draw the plot and return
        plt.savefig(path)
        return figure

    # Finally, save a grid of the adversarial examples
    if viz_enabled:
        if noise_output:
            image_name = "output/fgsm_mnist_noise.png"
        else:
            image_name = "output/fgsm_mnist.png"
        _ = save_visual(grid_viz_data, image_name)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, testing=False,
                   label_smoothing=0.1):
    """
    MNIST CleverHans tutorial (Keras version)

    Fits a Keras CNN on clean MNIST data and reports its accuracy on clean
    and FGSM inputs, then repeats the process with a second CNN trained
    with an adversarial loss.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param testing: if true, training error is calculated
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    # Collects the accuracies that are handed back to the caller
    report = AccuracyReport()

    # Fixed graph-level seed for reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError("this tutorial requires keras to be configured to channels_last format")

    # Single-threaded TF session, registered as the Keras backend session
    single_thread = tf.ConfigProto(intra_op_parallelism_threads=1,
                                   inter_op_parallelism_threads=1)
    sess = tf.Session(config=single_thread)
    keras.backend.set_session(sess)

    # Load the requested MNIST slices
    dataset = MNIST(train_start=train_start, train_end=train_end,
                    test_start=test_start, test_end=test_end)
    x_train, y_train = dataset.get_set('train')
    x_test, y_test = dataset.get_set('test')

    # Image geometry and class count come from the data itself
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Label smoothing, applied in place to the training labels
    y_train -= label_smoothing * (y_train - 1. / nb_classes)

    # Settings shared by both models
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    architecture = dict(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    fit_options = dict(batch_size=batch_size, epochs=nb_epochs,
                       validation_data=(x_test, y_test), verbose=2)

    def score(net, images, labels):
        """Return (clean accuracy, adversarial accuracy) of ``net``."""
        _, clean_acc, adversarial_acc = net.evaluate(
            images, labels, batch_size=batch_size, verbose=0)
        return clean_acc, adversarial_acc

    def announce(clean_acc, adversarial_acc):
        """Print the two test accuracies."""
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)
        print('Test accuracy on adversarial examples: %0.4f\n'
              % adversarial_acc)

    # ---------------- first model: clean training ----------------
    model = cnn_model(**architecture)
    print("Defined Keras model.")

    # To be able to call the model in the custom loss, we need to call it once
    # before, see https://github.com/tensorflow/tensorflow/issues/23769
    model(model.input)

    # FGSM attack object and the metric built on top of it
    fgsm = FastGradientMethod(KerasModelWrapper(model), sess=sess)
    adv_acc_metric = get_adversarial_acc_metric(model, fgsm, fgsm_params)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy', adv_acc_metric]
    )

    model.fit(x_train, y_train, **fit_options)

    # Accuracy on legitimate and adversarial test examples
    acc, adv_acc = score(model, x_test, y_test)
    report.clean_train_clean_eval = acc
    report.clean_train_adv_eval = adv_acc
    announce(acc, adv_acc)

    # Training error
    if testing:
        train_acc, train_adv_acc = score(model, x_train, y_train)
        report.train_clean_train_clean_eval = train_acc
        report.train_clean_train_adv_eval = train_adv_acc

    # ---------------- second model: adversarial training ----------------
    print("Repeating the process, using adversarial training")
    model_2 = cnn_model(**architecture)
    model_2(model_2.input)
    fgsm_2 = FastGradientMethod(KerasModelWrapper(model_2), sess=sess)

    # Loss that mixes legitimate and adversarial examples
    adv_loss_2 = get_adversarial_loss(model_2, fgsm_2, fgsm_params)
    adv_acc_metric_2 = get_adversarial_acc_metric(model_2, fgsm_2,
                                                  fgsm_params)
    model_2.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=adv_loss_2,
        metrics=['accuracy', adv_acc_metric_2]
    )

    model_2.fit(x_train, y_train, **fit_options)

    # Accuracy on legitimate and adversarial test examples
    acc, adv_acc = score(model_2, x_test, y_test)
    report.adv_train_clean_eval = acc
    report.adv_train_adv_eval = adv_acc
    announce(acc, adv_acc)

    # Training error
    if testing:
        train_acc, train_adv_acc = score(model_2, x_train, y_train)
        report.train_adv_train_clean_eval = train_acc
        report.train_adv_train_adv_eval = train_adv_acc

    return report
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS, holdout=HOLDOUT, data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S, lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_classes: overwritten with the class count read from the
                       loaded labels; the passed value is not used
    :param batch_size: batch size forwarded to ``prep_bbox``/``train_sub``
                       and used for evaluation
    :param learning_rate: learning rate forwarded to ``prep_bbox`` and
                          ``train_sub``
    :param nb_epochs: epoch count forwarded to ``prep_bbox``
    :param holdout: number of leading test examples handed to the adversary
                    as the substitute training set; they are removed from
                    the test set
    :param data_aug: forwarded to ``train_sub``
    :param nb_epochs_s: forwarded to ``train_sub``
    :param lmbda: forwarded to ``train_sub``
    :param aug_batch_size: forwarded to ``train_sub``
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    :raises AssertionError: if ``setup_tutorial()`` returns a falsy value
    """
    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup. This is called outside an ``assert`` so that
    # it still runs when Python is started with -O, which strips asserts.
    if not setup_tutorial():
        raise AssertionError("setup_tutorial() failed")

    # Create TF session
    sess = tf.compat.v1.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Initialize substitute training set reserved for adversary
    # (labels as class indices rather than one-hot rows)
    x_sub = x_test[:holdout]
    y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.compat.v1.placeholder(
        tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.compat.v1.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, x_sub, y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
def _evaluate_whitebox_fgsm(preproc_model, input_shape, n_classes, cnn_name,
                            learning_rate, batch_size, x_test, y_test, sess):
    """Stack the saved CNN on ``preproc_model`` and score it under FGSM.

    Builds one Keras model that feeds the autoencoder output into a ConvNet,
    copies the autoencoder weights and the saved CNN weights into it, and
    evaluates it on the test set.

    :return: tuple ``(accuracy, adversarial accuracy)``
    """
    final_out = ConvNet(input_shape, n_classes, concat=True,
                        concat_layer=preproc_model.output)
    combined_model = Model(inputs=preproc_model.input, outputs=final_out)

    cnn_model = ConvNet(input_shape, n_classes)
    cnn_model.load_weights(f"{MODEL_PATH}/{cnn_name}.hdf5", by_name=False)

    # The first layers of the combined model mirror the autoencoder; the
    # rest mirror the CNN, whose own input layer (index 0) is skipped.
    num_preproc_layers = len(preproc_model.layers)
    for i, layer in enumerate(combined_model.layers):
        if i < num_preproc_layers:
            source = preproc_model.layers[i]
        else:
            source = cnn_model.layers[i - num_preproc_layers + 1]
        layer.set_weights(source.get_weights())

    # Call the model once on its own input before wrapping it.
    combined_model(combined_model.input)
    wrap = KerasModelWrapper(combined_model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_acc_metric = get_adversarial_acc_metric(combined_model, fgsm,
                                                fgsm_params)
    combined_model.compile(optimizer=keras.optimizers.Adam(learning_rate),
                           loss='categorical_crossentropy',
                           metrics=['accuracy', adv_acc_metric])
    _, acc, adv_acc = combined_model.evaluate(x_test,
                                              y_test,
                                              batch_size=batch_size,
                                              verbose=0)
    return acc, adv_acc


def run_mnist_adv(
        num_epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        learning_rate=LEARNING_RATE,
        test_ae=False,  # Test CNN with AE preprocessing
        test_cae=False,  # test CNN with CAE preprocessing
        test_dae=False,  # test CNN with DAE preprocessing
        test_stacked_dae=False,  # test CNN with Stacked DAE preprocessing
        v_noises=[0.1, 0.2, 0.3, 0.4, 0.5],
        lambdas=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
        num_stacks=3):
    """
    Evaluate a pre-trained CNN behind autoencoder preprocessors under FGSM.

    For every enabled preprocessor family the saved autoencoder weights are
    loaded from ``MODEL_PATH``, the saved CNN is stacked on top, and the
    combined model is scored on clean and FGSM (eps=0.3) MNIST test data.
    Results are printed and written with ``np.savetxt`` to
    ``<family>_accuracies_whitebox.npy`` in the working directory.

    :param num_epochs: accepted for interface compatibility; not used
    :param batch_size: size of evaluation batches
    :param learning_rate: learning rate of the Adam optimizer used to
                          compile each combined model
    :param test_ae: evaluate with the plain autoencoder
    :param test_cae: evaluate with one contractive autoencoder per lambda
    :param test_dae: evaluate with one denoising autoencoder per noise level
    :param test_stacked_dae: evaluate with one stacked denoising autoencoder
                             per noise level
    :param v_noises: noise levels selecting the (stacked) DAE weight files;
                     only iterated, never mutated
    :param lambdas: lambda values selecting the CAE weight files; only
                    iterated, never mutated
    :param num_stacks: number of stacks of the stacked DAE
    :return: an AccuracyReport object (this function does not populate it)
    """
    # ======================================================================
    # General Setup
    # ======================================================================

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # can use gpu
    config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 1})

    # Create TF session and set Keras backend session as TF
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    mnist = MNIST()
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Obtain image params
    n_rows, n_cols, n_channels = x_train.shape[1:4]
    n_classes = y_train.shape[1]
    input_shape = (n_rows, n_cols, n_channels)

    cnn_name = "cnn"

    def evaluate(preproc_model):
        """Score the CNN stacked on ``preproc_model``."""
        return _evaluate_whitebox_fgsm(preproc_model, input_shape, n_classes,
                                       cnn_name, learning_rate, batch_size,
                                       x_test, y_test, sess)

    def print_accuracies(acc, adv_acc):
        """Print the clean and adversarial test accuracy."""
        print('Test accuracy on legitimate examples: %0.4f' % acc)
        print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)

    # ======================================================================
    # Test with AE
    # ======================================================================
    if test_ae:
        ae_model = DenoisingAutoencoder(input_shape)
        ae_model.load_weights(f"{MODEL_PATH}/autoencoder.hdf5", by_name=False)
        acc, adv_acc = evaluate(ae_model)
        print_accuracies(acc, adv_acc)
        np.savetxt("ae_accuracies_whitebox.npy", np.array([acc, adv_acc]))

    # ======================================================================
    # Test with CAE
    # ======================================================================
    if test_cae:
        cae_adv_accuracies = []
        cae_accuracies = []
        for lam in lambdas:
            cae_model = ContractiveAutoencoder(input_shape)
            cae_model.load_weights(
                f"{MODEL_PATH}/contractive_autoencoder_{lam}.hdf5",
                by_name=False)
            acc, adv_acc = evaluate(cae_model)
            cae_accuracies.append(acc)
            cae_adv_accuracies.append(adv_acc)
            print(f"Lambda = {lam}")
            print_accuracies(acc, adv_acc)
        np.savetxt("cae_accuracies_whitebox.npy",
                   np.array([cae_accuracies, cae_adv_accuracies]))

    # ======================================================================
    # Test with DAE
    # ======================================================================
    if test_dae:
        dae_adv_accuracies = []
        dae_accuracies = []
        for v_noise in v_noises:
            dae_model = DenoisingAutoencoder(input_shape)
            dae_model.load_weights(
                f"{MODEL_PATH}/denoising_autoencoder_{v_noise}.hdf5",
                by_name=False)
            acc, adv_acc = evaluate(dae_model)
            dae_accuracies.append(acc)
            dae_adv_accuracies.append(adv_acc)
            print(f"V_noise = {v_noise}")
            print_accuracies(acc, adv_acc)
        np.savetxt("dae_accuracies_whitebox.npy",
                   np.array([dae_accuracies, dae_adv_accuracies]))

    # ======================================================================
    # Test with Stacked DAE
    # ======================================================================
    if test_stacked_dae:
        stacked_dae_adv_accuracies = []
        stacked_dae_accuracies = []
        for v_noise in v_noises:
            stacked_dae_model = StackedDenoisingAutoencoder(
                input_shape, num_stacks)
            stacked_dae_model.load_weights(
                f"{MODEL_PATH}/stacked_denoising_autoencoder_{num_stacks}_{v_noise}.hdf5",
                by_name=False)
            acc, adv_acc = evaluate(stacked_dae_model)
            stacked_dae_accuracies.append(acc)
            stacked_dae_adv_accuracies.append(adv_acc)
            print(f"V_noise = {v_noise}")
            print_accuracies(acc, adv_acc)
        np.savetxt(
            "stacked_dae_accuracies_whitebox.npy",
            np.array([stacked_dae_accuracies, stacked_dae_adv_accuracies]))

    return report
# Adam optimizer with a fixed learning rate; constructing it adds nothing to
# the graph until it is asked to minimize a loss.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

# ---- Fetch data ----
tf.reset_default_graph()
tf.set_random_seed(1234)
sess = tf.Session()

# Small MNIST slices.
# NOTE(review): test_start = 1001 skips test example 1000 -- confirm that
# this gap is intentional.
train_start = 0
train_end = 1000
test_start = 1001
test_end = 1200
mnist = MNIST(train_start=train_start, train_end=train_end,
              test_start=test_start, test_end=test_end)
x_train, y_train = mnist.get_set('train')
x_test, y_test = mnist.get_set('test')

# Read the image geometry and class count from the loaded data instead of
# hard-coding 28 x 28 x 1 and 10.
img_rows, img_cols, nchannels = x_train.shape[1:4]
nb_classes = y_train.shape[1]
nb_filters = 64

# Input placeholders for images and one-hot labels
x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
y = tf.placeholder(tf.float32, shape=(None, nb_classes))

# Model graph: logits and a label-smoothed cross-entropy loss
model = ResNet(scope="model1", nb_classes=nb_classes, nb_filters=nb_filters)
preds = model.get_logits(x)
loss = CrossEntropy(model, smoothing=0.1)
print("Defined TensorFlow model graph.")
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL, testing=False,
                   label_smoothing=0.1):
    """
    Train a Keras CNN on MNIST, attack it with FGSM, then repeat the
    experiment with adversarial training.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True to restore a checkpoint from ``train_dir`` when
                       one exists, False to always train from scratch
    :param testing: if true, accuracies on the training set are recorded in
                    the report as well
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """
    tf.keras.backend.set_learning_phase(0)

    # Collects every accuracy measured below; returned to the caller.
    report = AccuracyReport()

    # Fixed graph-level seed for reproducibility.
    tf.set_random_seed(1234)

    data_format = keras.backend.image_data_format()
    if data_format != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # One TF session, shared with the Keras backend.
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Load the requested MNIST slices.
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Image geometry and label width come from the data itself.
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Symbolic inputs.
    input_shape = (None, img_rows, img_cols, nchannels)
    label_shape = (None, nb_classes)
    x = tf.placeholder(tf.float32, shape=input_shape)
    y = tf.placeholder(tf.float32, shape=label_shape)

    # First (clean-trained) model.
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def accuracy_of(pred_tensor, inputs, labels):
        # Accuracy of `pred_tensor` when fed `inputs`, judged against `labels`.
        return model_eval(sess, x, y, pred_tensor, inputs, labels,
                          args={'batch_size': batch_size})

    def evaluate():
        # Clean test accuracy of the first model.
        clean_acc = accuracy_of(preds, x_test, y_test)
        report.clean_train_clean_eval = clean_acc
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)

    # Settings shared by both training runs.
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    if ckpt is None:
        ckpt_path = False
    else:
        ckpt_path = ckpt.model_checkpoint_path
    wrap = KerasModelWrapper(model)

    if not (load_model and ckpt_path):
        print("Model was not loaded, training from scratch.")
        loss = CrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng)
    else:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()

    # Clean accuracy on the training set.
    if testing:
        report.train_clean_train_clean_eval = accuracy_of(
            preds, x_train, y_train)

    # FGSM attack against the first model.
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    # Treat the adversarial input as a constant.
    adv_x = tf.stop_gradient(adv_x)
    preds_adv = model(adv_x)

    # Accuracy of the first model on adversarial test inputs.
    adv_acc = accuracy_of(preds_adv, x_test, y_test)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)
    report.clean_train_adv_eval = adv_acc

    # Same measurement on the training set.
    if testing:
        report.train_clean_train_adv_eval = accuracy_of(
            preds_adv, x_train, y_train)

    print("Repeating the process, using adversarial training")

    # Second model, trained on clean and FGSM examples together.
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = CrossEntropy(wrap_2, smoothing=label_smoothing, attack=attack)

    def evaluate_2():
        # Clean test accuracy of the adversarially trained model.
        clean_acc_2 = accuracy_of(preds_2, x_test, y_test)
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc_2)
        report.adv_train_clean_eval = clean_acc_2

        # Adversarial test accuracy of the adversarially trained model.
        adv_acc_2 = accuracy_of(preds_2_adv, x_test, y_test)
        print('Test accuracy on adversarial examples: %0.4f' % adv_acc_2)
        report.adv_train_adv_eval = adv_acc_2

    # Adversarial training, evaluated through evaluate_2.
    train(sess, loss_2, x_train, y_train, evaluate=evaluate_2,
          args=train_params, rng=rng)

    # Training-set accuracies of the second model.
    if testing:
        report.train_adv_train_clean_eval = accuracy_of(
            preds_2, x_train, y_train)
        report.train_adv_train_adv_eval = accuracy_of(
            preds_2_adv, x_train, y_train)

    return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    Trains a basic CNN on MNIST, then crafts one targeted adversarial example
    per (test sample, other class) pair and reports the success rate and the
    fraction of input features that were modified.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target).
    # Indexed [target class, sample]; cells whose target equals the sample's
    # own class are never attacked and therefore stay at zero.
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        # Slice (not index) so the sample keeps its leading batch axis.
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(
                adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    nb_successes = int(np.sum(results))
    succ_rate = float(nb_successes) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Average distortion over the attacks that were actually attempted.
    # Dividing by nb_targets_tried (rather than taking the mean of the whole
    # array) keeps the never-attacked own-class cells out of the average.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Average distortion over successful attacks only: sum the distortion of
    # the successes and divide by how many there were. Reports 0 when no
    # attack succeeded instead of dividing by zero.
    if nb_successes:
        percent_perturb_succ = (
            float(np.sum(perturbations * (results == 1))) / nb_successes)
    else:
        percent_perturb_succ = 0.0
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE, train_dir=TRAIN_DIR,
                   filename=FILENAME, load_model=LOAD_MODEL, testing=True,
                   label_smoothing=0.1):
    """
    Evaluate an MNIST classifier on clean and BIM test images, before and
    after passing them through a saved autoencoder.

    The classifier is restored from the latest checkpoint in ``train_dir``
    when ``load_model`` is true and a checkpoint exists. Four accuracies are
    then printed, in this order: clean test images, autoencoder-decoded test
    images, BIM images, autoencoder-decoded BIM images.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: not used by this function
    :param batch_size: batch size used for every evaluation
    :param learning_rate: not used by this function
    :param train_dir: Directory storing the saved model; created if missing
    :param filename: not used by this function
    :param load_model: True for load, False for not load
    :param testing: not used by this function
    :param label_smoothing: not used by this function
    :return: an AccuracyReport object
    """
    tf.keras.backend.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if keras.backend.image_data_format() != 'channels_last':
        raise NotImplementedError(
            "this tutorial requires keras to be configured to channels_last format"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    eval_params = {'batch_size': batch_size}

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    # NOTE(review): when no checkpoint is restored, nothing in this function
    # trains or initializes the classifier -- confirm callers always provide
    # a checkpoint.
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()

    def print_accuracy(images):
        # Accuracy of the classifier on `images`, judged against the MNIST
        # test labels. Assumes `images` is aligned one-to-one with y_test.
        acc = model_eval(sess, x, y, preds, images, y_test, args=eval_params)
        print(acc)

    # Clean test images, raw and after the autoencoder.
    print_accuracy(x_test)

    autoencoder = load_modell(str("./autoencoder_bim.h5"))
    x_test_decoded = autoencoder.predict(x_test)
    print_accuracy(x_test_decoded)

    # Pickle data must be read from a binary-mode file; the context manager
    # also guarantees the handle is closed.
    # NOTE(review): pickle.load executes arbitrary code from the file -- only
    # use with a trusted bim.pkl.
    with open("/data/mnist/bim.pkl", "rb") as bim_file:
        _, bim_test = pickle.load(bim_file)

    # Rescale to [0, 1] and collapse axis 3 to a single channel by averaging.
    bim_test = bim_test.astype('float32') / 255.
    bim_test = np.mean(bim_test, axis=3, keepdims=True)

    # BIM images, raw and after the autoencoder.
    bim_test_decoded = autoencoder.predict(bim_test)
    print_accuracy(bim_test)
    print_accuracy(bim_test_decoded)

    return report
# Set TF random seed to improve reproducibility
tf.set_random_seed(1234)

if tf.keras.backend.image_data_format() != 'channels_last':
    raise NotImplementedError(
        "this tutorial requires keras to be configured to channels_last format"
    )

# Create TF session and set as Keras backend session
sess = tf.Session()
tf.keras.backend.set_session(sess)

# Get MNIST data
mnist = MNIST(train_start=train_start, train_end=train_end,
              test_start=test_start, test_end=test_end)

nb_classes = 2

# Define input TF placeholders.
# NOTE(review): X_train is not defined in this fragment and is not taken
# from `mnist`; confirm where it comes from. Both placeholders also have
# fully fixed shapes (no None batch dimension) -- confirm that is intended.
x = tf.placeholder(tf.float32, shape=(X_train.shape[0], X_train.shape[1]))
# Explicit 1-tuple: `(nb_classes)` is just a parenthesised int, not a tuple.
y = tf.placeholder(tf.float32, shape=(nb_classes,))

# Two-layer dense classifier: 128 ReLU units followed by nb_classes outputs.
model = tf.keras.models.Sequential()
layers = [
    tf.keras.layers.Dense(128),
    tf.keras.layers.Activation('relu'),
    tf.keras.layers.Dense(nb_classes)
]
def run_mnist_adv(num_epochs=NUM_EPOCHS, batch_size=BATCH_SIZE,
                  testing=False, learning_rate=LEARNING_RATE):
    """
    Train a ConvNet on MNIST twice -- once with the plain cross-entropy loss
    and once with an adversarial (FGSM-based) loss -- and record the accuracy
    of each on legitimate and FGSM-perturbed inputs.

    :param num_epochs: number of epochs for each training run
    :param batch_size: batch size used for training and evaluation
    :param testing: if true, training-set accuracies are recorded as well
    :param learning_rate: learning rate handed to the Adam optimizer
    :return: an AccuracyReport object
    """
    # Holds every accuracy measured below.
    report = AccuracyReport()

    # Fixed graph-level seed.
    tf.set_random_seed(42)

    # Session configuration with one GPU and one CPU device.
    config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 1})

    # Share a single TF session with Keras.
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Full MNIST train and test sets.
    mnist = MNIST()
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Input geometry and number of classes, read from the data.
    n_rows, n_cols, n_channels = x_train.shape[1:4]
    n_classes = y_train.shape[1]
    input_shape = (n_rows, n_cols, n_channels)

    # Attack settings and fit options shared by both runs.
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    fit_options = dict(batch_size=batch_size,
                       epochs=num_epochs,
                       validation_data=(x_test, y_test),
                       verbose=1)

    def show_test_accuracies(clean, adversarial):
        print('Test accuracy on legitimate examples: %0.4f' % clean)
        print('Test accuracy on adversarial examples: %0.4f\n' % adversarial)

    # ---- Run 1: plain cross-entropy training ----
    model = ConvNet(input_shape, n_classes)
    model(model.input)

    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    adv_acc_metric = get_adversarial_acc_metric(model, fgsm, fgsm_params)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy', adv_acc_metric]
    )

    model.fit(x_train, y_train, **fit_options)

    # Test-set accuracy, clean and adversarial.
    _, acc, adv_acc = model.evaluate(x_test, y_test,
                                     batch_size=batch_size,
                                     verbose=0)
    report.clean_train_clean_eval = acc
    report.clean_train_adv_eval = adv_acc
    show_test_accuracies(acc, adv_acc)

    # Training-set accuracy, clean and adversarial.
    if testing:
        _, train_acc, train_adv_acc = model.evaluate(x_train, y_train,
                                                     batch_size=batch_size,
                                                     verbose=0)
        report.train_clean_train_clean_eval = train_acc
        report.train_clean_train_adv_eval = train_adv_acc

    print("Repeating the process, using adversarial training")

    # ---- Run 2: adversarial training on a fresh network ----
    model_2 = ConvNet(input_shape, n_classes)
    model_2(model_2.input)

    wrap_2 = KerasModelWrapper(model_2)
    fgsm_2 = FastGradientMethod(wrap_2, sess=sess)

    # Loss built from both legitimate and adversarial examples.
    adv_loss_2 = get_adversarial_loss(model_2, fgsm_2, fgsm_params)
    adv_acc_metric_2 = get_adversarial_acc_metric(model_2, fgsm_2,
                                                  fgsm_params)
    model_2.compile(
        optimizer=keras.optimizers.Adam(learning_rate),
        loss=adv_loss_2,
        metrics=['accuracy', adv_acc_metric_2]
    )

    model_2.fit(x_train, y_train, **fit_options)

    # Test-set accuracy, clean and adversarial.
    _, acc, adv_acc = model_2.evaluate(x_test, y_test,
                                       batch_size=batch_size,
                                       verbose=0)
    report.adv_train_clean_eval = acc
    report.adv_train_adv_eval = adv_acc
    show_test_accuracies(acc, adv_acc)

    # Training-set accuracy, clean and adversarial.
    if testing:
        _, train_acc, train_adv_acc = model_2.evaluate(x_train, y_train,
                                                       batch_size=batch_size,
                                                       verbose=0)
        report.train_adv_train_clean_eval = train_acc
        report.train_adv_train_adv_eval = train_adv_acc

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   clean_train=CLEAN_TRAIN,
                   testing=False,
                   backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                   nb_filters=NB_FILTERS, num_threads=None,
                   label_smoothing=0.1):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: accepted for interface compatibility; not used by
                       this function
    :param num_threads: if truthy, the number of intra-op threads the TF
                        session may use; otherwise TF's default is kept
    :param label_smoothing: float, amount of label smoothing for cross entropy
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session. Honour the requested thread count instead of always
    # pinning the session to a single intra-op thread.
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=num_threads)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST data
    mnist = MNIST(train_start=train_start, train_end=train_end,
                  test_start=test_start, test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Settings shared by both training runs and all evaluations
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        # Measure accuracy of `preds` on (x_set, y_set) and store it on the
        # report under `report_key`. `is_adv` only selects the wording of the
        # printed line; None means "record silently".
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = make_basic_picklable_cnn()
        # Tag the model so that when it is saved to disk, future scripts will
        # be able to tell what data it was trained on
        model.dataset_factory = mnist.get_factory()
        preds = model.get_logits(x)
        assert len(model.get_params()) > 0
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        with sess.as_default():
            save("clean_model.joblib", model)

            print("Now that the model has been saved, you can evaluate it in a"
                  " separate process using `evaluate_pickled_model.py`. "
                  "You should get exactly the same result for both clean and "
                  "adversarial accuracy as you get within this program.")

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = make_basic_picklable_cnn()
    # Tag the model so that when it is saved to disk, future scripts will
    # be able to tell what data it was trained on
    model2.dataset_factory = mnist.get_factory()
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    with sess.as_default():
        save("adv_model.joblib", model2)
        print(
            "Now that the model has been saved, you can evaluate it in a "
            "separate process using "
            "`python evaluate_pickled_model.py adv_model.joblib`. "
            "You should get exactly the same result for both clean and "
            "adversarial accuracy as you get within this program."
            " You can also move beyond the tutorials directory and run the "
            " real `compute_accuracy.py` script (make sure cleverhans/scripts "
            "is in your PATH) to see that this FGSM-trained "
            "model is actually not very robust---it's just a model that trains "
            " quickly so the tutorial does not take a long time")

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
def _others_per_class(samples, class_idxs):
    """For each class c, collect samples[class_idxs[t]] for every t != c.

    The result has one row per class; row c holds the representatives of all
    other classes in increasing class order.
    """
    n_classes = len(class_idxs)
    return np.array([[samples[class_idxs[t]]
                      for t in range(n_classes) if t != c]
                     for c in range(n_classes)])


def _fill_viz_grids(grid_recon, grid_adv, recon_orig, adv_inputs, recon_adv,
                    adv_examples, nb_classes):
    """Fill the two nb_classes x nb_classes visualization grids in place.

    Diagonal cell (c, c) shows the clean source of class c (its
    reconstruction in `grid_recon`, the raw input in `grid_adv`).
    Off-diagonal cell (i, j) shows the example crafted from source class i
    towards target class j.
    """
    per_source = nb_classes - 1
    for j in range(nb_classes):
        for i in range(nb_classes):
            if i == j:
                grid_recon[i, j] = recon_orig[i * per_source]
                grid_adv[i, j] = adv_inputs[i * per_source]
            else:
                # Each source has no entry for its own class, so targets
                # above the source class are shifted down by one.
                k = i * per_source + (j - 1 if j > i else j)
                grid_recon[i, j] = recon_adv[k]
                grid_adv[i, j] = adv_examples[k]


def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS,
                      batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH,
                      model_path_cls=MODEL_PATH,
                      targeted=TARGETED):
    """Train an MNIST autoencoder and latent classifier, then attack them.

    Steps performed, in order:

    1. Load MNIST and build the TF placeholders.
    2. Build a `ModelBasicAE` autoencoder and a `ModelCls` classifier.
    3. Build source/target pairs from the first test example of every class.
    4. Train the autoencoder with `train_ae` and save it to `model_path`.
    5. Train the classifier on the autoencoder's latent codes with
       `train_cls_lat` and save to `model_path_cls`.
    6. Run `CarliniWagnerAE` on the pairs, print the metrics returned by
       `model_eval_ae` / `model_eval`, and optionally display grids.
    7. If the module-level flag `adv_train` is set, train a second
       autoencoder on clean plus adversarial data, re-attack it, and return.
    8. Otherwise, if `binarization_defense` or `mean_filtering` is set,
       apply that input transformation to the adversarial examples and
       re-evaluate.

    Besides its parameters, this function reads the module-level names
    `adv_train`, `binarization_defense`, `mean_filtering` and
    `CW_LEARNING_RATE`, which are not defined in this block.

    NOTE(review): the original indentation of this function was lost; the
    nesting below follows the CleverHans `mnist_tutorial_cw` layout and
    should be confirmed. As reconstructed, the attack inputs are only built
    when `targeted` is true, so the untargeted configuration is not usable.

    Parameters
    ----------
    train_start, train_end, test_start, test_end : int
        Index ranges forwarded to `MNIST`.
    viz_enabled : bool
        Whether to build and display the visualization grids.
    nb_epochs, batch_size, learning_rate
        Training hyper-parameters, shared by both training runs.
    source_samples : int
        Number of source examples; the reshapes below require it to equal
        the number of classes.
    attack_iterations : int
        Passed to the attack as `max_iterations`.
    model_path, model_path_cls : str
        Checkpoint paths for the autoencoder and the classifier.
    targeted : bool
        Whether to build targeted attack pairs and report metrics.

    Returns
    -------
    The `AccuracyReport` when `adv_train` is set; otherwise the function
    falls off the end and returns None.
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)
    nb_latent_size = 100

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # One crafted example per (source, other-class target) pair.
    per_source = nb_classes - 1
    n_adv = source_samples * per_source
    image_batch_shape = (n_adv, img_rows, img_cols, nchannels)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    x_t = tf.placeholder(tf.float32,
                         shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    # y_t is not used below; kept so the graph's auto-generated placeholder
    # names stay the same.
    y_t = tf.placeholder(tf.float32, shape=(None, nb_classes))
    z = tf.placeholder(tf.float32, shape=(None, nb_latent_size))
    z_t = tf.placeholder(tf.float32, shape=(None, nb_latent_size))

    nb_layers = 500

    # Define TF model graph
    model = ModelBasicAE('model', nb_layers, nb_latent_size)
    cl_model = ModelCls('cl_model')
    recons = model.get_layer(x, 'RECON')
    loss = SquaredError(model)
    print("Defined TensorFlow model graph.")
    loss_cls = CrossEntropy(cl_model)
    y_logits = cl_model.get_layer(z, 'LOGITS')

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }
    train_params_cls = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path_cls)[-1]
    }
    rng = np.random.RandomState([2017, 8, 30])

    eval_params_cls = {'batch_size': batch_size}

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerAE(model, cl_model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
    # Index of the first test example of every class. Computed regardless of
    # viz_enabled because the attack inputs below always need it.
    idxs = [
        np.where(np.argmax(y_test, axis=1) == i)[0][0]
        for i in range(nb_classes)
    ]

    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')
            grid_viz_data_1 = np.zeros(grid_shape, dtype='f')

        # Every source image (and its label) repeated once per target class.
        adv_inputs = np.array([[instance] * per_source
                               for instance in x_test[idxs]],
                              dtype=np.float32)
        adv_input_y = np.array([[y_test[idx]] * per_source for idx in idxs])

        # Labels and images of the other classes' representatives.
        adv_target_y = _others_per_class(y_test, idxs)
        adv_input_targets = _others_per_class(x_test, idxs)

        adv_inputs = adv_inputs.reshape(image_batch_shape)
        adv_input_targets = adv_input_targets.reshape(image_batch_shape)
        adv_input_y = adv_input_y.reshape(n_adv, nb_classes)
        adv_target_y = adv_target_y.reshape(n_adv, nb_classes)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

    # Train the autoencoder to reconstruct its own input.
    train_ae(sess,
             loss,
             x_train,
             x_train,
             args=train_params,
             rng=rng,
             var_list=model.get_params())
    saver = tf.train.Saver()
    saver.save(sess, model_path)

    # Latent codes of the full train/test sets, evaluated to numpy arrays.
    x_train_lat = model.get_layer(x_train, 'LATENT')
    x_test_lat = model.get_layer(x_test, 'LATENT')
    x_train_lat = sess.run(x_train_lat)
    x_test_lat = sess.run(x_test_lat)

    def do_eval_cls(preds,
                    x_set,
                    y_set,
                    x_tar_set,
                    report_key,
                    is_adv=None):
        """Evaluate `preds`, store the result on `report`, maybe print it."""
        acc = model_eval(sess,
                         z,
                         y,
                         preds,
                         z_t,
                         x_set,
                         y_set,
                         x_tar_set,
                         args=eval_params_cls)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    def eval_cls():
        do_eval_cls(y_logits, x_test_lat, y_test, x_test_lat,
                    'clean_train_clean_eval', False)

    # Train the classifier on latent codes rather than on images.
    train_cls_lat(sess,
                  loss_cls,
                  x_train_lat,
                  y_train,
                  evaluate=eval_cls,
                  args=train_params_cls,
                  rng=rng,
                  var_list=cl_model.get_params())
    saver.save(sess, model_path_cls)

    adv_ys = np.array([one_hot] * source_samples, dtype=np.float32).reshape(
        (source_samples * nb_classes, nb_classes))
    yname = "y_target"

    cw_params_batch_size = n_adv
    cw_params = {
        'binary_search_steps': 10,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': CW_LEARNING_RATE,
        'batch_size': cw_params_batch_size,
        'initial_const': 1
    }

    adv = cw.generate_np(adv_inputs, adv_input_targets, **cw_params)

    recon_orig = model.get_layer(adv_inputs, 'RECON')
    lat_adv = model.get_layer(adv, 'LATENT')
    recon_adv = model.get_layer(adv, 'RECON')
    lat_orig = model.get_layer(x, 'LATENT')
    lat_orig_recon = model.get_layer(recons, 'LATENT')
    pred_adv_recon = cl_model.get_layer(lat_adv, 'LOGITS')

    # Evaluate all crafted examples in a single batch.
    eval_params = {'batch_size': n_adv}
    if targeted:
        noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae(
            sess,
            x,
            x_t,
            recons,
            adv_inputs,
            adv_input_targets,
            adv,
            recon_adv,
            lat_orig,
            lat_orig_recon,
            args=eval_params)
        acc = model_eval(sess,
                         x,
                         y,
                         pred_adv_recon,
                         x_t,
                         adv_inputs,
                         adv_target_y,
                         adv_input_targets,
                         args=eval_params_cls)
        print("noise: ", noise)
        print("classifier acc: ", acc)

    recon_adv = sess.run(recon_adv)
    recon_orig = sess.run(recon_orig)

    if viz_enabled and targeted:
        _fill_viz_grids(grid_viz_data, grid_viz_data_1, recon_orig,
                        adv_inputs, recon_adv, adv, nb_classes)

    print('--------------------------------------')

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # The session stays open here: the optional sections below still use it.

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)
        _ = grid_visual(grid_viz_data_1)

    # adversarial training
    if (adv_train == True):
        print("starting adversarial training")
        nb_adv_rounds = 20
        adv_input_set = []
        adv_input_target_set = []
        for _ in range(nb_adv_rounds):
            # Reshuffle the training set so each round picks different
            # per-class representatives (uses NumPy's global RNG).
            indices = np.arange(np.shape(x_train)[0])
            np.random.shuffle(indices)
            print("indices: ", indices[1:10])
            x_train = x_train[indices]
            y_train = y_train[indices]
            train_idxs = [
                np.where(np.argmax(y_train, axis=1) == i)[0][0]
                for i in range(nb_classes)
            ]
            adv_inputs_2 = np.array([[instance] * per_source
                                     for instance in x_train[train_idxs]],
                                    dtype=np.float32)
            adv_input_targets_2 = _others_per_class(x_train, train_idxs)

            adv_inputs_2 = adv_inputs_2.reshape(image_batch_shape)
            adv_input_targets_2 = adv_input_targets_2.reshape(
                image_batch_shape)

            adv_input_set.append(adv_inputs_2)
            adv_input_target_set.append(adv_input_targets_2)

        # Stack the rounds, then merge the round axis into the batch axis.
        # (A stray trailing comma used to wrap adv_input_set in a 1-tuple,
        # which the old six-index reshape silently compensated for.)
        adv_input_set = np.array(adv_input_set)
        adv_input_target_set = np.array(adv_input_target_set)
        print("shape of adv_input_set: ", np.shape(adv_input_set))
        print("shape of adv_input_target_set: ",
              np.shape(adv_input_target_set))
        adv_input_set = adv_input_set.reshape(
            (-1,) + adv_input_set.shape[2:])
        adv_input_target_set = adv_input_target_set.reshape(
            (-1,) + adv_input_target_set.shape[2:])
        print("generated adversarial training set")

        adv_set = cw.generate_np(adv_input_set, adv_input_target_set,
                                 **cw_params)

        # Inputs are clean + adversarial images; reconstruction targets are
        # clean images + the clean sources of the adversarial images.
        x_train_aim = np.append(x_train, adv_input_set, axis=0)
        x_train_app = np.append(x_train, adv_set, axis=0)

        model_adv_trained = ModelBasicAE('model_adv_trained', nb_layers,
                                         nb_latent_size)
        # NOTE(review): recons_2 is never used below; the evaluation further
        # down passes `recons` (the first model's reconstruction) where
        # recons_2 looks intended. Left unchanged pending confirmation.
        recons_2 = model_adv_trained.get_layer(x, 'RECON')
        loss_2 = SquaredError(model_adv_trained)
        train_ae(sess,
                 loss_2,
                 x_train_app,
                 x_train_aim,
                 args=train_params,
                 rng=rng,
                 var_list=model_adv_trained.get_params())
        # NOTE(review): this writes to the same model_path as the first
        # autoencoder checkpoint -- confirm the overwrite is intended.
        saver = tf.train.Saver()
        saver.save(sess, model_path)

        cw2 = CarliniWagnerAE(model_adv_trained, cl_model, sess=sess)
        adv_2 = cw2.generate_np(adv_inputs, adv_input_targets, **cw_params)

        recon_orig = model_adv_trained.get_layer(adv_inputs, 'RECON')
        recon_adv = model_adv_trained.get_layer(adv_2, 'RECON')
        lat_orig = model_adv_trained.get_layer(x, 'LATENT')
        lat_orig_recon = model_adv_trained.get_layer(recons, 'LATENT')
        # NOTE(review): the other sections feed the classifier a latent
        # code; here it receives the reconstruction -- confirm.
        pred_adv_recon = cl_model.get_layer(recon_adv, 'LOGITS')

        eval_params = {'batch_size': n_adv}
        if targeted:
            noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae(
                sess,
                x,
                x_t,
                recons,
                adv_inputs,
                adv_input_targets,
                adv_2,
                recon_adv,
                lat_orig,
                lat_orig_recon,
                args=eval_params)
            acc = model_eval(sess,
                             x,
                             y,
                             pred_adv_recon,
                             x_t,
                             adv_inputs,
                             adv_target_y,
                             adv_input_targets,
                             args=eval_params_cls)
            print("noise: ", noise)
            print("classifier acc: ", acc)

        recon_adv = sess.run(recon_adv)
        recon_orig = sess.run(recon_orig)

        if viz_enabled:
            _fill_viz_grids(grid_viz_data, grid_viz_data_1, recon_orig,
                            adv_inputs, recon_adv, adv_2, nb_classes)

        print('--------------------------------------')

        # Compute the average distortion introduced by the algorithm
        percent_perturbed = np.mean(
            np.sum((adv_2 - adv_inputs)**2, axis=(1, 2, 3))**.5)
        print(
            'Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

        # Close TF session
        sess.close()

        # Finally, block & display a grid of all the adversarial examples
        if viz_enabled:
            _ = grid_visual(grid_viz_data)
            _ = grid_visual(grid_viz_data_1)

        return report

    # binarization defense
    if (binarization_defense == True or mean_filtering == True):
        if (binarization_defense == True):
            # Threshold the adversarial images in place to {0, 1}.
            adv[adv > 0.5] = 1.0
            adv[adv <= 0.5] = 0.0
        else:
            # Mean-filter only the two spatial axes. A scalar size would
            # also average across the batch axis and blend neighbouring
            # examples into each other.
            # Assumes uniform_filter is scipy.ndimage's -- TODO confirm
            # against the file's imports.
            adv = uniform_filter(adv, (1, 2, 2, 1))

        recon_orig = model.get_layer(adv_inputs, 'RECON')
        recon_adv = model.get_layer(adv, 'RECON')
        lat_adv = model.get_layer(adv, 'LATENT')
        lat_orig = model.get_layer(x, 'LATENT')
        lat_orig_recon = model.get_layer(recons, 'LATENT')
        pred_adv_recon = cl_model.get_layer(lat_adv, 'LOGITS')

        eval_params = {'batch_size': n_adv}
        if targeted:
            noise, d1, d2, dist_diff, avg_dist_lat = model_eval_ae(
                sess,
                x,
                x_t,
                recons,
                adv_inputs,
                adv_input_targets,
                adv,
                recon_adv,
                lat_orig,
                lat_orig_recon,
                args=eval_params)
            # acc1 scores against the attack's target labels, acc2 against
            # the sources' own labels.
            acc1 = model_eval(sess,
                              x,
                              y,
                              pred_adv_recon,
                              x_t,
                              adv_inputs,
                              adv_target_y,
                              adv_input_targets,
                              args=eval_params_cls)
            acc2 = model_eval(sess,
                              x,
                              y,
                              pred_adv_recon,
                              x_t,
                              adv_inputs,
                              adv_input_y,
                              adv_input_targets,
                              args=eval_params_cls)
            print("noise: ", noise)
            print("classifier acc for target class: ", acc1)
            print("classifier acc for true class: ", acc2)

        recon_adv = sess.run(recon_adv)
        recon_orig = sess.run(recon_orig)

        if viz_enabled:
            _fill_viz_grids(grid_viz_data, grid_viz_data_1, recon_orig,
                            adv_inputs, recon_adv, adv, nb_classes)

        # NOTE(review): when neither adv_train nor a defense is enabled the
        # session is never closed -- confirm whether that is intended.
        sess.close()

        # The grids only exist when visualization is enabled.
        if viz_enabled:
            _ = grid_visual(grid_viz_data)
            _ = grid_visual(grid_viz_data_1)