def main(net_type):
    """Train a ResNet on CIFAR-10, attack it with FGSM, then adversarially
    retrain a fresh copy and attack that one as well.

    :param net_type: network variant forwarded to resnet_cifar10 (e.g.
        'squared_resnet', which triggers an extra Adam pre-training pass).
    """
    # Keras must use Theano-style (channels-first) ordering to match the
    # (None, 3, 32, 32) placeholder below.
    if keras.backend.image_dim_ordering() != 'th':
        keras.backend.set_image_dim_ordering('th')
        print("INFO: temporarily set 'image_dim_ordering' to 'th'")
    sess = get_session()
    keras.backend.set_session(sess)

    (train_xs, train_ys), (test_xs, test_ys) = data_cifar10.load_cifar10()
    print('Loaded cifar10 data')

    # Define input and output TF placeholders (channels-first CIFAR-10).
    x = tf.placeholder(tf.float32, shape=(None, 3, 32, 32))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model, model_name = resnet_cifar10.resnet_cifar10(repetations=3,
                                                      net_type=net_type)
    if net_type == 'squared_resnet':
        model = adam_pretrain(model, model_name, train_xs, train_ys, 1,
                              test_xs, test_ys)
    predictions = model(x)
    tf_model_train(sess, x, y, predictions, train_xs, train_ys,
                   test_xs, test_ys,
                   data_augmentor=data_cifar10.augment_batch)
    save_model(model, model_name)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    test_xs_adv, = batch_eval(sess, [x], [adv_x], [test_xs])
    assert test_xs_adv.shape[0] == 10000, test_xs_adv.shape

    # Evaluate the accuracy of the CIFAR-10 model on adversarial examples
    accuracy = tf_model_eval(sess, x, y, predictions, test_xs_adv, test_ys)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2, _ = resnet_cifar10.resnet_cifar10(repetations=3,
                                               net_type=net_type)
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    # Perform adversarial training
    tf_model_train(sess, x, y, predictions_2, train_xs, train_ys,
                   test_xs, test_ys,
                   predictions_adv=predictions_2_adv,
                   data_augmentor=data_cifar10.augment_batch)
    # BUG FIX: save the adversarially trained model (model_2); the original
    # saved the first, non-adversarially-trained `model` under the '_adv'
    # name.
    save_model(model_2, model_name + '_adv')

    # Craft adversarial examples using FGSM on the new model, which was
    # trained using adversarial training
    test_xs_adv_2, = batch_eval(sess, [x], [adv_x_2], [test_xs])
    assert test_xs_adv_2.shape[0] == 10000, test_xs_adv_2.shape

    # Evaluate the adversarially trained model on adversarial examples
    accuracy_adv = tf_model_eval(sess, x, y, predictions_2, test_xs_adv_2,
                                 test_ys)
    print('Test accuracy on adversarial examples: ' + str(accuracy_adv))
def basic_iterative_method(sess, model, X, Y, eps, eps_iter, nb_iter=50,
                           clip_min=None, clip_max=None, batch_size=256):
    """Run the Basic Iterative Method (iterated FGSM) and record when each
    sample first becomes misclassified.

    :param sess: TF session
    :param model: Keras-style model; model(x) yields (after-softmax) predictions
    :param X: input samples, shape (n_samples, ...)
    :param Y: one-hot labels matching X
    :param eps: total perturbation budget (L-inf ball around X)
    :param eps_iter: per-iteration FGSM step size
    :param nb_iter: number of BIM iterations
    :param clip_min: optional lower clip bound for inputs
    :param clip_max: optional upper clip bound for inputs
    :param batch_size: batch size for graph evaluation
    :return: (its, results) where `its` maps sample index -> first iteration
        at which it was misclassified (default nb_iter-1), and `results` has
        shape (nb_iter,) + X.shape holding the adversarial inputs after each
        iteration.
    """
    sess.run(tf.initialize_all_variables())

    # Define TF placeholders for the input and output
    x = tf.placeholder(tf.float32, shape=(None,) + X.shape[1:])
    y = tf.placeholder(tf.float32, shape=(None,) + Y.shape[1:])

    # BUG FIX: build the one-step FGSM graph ONCE, outside the loop.  The
    # original rebuilt it on every iteration, adding duplicate nodes to the
    # default graph (memory growth, progressively slower iterations).  The
    # tensor is loop-invariant, so behavior is unchanged.
    adv_x = fgsm(
        x, model(x), eps=eps_iter,
        clip_min=clip_min, clip_max=clip_max, y=y
    )

    # results will hold the adversarial inputs at each iteration of BIM;
    # thus it will have shape (nb_iter, n_samples, n_rows, n_cols, n_channels)
    results = np.zeros((nb_iter, X.shape[0],) + X.shape[1:])

    # Initialize adversarial samples as the original samples, set upper and
    # lower bounds of the eps-ball around X
    X_adv = X
    X_min = X_adv - eps
    X_max = X_adv + eps
    print('Running BIM iterations...')

    # "its" keeps track of the iteration at which each sample becomes
    # misclassified; default is the very last iteration (nb_iter-1).
    # (The original wrapped this in a helper factory; a lambda suffices.)
    its = defaultdict(lambda: nb_iter - 1)
    # "out" tracks samples that have already been misclassified
    out = set()

    for i in tqdm(range(nb_iter)):
        X_adv, = batch_eval(
            sess, [x, y], [adv_x],
            [X_adv, Y], feed={K.learning_phase(): 0},
            args={'batch_size': batch_size}
        )
        # Project back into the eps-ball around the original inputs.
        X_adv = np.maximum(np.minimum(X_adv, X_max), X_min)
        results[i] = X_adv

        # Record which samples are misclassified at this iteration.
        predict = model.predict(X_adv)
        predict = np.argmax(predict, axis=1)
        misclassifieds = np.where(predict != Y.argmax(axis=1))[0]
        for elt in misclassifieds:
            if elt not in out:
                its[elt] = i
                out.add(elt)
    return its, results
def fast_gradient_sign_method(sess, model, X, Y, eps, clip_min=None,
                              clip_max=None, batch_size=256):
    """Craft one-step FGSM adversarial examples for every sample in X.

    :param sess: TF session
    :param model: Keras-style model; model(x) yields predictions (after-softmax)
    :param X: input samples
    :param Y: one-hot labels matching X
    :param eps: FGSM step size
    :param clip_min: optional lower clip bound
    :param clip_max: optional upper clip bound
    :param batch_size: batch size for graph evaluation
    :return: numpy array of adversarial examples, same shape as X
    """
    # Placeholders shaped after the incoming data (batch dimension free).
    in_shape = (None,) + X.shape[1:]
    label_shape = (None,) + Y.shape[1:]
    x = tf.placeholder(tf.float32, shape=in_shape)
    y = tf.placeholder(tf.float32, shape=label_shape)

    # Symbolic single-step attack, then evaluate it batch-wise in test mode.
    attack_tensor = fgsm(x, model(x), eps=eps,
                         clip_min=clip_min, clip_max=clip_max, y=y)
    outputs = batch_eval(sess, [x, y], [attack_tensor], [X, Y],
                         feed={K.learning_phase(): 0},
                         args={'batch_size': batch_size})
    return outputs[0]
def train_substitute(sess, x, y, bbox_preds, X_sub, Y_sub):
    """
    This function creates the substitute by alternatively augmenting the
    training data and training the substitute.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions (a tensor when
        args.alg == "cnn"; otherwise an object with a .predict() method —
        both branches below are used)
    :param X_sub: initial substitute training data
    :param Y_sub: initial substitute training labels
    :return: (substitute model, substitute prediction tensor)
    """
    # NOTE(review): also reads module globals `args`, `FLAGS`, `autoencoder`
    # and `is_not_nn` — confirm they are defined at module level.
    # Define TF model graph (for the black-box model)
    model_sub = substitute_model()
    preds_sub = model_sub(x)

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, FLAGS.nb_classes)

    # Train the substitute and augment dataset alternatively
    for rho in range(FLAGS.data_aug):
        print("Epoch #" + str(rho))
        train_params = {
            'nb_epochs': FLAGS.nb_epochs_s,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess, x, y, preds_sub, X_sub, to_categorical(Y_sub),
                    init_all=False, verbose=False, args=train_params)

        # If we are not at last substitute training iteration, augment dataset
        if rho < FLAGS.data_aug - 1:
            # Perform the Jacobian augmentation (doubles the dataset size)
            X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads,
                                          FLAGS.lmbda)

            # Label the newly generated synthetic points using the black-box;
            # only the second (new) half of X_sub needs fresh labels.
            Y_sub = np.hstack([Y_sub, Y_sub])
            X_sub_prev = X_sub[int(len(X_sub)/2):]

            # First feed forward a denoising autoencoder.
            if args.ae:
                print("Denoising...")
                num_data = X_sub_prev.shape[0]
                autoencoder.visualize(sess, X_sub_prev.reshape(num_data, -1),
                                      "sub{}".format(rho))
                filtered_data = autoencoder.run(sess,
                                                X_sub_prev.reshape(num_data, -1))
                # assumes 28x28x1 (MNIST-shaped) inputs — TODO confirm
                X_sub_prev = filtered_data.reshape(num_data, 28, 28, 1)

            if args.alg == "cnn":
                eval_params = {'batch_size': FLAGS.batch_size}
                bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev],
                                      args=eval_params)[0]
                # Note here that we take the argmax because the adversary
                # only has access to the label (not the probabilities) output
                # by the black-box model
                Y_sub_prev = np.argmax(bbox_val, axis=1)
            elif is_not_nn():
                # Non-NN oracle (e.g. boosted trees): flatten and query directly.
                x_sub_prev = X_sub_prev.reshape(X_sub_prev.shape[0], -1)
                Y_sub_prev = bbox_preds.predict(x_sub_prev)
            Y_sub[int(len(X_sub)/2):] = Y_sub_prev
    return model_sub, preds_sub
def get_activations(data):
    """Evaluate each monitored layer's flattened activations on `data`.

    Relies on the enclosing scope for `layers`, `model`, `x`, `sess` and
    `FLAGS`.  Returns a dict mapping layer name -> numpy activation array.
    """
    batch_args = {'batch_size': FLAGS.batch_size}

    def _layer_values(name):
        # Flatten so every layer yields a 2-D (batch, features) array.
        sym = tf.layers.flatten(model.get_layer(x, name))
        return batch_eval(sess, [x], [sym], [data], args=batch_args)[0]

    return {name: _layer_values(name) for name in layers}
def train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes, nb_epochs_s, batch_size, learning_rate, data_aug, lmbda, rng): """ This function creates the substitute by alternatively augmenting the training data and training the substitute. :param sess: TF session :param x: input TF placeholder :param y: output TF placeholder :param bbox_preds: output of black-box model predictions :param X_sub: initial substitute training data :param Y_sub: initial substitute training labels :param nb_classes: number of output classes :param nb_epochs_s: number of epochs to train substitute model :param batch_size: size of training batches :param learning_rate: learning rate for training :param data_aug: number of times substitute training data is augmented :param lmbda: lambda from arxiv.org/abs/1602.02697 :param rng: numpy.random.RandomState instance :return: """ # Define TF model graph (for the black-box model) model_sub = substitute_model() preds_sub = model_sub(x) log_raw.info("Defined TensorFlow model graph for the substitute.") # Define the Jacobian symbolically using TensorFlow grads = jacobian_graph(preds_sub, x, nb_classes) # Train the substitute and augment dataset alternatively for rho in xrange(data_aug): log_raw.info("Substitute training epoch #" + str(rho)) train_params = { 'nb_epochs': nb_epochs_s, 'batch_size': batch_size, 'learning_rate': learning_rate } model_train(sess, x, y, preds_sub, X_sub, to_categorical(Y_sub), init_all=False, verbose=False, args=train_params, rng=rng) # If we are not at last substitute training iteration, augment dataset if rho < data_aug - 1: log_raw.info("Augmenting substitute training data.") # Perform the Jacobian augmentation X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads, lmbda) log_raw.info("Labeling substitute training data.") # Label the newly generated synthetic points using the black-box Y_sub = np.hstack([Y_sub, Y_sub]) X_sub_prev = X_sub[int(len(X_sub)/2):] eval_params = {'batch_size': batch_size} bbox_val = 
batch_eval(sess, [x], [bbox_preds], [X_sub_prev], args=eval_params)[0] # Note here that we take the argmax because the adversary # only has access to the label (not the probabilities) output # by the black-box model Y_sub[int(len(X_sub)/2):] = np.argmax(bbox_val, axis=1) return model_sub, preds_sub
def main(argv=None):
    """
    MNIST cleverhans tutorial: train a substitute for an xgboost black-box
    oracle and transfer FGSM adversarial examples to it.
    :return:
    """
    keras.layers.core.K.set_learning_phase(0)

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist()

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:FLAGS.holdout]
    Y_sub = np.argmax(Y_test[:FLAGS.holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[FLAGS.holdout:]
    Y_test = Y_test[FLAGS.holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    # NOTE(review): `bbox` appears to be an xgboost booster (it is queried
    # with xgb.DMatrix below) — confirm train_sub accepts a model object in
    # the bbox_preds position.
    bbox = prep_bbox(X_train, Y_train, X_test, Y_test)

    print("Training the substitute model.")
    # Train substitute using method from https://arxiv.org/abs/1602.02697
    model_sub, preds_sub = train_sub(sess, x, y, bbox, X_sub, Y_sub)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': FLAGS.batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    X_test_adv, = batch_eval(sess, [x], [x_adv_sub], [X_test],
                             args=eval_params)
    x_test_adv = X_test_adv.reshape(X_test_adv.shape[0], -1)
    y_test = np.argmax(Y_test, axis=1)
    xg_adv = xgb.DMatrix(x_test_adv, label=y_test)
    pred_adv = bbox.predict(xg_adv)
    # NOTE(review): under Python 2 this would be integer (floor) division
    # and truncate to 0 — confirm this script only runs on Python 3, or add
    # `from __future__ import division`.
    accuracy = np.sum(pred_adv == y_test) / y_test.shape[0]
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
def evaluate(model, sess, x, X_test, y_test, X_test_adv, attackType,
             print_image_index = [], testClean = False, saveFlag = True):
    """Report model predictions on (optionally clean and) adversarial
    examples, summarise perturbation L2 norms, and optionally save images.

    :param model: Keras-style callable model; model(x) yields predictions.
    :param sess: TF session used for batched evaluation.
    :param x: input TF placeholder fed to the model.
    :param X_test: clean test inputs.
    :param y_test: labels; column 1 is indexed below, presumably the "sick"
        class — TODO confirm against caller.
    :param X_test_adv: adversarial counterparts of X_test (same ordering).
    :param attackType: string tag used in saved file names.
    :param print_image_index: indices of images to save when saveFlag is set.
        (NOTE: mutable default argument — only read here, so harmless.)
    :param testClean: also evaluate on the clean test set when True.
    :param saveFlag: save the most-perturbed and the requested images as
        PNGs under example_images/ when True.
    """
    from cleverhans.utils_tf import batch_eval
    eval_par = {'batch_size': 32}

    # Optionally test on clean examples
    if testClean:
        print("Clean Examples:")
        model_preds_clean = batch_eval(sess, [x], [model(x)], [X_test],
                                       args=eval_par)[0]
        printResults(model_preds_clean, y_test)
        print("")
    else:
        print("(Skipping clean examples)\n")

    # Evaluate results on adversarial examples
    print("Adversarial Examples:")
    model_preds = batch_eval(sess, [x], [model(x)], [X_test_adv],
                             args=eval_par)[0]
    printResults(model_preds, y_test)
    np.save( "data/pgd_preds_" + attackType + ".npy" , model_preds)

    # Calculate L2 norm of pertrubations
    l2_norm = np.sum((X_test_adv - X_test)**2, axis=(1, 2, 3))**.5
    l2_norm_sum = mean_ci(l2_norm)  # (mean, ci_low, ci_high)
    print('Avg. L2 norm of perturbations: ' +
          '{0:.6f} '.format(l2_norm_sum[0]) +
          '({0:.6f}'.format(l2_norm_sum[1]) +
          ' - {0:.6f})'.format(l2_norm_sum[2]))

    # Identify the most perturbed images from health and sick patients
    indMaxDiff_healthy = np.argmax(l2_norm[y_test[:,1] == 0])
    indMaxDiff_sick = np.argmax(l2_norm[y_test[:,1] == 1])
    # Shift the "sick" index back into whole-test-set coordinates.
    indMaxDiff_sick_shifted = indMaxDiff_sick + np.nonzero(y_test[:,1] == 1)[0][0]
    print("Most perturbed images are " + str(indMaxDiff_healthy) + " and " +
          str(indMaxDiff_sick_shifted))

    # Optionally save the most perturbed images and also any images whose
    # indices are in print_image_index
    if saveFlag:
        for ind in print_image_index:
            scipy.misc.imsave('example_images/normal_img_' + str(ind) + '.png',
                              deprocess_inception(X_test[ind]))
            scipy.misc.imsave('example_images/attack_pgd_img' + str(ind) +
                              attackType + '.png',
                              deprocess_inception(X_test_adv[ind]))
        scipy.misc.imsave('example_images/biggest_attack_' + attackType +
                          '_img' + str(indMaxDiff_healthy) + '.png',
                          deprocess_inception(X_test_adv[indMaxDiff_healthy]))
        scipy.misc.imsave('example_images/biggest_attack_' + attackType +
                          '_img' + str(indMaxDiff_sick_shifted) + '.png',
                          deprocess_inception(X_test_adv[indMaxDiff_sick_shifted]))
    return
def train_sub(sess, x, y, bbox_preds, X_sub, Y_sub): """ This function creates the substitute by alternatively augmenting the training data and training the substitute. :param sess: TF session :param x: input TF placeholder :param y: output TF placeholder :param bbox_preds: output of black-box model predictions :param X_sub: initial substitute training data :param Y_sub: initial substitute training labels :return: """ # Define TF model graph (for the black-box model) model_sub = substitute_model() preds_sub = model_sub(x) print("Defined TensorFlow model graph for the substitute.") # Define the Jacobian symbolically using TensorFlow grads = jacobian_graph(preds_sub, x, FLAGS.nb_classes) # Train the substitute and augment dataset alternatively for rho in xrange(FLAGS.data_aug): print("Substitute training epoch #" + str(rho)) train_params = { 'nb_epochs': FLAGS.nb_epochs_s, 'batch_size': FLAGS.batch_size, 'learning_rate': FLAGS.learning_rate } model_train(sess, x, y, preds_sub, X_sub, to_categorical(Y_sub), init_all=False, verbose=False, args=train_params) # If we are not at last substitute training iteration, augment dataset if rho < FLAGS.data_aug - 1: print("Augmenting substitute training data.") # Perform the Jacobian augmentation X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads, FLAGS.lmbda) print("Labeling substitute training data.") # Label the newly generated synthetic points using the black-box Y_sub = np.hstack([Y_sub, Y_sub]) X_sub_prev = X_sub[int(len(X_sub) / 2):] eval_params = {'batch_size': FLAGS.batch_size} bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev], args=eval_params)[0] # Note here that we take the argmax because the adversary # only has access to the label (not the probabilities) output # by the black-box model Y_sub[int(len(X_sub) / 2):] = np.argmax(bbox_val, axis=1) return model_sub, preds_sub
def main(argv=None): """ MNIST cleverhans tutorial :return: """ # Perform tutorial setup assert setup_tutorial() # Create TF session and set as Keras backend session sess = tf.Session() keras.backend.set_session(sess) # Get MNIST data X_train, Y_train, X_test, Y_test = data_mnist() # Initialize substitute training set reserved for adversary X_sub = X_test[:FLAGS.holdout] Y_sub = np.argmax(Y_test[:FLAGS.holdout], axis=1) # Redefine test set as remaining samples unavailable to adversaries X_test = X_test[FLAGS.holdout:] Y_test = Y_test[FLAGS.holdout:] # Define input and output TF placeholders x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1)) y = tf.placeholder(tf.float32, shape=(None, 10)) # Simulate the black-box model locally # You could replace this by a remote labeling API for instance print("Preparing the black-box model.") bbox_preds = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test) print("Training the substitute model.") # Train substitute using method from https://arxiv.org/abs/1602.02697 substitute_preds = train_substitute(sess, x, y, bbox_preds, X_sub, Y_sub) # Craft adversarial examples using the substitute adv_x = fgsm(x, substitute_preds, eps=0.2) eval_params = {'batch_size': FLAGS.batch_size} X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params) np.save("xorg_black.npy", X_test) np.save("xadv_black.npy",X_test_adv) np.save("ytest_black.npy",Y_test) np.save("xtrain.npy", X_train) np.save("ytrain.npy",Y_train) # Evaluate the accuracy of the "black-box" model on adversarial examples accuracy = model_eval(sess, x, y, bbox_preds, X_test_adv, Y_test, args=eval_params) print('Test accuracy of oracle on adversarial examples generated ' 'using the substitute: ' + str(accuracy))
def train_substitute(bbox_preds, x_sub, y_sub, nb_classes, nb_epochs_s,
                     batch_size, lr, data_aug, lmbda, aug_batch_size, rng,
                     img_rows=28, img_cols=28, nchannels=1):
    """Train a substitute model via Jacobian-based dataset augmentation
    (arxiv.org/abs/1602.02697).

    NOTE(review): reads `x`, `sess` and `show_plot` from the enclosing
    scope rather than taking them as parameters — confirm they exist at
    module level.  `img_rows`/`img_cols`/`nchannels` are unused here.

    :param bbox_preds: black-box prediction tensor used to label new points
    :param x_sub: initial substitute training data
    :param y_sub: initial substitute training labels (class indices)
    :param nb_classes: number of output classes
    :param nb_epochs_s: epochs per substitute training round
    :param batch_size: training/eval batch size
    :param lr: learning rate
    :param data_aug: number of augmentation rounds
    :param lmbda: augmentation step size lambda
    :param aug_batch_size: batch size used by jacobian_augmentation
    :param rng: numpy.random.RandomState instance
    :return: (model_sub, preds_sub, x_sub, y_sub)
    """
    model_sub = ModelSubstitute('model_s', nb_classes)
    preds_sub = model_sub.get_logits(x)
    loss_sub = CrossEntropy(model_sub, smoothing=0)
    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)

    # Train the substitute and augment dataset alternatively
    for i in xrange(data_aug):
        print("Substitute training epoch #" + str(i))
        train_params = {
            'nb_epochs': nb_epochs_s,
            'batch_size': batch_size,
            'learning_rate': lr
        }
        # Silence per-batch training logs for this library namespace.
        with TemporaryLogLevel(logging.WARNING, "cleverhans.utils.tf"):
            train(sess, loss_sub, x_sub, to_categorical(y_sub, nb_classes),
                  init_all=False, args=train_params, rng=rng,
                  var_list=model_sub.get_params())

        # If we are not at last substitute training iteration, augment dataset
        if i < data_aug - 1:
            print("Augmenting substitute training data.")
            # Alternate the sign of lambda: -1 for the first three rounds
            # (i < 3), +1 afterwards.
            lmbda_coef = 2 * int(int(i / 3) != 0) - 1
            x_sub = jacobian_augmentation(sess, x, x_sub, y_sub, grads,
                                          lmbda_coef * lmbda, aug_batch_size)

            print("Labeling substitute training data.")
            # Only the second (new) half of x_sub needs fresh labels.
            y_sub = np.hstack([y_sub, y_sub])
            x_sub_prev = x_sub[int(len(x_sub) / 2):]
            eval_params = {'batch_size': batch_size}
            bbox_val = batch_eval(sess, [x], [bbox_preds], [x_sub_prev],
                                  args=eval_params)[0]
            # argmax: the adversary only observes hard labels from the oracle.
            y_sub[int(len(x_sub) / 2):] = np.argmax(bbox_val, axis=1)
            show_plot(x_sub, y_sub)
    return model_sub, preds_sub, x_sub, y_sub
def main(argv=None):
    """
    MNIST cleverhans tutorial: black-box attack against either a CNN or a
    non-NN (boosted) oracle, with an optional denoising autoencoder defense.
    :return:
    """
    keras.layers.core.K.set_learning_phase(0)

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # sess = tf.Session()
    keras.backend.set_session(sess)

    if args.ae:
        # Restore model weights from previously saved model
        autoencoder.restore(sess, args.ae)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist()

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:FLAGS.holdout]
    Y_sub = np.argmax(Y_test[:FLAGS.holdout], axis=1)

    # Shrink training data.
    # X_train = X_train[:10000]
    # Y_train = Y_train[:10000]

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[FLAGS.holdout:]
    Y_test = Y_test[FLAGS.holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    print("Preparing the black-box model.")
    if args.alg == "cnn":
        model, bbox = prep_cnn_bbox(sess, x, y, X_train, Y_train,
                                    X_test, Y_test)
    elif is_not_nn():
        bbox = prep_boost_bbox(X_train, Y_train, X_test, Y_test)

    print("Training the substitute model.")
    model_sub, preds_sub = train_substitute(sess, x, y, bbox, X_sub, Y_sub)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    print("Crafting the adversarial examples.")
    eval_params = {'batch_size': FLAGS.batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)
    X_test_adv, = batch_eval(sess, [x], [x_adv_sub], [X_test],
                             args=eval_params)

    # Dump adversarial examples.
    # NOTE(review): `model_name` is not defined in this function — presumably
    # a module-level global; confirm.
    example_file = "example/{}.data".format(model_name)
    with open(example_file, "wb") as f:
        pickle.dump(X_test_adv, f)

    if args.ae:
        # Pass the adversarial examples through the denoising autoencoder
        # before querying the oracle.
        print("Denoising...")
        num_data = X_test_adv.shape[0]
        autoencoder.visualize(sess, X_test_adv.reshape(num_data, -1), "adv")
        filtered_data = autoencoder.run(sess, X_test_adv.reshape(num_data, -1))
        X_test_adv = filtered_data.reshape(num_data, 28, 28, 1)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    if args.alg == "cnn":
        accuracy = model_eval(sess, x, y, bbox, X_test_adv, Y_test,
                              args=eval_params)
    elif is_not_nn():
        # Non-NN oracle: flatten inputs and score directly.
        x_test_adv = X_test_adv.reshape(X_test_adv.shape[0], -1)
        y_test = np.argmax(Y_test, axis=1)
        accuracy = bbox.score(x_test_adv, y_test)
    print("Test adversarial accuracy = {}".format(accuracy))

    # Append the result to the per-model log file.
    log_file = "log/{}.log".format(model_name)
    with open(log_file, "a") as f:
        if args.ae:
            f.write("{}. Test adversarial accuracy = {}\n".format(args.ae,
                                                                  accuracy))
        else:
            f.write("Test adversarial accuracy = {}\n".format(accuracy))
def train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes, nb_epochs_s,
              batch_size, learning_rate, data_aug, lmbda, rng,
              substitute_model=None, dataset_name=None):
    """This function trains the substitute model as described in
    arxiv.org/abs/1602.02697

    Args:
        sess: TF session
        x: input TF placeholder
        y: output TF placeholder
        bbox_preds: output of black-box model predictions
        X_sub: initial substitute training data
        Y_sub: initial substitute training labels
        nb_classes: number of output classes
        nb_epochs_s: number of epochs to train substitute model
        batch_size: size of training batches
        learning_rate: learning rate for training
        data_aug: number of times substitute training data is augmented
        lmbda: lambda from arxiv.org/abs/1602.02697
        rng: numpy.random.RandomState instance
        substitute_model: pre-built substitute model object exposing
            get_params()/get_logits()
        dataset_name: name used to locate checkpoints under
            classifiers/sub_model/
    Returns:
        model_sub: The substitute model function.
        preds_sub: The substitute prediction tensor.
    """
    model_sub = substitute_model
    used_vars = model_sub.get_params()

    # Optionally restore a previously trained substitute instead of training.
    if FLAGS.load_sub_model:
        try:
            path = tf.train.latest_checkpoint(
                'classifiers/sub_model/{}'.format(dataset_name))
            saver = tf.train.Saver(var_list=used_vars)
            saver.restore(sess, path)
            print('[+] Sub model loaded successfully ...')
            pred_eval = model_sub.get_logits(x)
            return model_sub, pred_eval
        except Exception as err:
            # BUG FIX: the original used a bare `except: pass`, which
            # silently swallowed *every* error (including KeyboardInterrupt
            # and SystemExit).  Catch only Exception, report the cause, and
            # fall through to training from scratch as before.
            print('[-] Could not load sub model ({}); training from '
                  'scratch.'.format(err))

    pred_train = model_sub.get_logits(x, dropout=True)
    pred_eval = model_sub.get_logits(x)
    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow.
    grads = jacobian_graph(pred_eval, x, nb_classes)
    train_params = {
        'nb_epochs': nb_epochs_s,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': 'classifiers/sub_model/{}'.format(dataset_name),
        'filename': 'model_{}'.format(FLAGS.sub_model)
    }

    # Train the substitute and augment dataset alternatively.
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        model_train(sess, x, y, pred_train, X_sub,
                    convert_to_onehot(Y_sub), init_all=False,
                    args=train_params, rng=rng, save=True)

        # If we are not at last substitute training iteration, augment dataset.
        if rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation (doubles the dataset size).
            X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads, lmbda)

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box;
            # only the second (new) half of X_sub needs fresh labels.
            Y_sub = np.hstack([Y_sub, Y_sub])
            X_sub_prev = X_sub[int(len(X_sub) / 2):]
            eval_params = {'batch_size': batch_size}
            # To initialize the local variables of Defense-GAN.
            sess.run(tf.local_variables_initializer())
            bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev],
                                  args=eval_params)[0]
            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model.
            Y_sub[int(len(X_sub) / 2):] = np.argmax(bbox_val, axis=1)
    return model_sub, pred_eval
def mnist_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=10, holdout=150,
                   data_aug=6, nb_epochs_s=10, lmbda=0.1, attack="fgsm",
                   targeted=False):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param attack: "fgsm" or Carlini-Wagner L2 (any other value)
    :param targeted: run the targeted variant (9 target labels per example)
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    keras.layers.core.K.set_learning_phase(0)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1.0)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    keras.backend.set_session(sess)

    # Get MNIST (or CIFAR-10) data, depending on the module-level DATASET
    if DATASET == "mnist":
        X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                      train_end=train_end,
                                                      test_start=test_start,
                                                      test_end=test_end)
    else:
        X_train, Y_train, X_test, Y_test = data_cifar10()

    # Initialize substitute training set reserved for adversary
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]
    # Only attack the first n_attack remaining test examples.
    X_test = X_test[:FLAGS.n_attack]
    Y_test = Y_test[:FLAGS.n_attack]

    # Define input and output TF placeholders
    if DATASET == "mnist":
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    else:
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    # for feed targeted attack labels
    t_y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    time_start = time.time()
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub, Y_sub,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda)
    model_sub, preds_sub = train_sub_out
    time_end = time.time()
    print("Substitue model training time:", time_end - time_start)

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc
    print('substitution model accuracy:', acc)

    # Find the correctly predicted labels (by the black-box, on clean data)
    original_predict = batch_eval(sess, [x], [bbox_preds], [X_test],
                                  args=eval_params)[0]
    original_class = np.argmax(original_predict, axis = 1)
    true_class = np.argmax(Y_test, axis = 1)
    mask = true_class == original_class
    print(np.sum(mask), "out of", mask.size, "are correct labeled,",
          len(X_test[mask]))

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    wrap = KerasModelWrapper(model_sub)

    # Craft adversarial examples using the substitute
    if targeted and attack == "fgsm":
        # TODO: fix the batch size mess
        eval_params = {'batch_size': FLAGS.n_attack * 9}
    else:
        eval_params = {'batch_size': batch_size}
    adv_inputs = X_test
    ori_labels = Y_test

    # generate targeted labels, 9 for each test example
    if targeted:
        adv_ys = []
        targeted_class = []
        for i in range(0, X_test.shape[0]):
            for j in range(0,10):
                # skip the original image label
                if j == np.argmax(Y_test[i]):
                    continue
                adv_ys.append(np.eye(10)[j])
                targeted_class.append(j)
        # duplicate the inputs by 9 times
        adv_inputs = np.array([[instance] * 9 for instance in X_test],
                              dtype=np.float32)
        if DATASET == "mnist":
            adv_inputs = adv_inputs.reshape((X_test.shape[0] * 9, 28, 28, 1))
        else:
            adv_inputs = adv_inputs.reshape((X_test.shape[0] * 9, 32, 32, 3))
        # also update the mask and labels to match the 9x duplication
        mask = np.repeat(mask, 9)
        ori_labels = np.repeat(Y_test, 9, axis=0)
        adv_ys = np.array(adv_ys, dtype=np.float32)

    if attack == "fgsm":
        attacker_params = {'eps': 0.4, 'ord': np.inf, 'clip_min': 0.,
                           'clip_max': 1.}
        # wrap = KerasModelWrapper(model)
        fgsm = FastGradientMethod(wrap, sess=sess)
        attacker = fgsm
        print("Running FGSM attack...")
        if targeted:
            attacker_params['y_target'] = t_y
        x_adv_sub = fgsm.generate(x, **attacker_params)
    else:
        print("Running Carlini and Wagner\'s L2 attack...")
        yname = "y"
        adv_ys = None
        # wrap = KerasModelWrapper(model)
        cwl2 = CarliniWagnerL2(wrap, back='tf', sess=sess)
        attacker_params = {'binary_search_steps': 9, 'max_iterations': 2000,
                           'abort_early': True, 'learning_rate': 0.01,
                           'batch_size': 1, 'initial_const': 0.01,
                           'confidence': 20}
        # generate targeted labels, 9 for each test example
        if targeted:
            attacker_params['y_target'] = adv_ys
            # attacker_params['batch_size'] = 9
        attacker = cwl2

    time_start = time.time()
    if attack == "fgsm":
        # Evaluate the accuracy of the "black-box" model on adversarial examples
        if targeted:
            accuracy = model_eval(sess, x, y, model(x_adv_sub),
                                  adv_inputs, ori_labels,
                                  feed={t_y: adv_ys}, args=eval_params)
        else:
            accuracy = model_eval(sess, x, y, model(x_adv_sub),
                                  adv_inputs, ori_labels, args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute: ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex'] = accuracy
    else:
        # Evaluate the accuracy of the "black-box" model on adversarial examples
        x_adv_sub_np = attacker.generate_np(adv_inputs, **attacker_params)
        accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_np,
                              ori_labels, args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute (NP): ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex'] = accuracy
    time_end = time.time()
    print('Attack time:', time_end - time_start)

    # Evaluate the targeted attack
    if attack == "fgsm":
        # NOTE(review): in the untargeted FGSM path `adv_ys` is never
        # assigned before this feed — this looks like a NameError waiting to
        # happen; confirm the untargeted FGSM path is actually exercised.
        bbox_adv_predict = batch_eval(sess, [x], [model(x_adv_sub)],
                                      [adv_inputs], feed={t_y: adv_ys},
                                      args=eval_params)[0]
    else:
        bbox_adv_predict = batch_eval(sess, [x], [bbox_preds],
                                      [x_adv_sub_np], args=eval_params)[0]
    bbox_adv_class = np.argmax(bbox_adv_predict, axis = 1)
    print(bbox_adv_class)
    # NOTE(review): `true_class` here is still the pre-attack value computed
    # before any np.repeat; it is reassigned on the next line, so this print
    # can show a differently-shaped array in the targeted case.
    print(true_class)
    true_class = np.argmax(ori_labels, axis = 1)
    untargeted_success = np.mean(bbox_adv_class != true_class)
    print('Untargeted attack success rate:', untargeted_success)
    accuracies['untargeted_success'] = untargeted_success
    if targeted:
        targeted_success = np.mean(bbox_adv_class == targeted_class)
        print('Targeted attack success rate:', targeted_success)
        accuracies['targeted_success'] = targeted_success

    if attack == "cwl2":
        # Compute the L2 pertubations of generated adversarial examples
        percent_perturbed = np.sum((x_adv_sub_np - adv_inputs)**2,
                                   axis=(1, 2, 3))**.5
        print(percent_perturbed)
        # print('Avg. L_2 norm of perturbations {0:.4f}'.format(np.mean(percent_perturbed)))
        # when computing the mean, removing the failure attacks first
        print('Avg. L_2 norm of all perturbations {0:.4f}'.format(
            np.mean(percent_perturbed[percent_perturbed > 1e-8])))
        print('Avg. L_2 norm of successful untargeted perturbations {0:.4f}'.format(
            np.mean(percent_perturbed[bbox_adv_class != true_class])))
        if targeted:
            print('Avg. L_2 norm of successful targeted perturbations {0:.4f}'.format(
                np.mean(percent_perturbed[bbox_adv_class == targeted_class])))

    # Sanity check: accuracy on the clean inputs the oracle got right
    accuracy = model_eval(sess, x, y, bbox_preds, adv_inputs[mask],
                          ori_labels[mask], args=eval_params)
    print('Test accuracy of excluding originally incorrect labels (should be 1.0): ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc_ori'] = accuracy

    if attack == "fgsm":
        # Evaluate the accuracy of the "black-box" model on adversarial
        # examples (excluding originally misclassified inputs)
        accuracy = model_eval(sess, x, y, model(x_adv_sub), adv_inputs[mask],
                              ori_labels[mask], feed={t_y: adv_ys[mask]},
                              args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute (excluding originally incorrect labels): ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex_exc'] = accuracy
    else:
        # Evaluate the accuracy of the "black-box" model on adversarial
        # examples (excluding originally misclassified inputs)
        x_adv_sub_mask_np = x_adv_sub_np[mask]
        accuracy = model_eval(sess, x, y, bbox_preds, x_adv_sub_mask_np,
                              ori_labels[mask], args=eval_params)
        print('Test accuracy of oracle on adversarial examples generated '
              'using the substitute (excluding originally incorrect labels, NP): ' + str(accuracy))
        accuracies['bbox_on_sub_adv_ex_exc'] = accuracy
    return accuracies
y = tf.placeholder(tf.float32, shape=(None, 10)) # Define TF model graph model = model_mnist() predictions = model(x) print("Defined TensorFlow model graph.") # Train an MNIST model model_train(sess, x, y, predictions, X_train, Y_train, evaluate=evaluate) ############################################################################## # Create Adversarials ############################################################################## # Craft adversarial examples using Fast Gradient Sign Method (FGSM) adv_x = fgsm(x, predictions, eps=0.2) X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test]) assert X_test_adv.shape[0] == 10000, X_test_adv.shape # Evaluate the accuracy of the MNIST model on adversarial examples accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test) print('Test accuracy on adversarial examples: ' + str(accuracy)) # save instances np.save('data/x_train.npy', X_train) np.save('data/y_train.npy', Y_train) np.save('data/x_test.npy', X_test) np.save('data/y_test.npy', Y_test) np.save('data/adversarials.npy', X_test_adv) # load instances X_train = np.load('data/x_train.npy')
def train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes=10,
              nb_epochs_s=250, batch_size=128, learning_rate=0.001,
              data_aug=6, lmbda=0.1, rng=None):
    """
    Build and train the substitute model by alternating substitute
    training with Jacobian-based dataset augmentation
    (arxiv.org/abs/1602.02697).
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param X_sub: initial substitute training data
    :param Y_sub: initial substitute training labels (class indices)
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param rng: numpy.random.RandomState instance (not used in this body)
    :return: (substitute Keras model, its prediction tensor on x)
    """
    # The substitute reuses the cifar10vgg architecture with empty weights.
    wrapper = cifar10vgg(empty_model=True)
    model_sub = wrapper.model
    preds_sub = model_sub(x)
    print("Defined TensorFlow model graph for the substitute.")

    # Symbolic Jacobian of the substitute predictions w.r.t. its input.
    grads = jacobian_graph(preds_sub, x, nb_classes)

    sub_train_args = {
        'nb_epochs': nb_epochs_s,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }

    for aug_round in range(data_aug):
        print("Substitute training epoch #" + str(aug_round))

        # Silence per-batch logging from cleverhans while fitting.
        with TemporaryLogLevel(tf.logging.WARNING, "cleverhans.utils.tf"):
            model_train(sess, x, y, preds_sub, X_sub, Y_sub,
                        init_all=False, args=sub_train_args)

        # Nothing left to augment after the final training round.
        if aug_round == data_aug - 1:
            continue

        print("Augmenting substitute training data.")
        # Sign of lambda flips from -1 to +1 once aug_round reaches 3
        # (periodic step size from the paper).
        sign = 2 * int(int(aug_round / 3) != 0) - 1
        X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads,
                                      sign * lmbda)

        print("Labeling substitute training data.")
        # The second half of X_sub is freshly synthesized; query the
        # black box for its labels.  Only the argmax is kept because the
        # adversary sees labels, not probabilities.
        Y_sub = np.hstack([Y_sub, Y_sub])
        new_points = X_sub[int(len(X_sub) / 2):]
        bbox_val = batch_eval(sess, [x], [bbox_preds], [new_points],
                              args={'batch_size': batch_size})[0]
        Y_sub[int(len(X_sub) / 2):] = np.argmax(bbox_val, axis=1)

    return model_sub, preds_sub
def cifar_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=10, batch_size=128,
                   learning_rate=0.001, nb_epochs=50, holdout=150,
                   data_aug=6, nb_epochs_s=50, lmbda=0.1):
    """
    CIFAR tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """
    # Freeze Keras layers in inference mode while graphs are constructed.
    keras.layers.core.K.set_learning_phase(0)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session and set as Keras backend session
    # (cap GPU memory so several runs can share one device)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    keras.backend.set_session(sess)

    # Get CIFAR data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Initialize substitute training set reserved for adversary.
    # Y_sub holds class indices (argmax), not one-hot rows.
    X_sub = X_test[:holdout]
    Y_sub = np.argmax(Y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    X_test = X_test[holdout:]
    Y_test = Y_test[holdout:]

    # Define input and output TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
                              nb_epochs, batch_size, learning_rate)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, X_sub,
                              Y_sub, nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, X_test, Y_test, args=eval_params)
    accuracies['sub'] = acc
    print('substitution model accuracy:', acc)

    # Find the test samples the black box classifies correctly; `mask`
    # selects them for the "excluding originally incorrect" metrics below.
    original_predict = batch_eval(sess, [x], [bbox_preds], [X_test],
                                  args=eval_params)[0]
    original_class = np.argmax(original_predict, axis = 1)
    true_class = np.argmax(Y_test, axis = 1)
    mask = true_class == original_class
    print(np.sum(mask), "out of", mask.size, "are correct labeled,",
          len(X_test[mask]))

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.4, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    wrap = KerasModelWrapper(model_sub)
    fgsm = FastGradientMethod(wrap, sess=sess)

    # Craft adversarial examples using the substitute
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model(x_adv_sub), X_test, Y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    # Black box on *clean* masked inputs — by construction of `mask`
    # this should be 1.0; kept as a sanity check.
    accuracy = model_eval(sess, x, y, bbox_preds, X_test[mask],
                          Y_test[mask], args=eval_params)
    print('Test accuracy of excluding originally incorrect labels: ' +
          str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc_ori'] = accuracy

    # Black box on adversarial versions of the masked (originally correct)
    # inputs — the transfer-attack success metric of interest.
    accuracy = model_eval(sess, x, y, model(x_adv_sub), X_test[mask],
                          Y_test[mask], args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute (excluding originally incorrect labels): ' +
          str(accuracy))
    accuracies['bbox_on_sub_adv_ex_exc'] = accuracy

    return accuracies
def train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes, nb_epochs_s,
              batch_size, learning_rate, data_aug, lmbda, rng,
              substitute_model=None):
    """This function trains the substitute model as described in
       arxiv.org/abs/1602.02697

    Args:
        sess: TF session
        x: input TF placeholder
        y: output TF placeholder
        bbox_preds: output of black-box model predictions
        X_sub: initial substitute training data
        Y_sub: initial substitute training labels (class indices)
        nb_classes: number of output classes
        nb_epochs_s: number of epochs to train substitute model
        batch_size: size of training batches
        learning_rate: learning rate for training
        data_aug: number of times substitute training data is augmented
        lmbda: lambda from arxiv.org/abs/1602.02697
        rng: numpy.random.RandomState instance
        substitute_model: callable producing the substitute's prediction
            tensor when applied to x (must not be None).
    Returns:
        model_sub: The substitute model function.
        preds_sub: The substitute prediction tensor.
    """
    # Define TF model graph (for the black-box model).
    model_sub = substitute_model
    preds_sub = model_sub(x)
    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow.
    grads = jacobian_graph(preds_sub, x, nb_classes)

    # Train the substitute and augment dataset alternatively.
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        train_params = {
            'nb_epochs': nb_epochs_s,
            'batch_size': batch_size,
            'learning_rate': learning_rate
        }
        # learning_phase=1 puts Keras layers in training mode for the fit.
        model_train(sess, x, y, preds_sub, X_sub,
                    to_categorical(Y_sub), init_all=False,
                    args=train_params, rng=rng,
                    feed={K.learning_phase(): 1})

        # If we are not at last substitute training iteration, augment dataset.
        if rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation (inference mode).
            X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads, lmbda,
                                          feed={K.learning_phase(): 0})

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box.
            Y_sub = np.hstack([Y_sub, Y_sub])
            X_sub_prev = X_sub[int(len(X_sub) / 2):]
            eval_params = {'batch_size': batch_size}
            # To initialize the local variables of Defense-GAN.
            sess.run(tf.local_variables_initializer())
            bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev],
                                  args=eval_params,
                                  feed={K.learning_phase(): 0})[0]
            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model.
            Y_sub[int(len(X_sub) / 2):] = np.argmax(bbox_val, axis=1)
    return model_sub, preds_sub
def main(argv=None):
    """
    MNIST cleverhans tutorial: train a model, attack it with FGSM, then
    repeat with adversarial training and re-attack.  (Python 2 syntax.)
    :return:
    """
    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'th':
        keras.backend.set_image_dim_ordering('th')
        print "INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to 'tf', temporarily setting to 'th'"

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print "Created TensorFlow session and set Keras backend."

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print "Loaded MNIST test data."

    # Define input TF placeholder (Theano ordering: channels first)
    x = tf.placeholder(tf.float32, shape=(None, 1, 28, 28))
    y = tf.placeholder(tf.float32, shape=(None, FLAGS.nb_classes))

    # Define TF model graph
    model = model_mnist()
    predictions = model(x)
    print "Defined TensorFlow model graph."

    # Train an MNIST model
    tf_model_train(sess, x, y, predictions, X_train, Y_train)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    accuracy = tf_model_eval(sess, x, y, predictions, X_test, Y_test)
    assert X_test.shape[0] == 10000, X_test.shape
    print 'Test accuracy on legitimate test examples: ' + str(accuracy)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test])
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = tf_model_eval(sess, x, y, predictions, X_test_adv, Y_test)
    print 'Test accuracy on adversarial examples: ' + str(accuracy)

    print "Repeating the process, using adversarial training"
    # Redefine TF model graph (fresh weights; trained on both clean and
    # adversarial predictions below)
    model_2 = model_mnist()
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    # Perform adversarial training
    tf_model_train(sess, x, y, predictions_2, X_train, Y_train,
                   predictions_adv=predictions_2_adv)

    # Evaluate the accuracy of the adversarialy trained MNIST model on
    # legitimate test examples
    accuracy = tf_model_eval(sess, x, y, predictions_2, X_test, Y_test)
    print 'Test accuracy on legitimate test examples: ' + str(accuracy)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM) on
    # the new model, which was trained using adversarial training
    X_test_adv_2, = batch_eval(sess, [x], [adv_x_2], [X_test])
    assert X_test_adv_2.shape[0] == 10000, X_test_adv_2.shape

    # Evaluate the accuracy of the adversarially trained MNIST model on
    # adversarial examples
    accuracy_adv = tf_model_eval(sess, x, y, predictions_2, X_test_adv_2,
                                 Y_test)
    print 'Test accuracy on adversarial examples: ' + str(accuracy_adv)
def main(argv=None):
    """
    MNIST cleverhans tutorial: train with label smoothing and periodic
    evaluation callbacks, attack with FGSM, then adversarially train a
    second model.
    :return:
    """
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'th':
        keras.backend.set_image_dim_ordering('th')
        print(
            "INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to 'tf', temporarily setting to 'th'"
        )

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    assert Y_train.shape[1] == 10.
    # Label smoothing: pull one-hot targets away from the 0/1 extremes.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder (Theano ordering: channels first)
    x = tf.placeholder(tf.float32, shape=(None, 1, 28, 28))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = model_mnist()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        accuracy = tf_model_eval(sess, x, y, predictions, X_test, Y_test)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an MNIST model
    tf_model_train(sess, x, y, predictions, X_train, Y_train,
                   evaluate=evaluate)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test])
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = tf_model_eval(sess, x, y, predictions, X_test_adv, Y_test)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph (fresh weights)
    model_2 = model_mnist()
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Evaluate the accuracy of the adversarialy trained MNIST model on
        # legitimate test examples
        accuracy = tf_model_eval(sess, x, y, predictions_2, X_test, Y_test)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))
        # Evaluate the accuracy of the adversarially trained MNIST model on
        # adversarial examples
        accuracy_adv = tf_model_eval(sess, x, y, predictions_2_adv, X_test,
                                     Y_test)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))

    # Perform adversarial training
    tf_model_train(sess, x, y, predictions_2, X_train, Y_train,
                   predictions_adv=predictions_2_adv, evaluate=evaluate_2)
def train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes,
              nb_epochs_s, batch_size, learning_rate, data_aug, lmbda,
              aug_batch_size, rng, img_rows=28, img_cols=28,
              nchannels=1):
    """
    This function creates the substitute by alternatively augmenting the
    training data and training the substitute.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param X_sub: initial substitute training data
    :param Y_sub: initial substitute training labels (class indices)
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param aug_batch_size: batch size used for the Jacobian augmentation
    :param rng: numpy.random.RandomState instance
    :param img_rows: input row count (unused in this body — TODO confirm)
    :param img_cols: input column count (unused in this body)
    :param nchannels: input channel count (unused in this body)
    :return: (substitute model object, its logits tensor on x)
    """
    # Define TF model graph (for the black-box model)
    model_sub = ModelSubstitute('model_s', nb_classes)
    preds_sub = model_sub.get_logits(x)
    loss_sub = LossCrossEntropy(model_sub, smoothing=0)
    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)

    # Train the substitute and augment dataset alternatively
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        train_params = {
            'nb_epochs': nb_epochs_s,
            'batch_size': batch_size,
            'learning_rate': learning_rate
        }
        # Silence per-batch training logs from cleverhans.
        with TemporaryLogLevel(logging.WARNING, "cleverhans.utils.tf"):
            train(sess, loss_sub, x, y, X_sub,
                  to_categorical(Y_sub, nb_classes),
                  init_all=False, args=train_params, rng=rng,
                  var_list=model_sub.get_params())

        # If we are not at last substitute training iteration, augment dataset
        if rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation; the sign of lambda flips
            # from -1 to +1 once rho reaches 3 (periodic step size)
            lmbda_coef = 2 * int(int(rho / 3) != 0) - 1
            X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads,
                                          lmbda_coef * lmbda, aug_batch_size)

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box
            Y_sub = np.hstack([Y_sub, Y_sub])
            X_sub_prev = X_sub[int(len(X_sub)/2):]
            eval_params = {'batch_size': batch_size}
            bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev],
                                  args=eval_params)[0]
            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            Y_sub[int(len(X_sub)/2):] = np.argmax(bbox_val, axis=1)
    return model_sub, preds_sub
saved_path += '/wrn-28-10-t--2018-01-23-19-13/ResNet' # vanilla model.load_state(saved_path) cost, ev = model.test(ds_test) accuracies = [ev['accuracy']] for eps in epss[1:]: print("Creating adversarial examples...") clip_max = (255 - np.max(Cifar10Loader.mean)) / np.max( Cifar10Loader.std) n_fgsm = fgsm(model.nodes.input, model.nodes.probs, eps=eps, clip_min=-clip_max, clip_max=clip_max) images_adv, = batch_eval(model._sess, [model.nodes.input], [n_fgsm], [ds_test.images[:model.batch_size * 64]], args={'batch_size': model.batch_size}, feed={model._is_training: False}) adv_ds_test = Dataset(images_adv, ds_test.labels, ds_test.class_count) cost, ev = model.test(adv_ds_test) accuracies.append(ev['accuracy']) accuracieses.append(accuracies) print(accuracies) def plot(epss, curves, names): plt.figure() plt.rcParams["mathtext.fontset"] = "cm" #plt.yticks(np.arange(0, 1, 0.05)) axes = plt.gca() axes.grid(color='0.9', linestyle='-', linewidth=1) axes.set_ylim([0, 1])
top = 10 epss = [0, 0.02, 0.05, 0.2, 0.5, 1] image_count = 5 batch_size = 64 adv_image_lists = [ds_test.images[:batch_size]] for eps in epss[1:]: print("Creating adversarial examples...") clip_max = (255 - np.max(Cifar10Loader.mean)) / np.max(Cifar10Loader.std) n_fgsm = fgsm(model.nodes.input, model.nodes.probs, eps=eps, clip_min=-clip_max, clip_max=clip_max) images_adv, = batch_eval(model._sess, [model.nodes.input], [n_fgsm], [adv_image_lists[0]], args={'batch_size': batch_size}, feed={model._is_training: False}) adv_image_lists.append(images_adv) def generate_visualization(i0): def get_row(i): ims = adv_image_lists[i][i0:i0 + image_count] s, m = Cifar10Loader.std, Cifar10Loader.mean scale = lambda x: np.clip(x * s + m, 0, 255).astype(np.ubyte) return list(map(scale, ims)) cols = [get_row(i) for i in range(i0, i0 + len(epss))] return visualization.compose(cols, format=None) images = [im for i in range(i0, i0 + 3) for im in get_row(i)]
def train_sub(sess, logits_scalar, x, y, bbox_preds, X_sub, Y_sub,
              nb_classes, nb_epochs_s, batch_size, learning_rate, data_aug,
              lmbda, rng, binary=False, phase=None, model_path=None):
    """
    This function creates the substitute by alternatively augmenting the
    training data and training the substitute.
    :param sess: TF session
    :param logits_scalar: NOTE(review): not referenced in this body —
        confirm whether it should be forwarded to substitute_model()
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param X_sub: initial substitute training data
    :param Y_sub: initial substitute training labels
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param rng: numpy.random.RandomState instance
    :param binary: NOTE(review): not used; train_params hard-codes
        'binary': False below — confirm intent
    :param phase: placeholder for batch_norm phase (training or testing)
    :param model_path: if given, checkpoints are saved under this log_dir
    :return: (substitute model, its prediction tensor on x)
    """
    # Define TF model graph (for the black-box model)
    model_sub = substitute_model()
    preds_sub = model_sub(x)
    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)

    train_params = {
        'binary': False,
        'nb_epochs': nb_epochs_s,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': 'sub_model',
        'train_scope': 'sub_model',
        'reuse_global_step': False,
        'is_training': True
    }

    # Train the substitute and augment dataset alternatively
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        if rho > 0:
            # Variables/global step already exist after round 0; reuse them.
            train_params.update({'reuse_global_step': True})
        if model_path is not None:
            # Persist checkpoints when a model path was supplied.
            train_params.update({'log_dir': model_path})
            model_train(sess, x, y, preds_sub, X_sub,
                        to_categorical(Y_sub), phase=phase, save=True,
                        init_all=False, args=train_params, rng=rng)
        else:
            model_train(sess, x, y, preds_sub, X_sub,
                        to_categorical(Y_sub), phase=phase,
                        init_all=False, args=train_params, rng=rng)

        # If we are not at last substitute training iteration, augment dataset
        if rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation
            X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, grads, lmbda)

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box
            Y_sub = np.hstack([Y_sub, Y_sub])
            X_sub_prev = X_sub[int(len(X_sub) / 2):]
            eval_params = {'batch_size': batch_size}
            # phase=False: query the black box in inference mode.
            bbox_val = batch_eval(sess, [x], [bbox_preds], [X_sub_prev],
                                  feed={phase: False}, args=eval_params)[0]
            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            Y_sub[int(len(X_sub) / 2):] = np.argmax(bbox_val, axis=1)
    return model_sub, preds_sub
top_20_val = [100*feats[k] / y_test.shape[0] for k in top_20] plt.figure(figsize=(12, 6)) plt.bar(np.arange(20), top_20_val, align='center') plt.xticks(np.arange(20), X_test.columns[top_20], rotation='vertical') plt.title('Feature participation in adversarial examples') plt.ylabel('Percentage (%)') plt.xlabel('Features') plt.savefig('Adv_features.png', bbox_inches = "tight") # Craft adversarial examples using Fast Gradient Sign Method (FGSM) fgsm = FastGradientMethod(models, sess=sess) fgsm_params = {'eps': 0.3} adv_x_f = fgsm.generate(x, **fgsm_params) # adv_x_f = tf.stop_gradient(adv_x_f) X_test_adv, = batch_eval(sess, [x], [adv_x_f], [X_test_scaled]) # Evaluate accuracy eval_par = {'batch_size': FLAGS.batch_size} accuracy = model_eval(sess, x, y, predictions, X_test_adv, y_test, args=eval_par) print("Test accuracy on adversarial examples: {}".format(accuracy)) # Comparison of adversarial and original test samples (attack) feats = dict() total = 0 orig_attack = X_test_scaled - X_test_adv for i in range(0, orig_attack.shape[0]): ind = np.where(orig_attack[i, :] != 0)[0] total += len(ind) for j in ind:
def generate_images():
    """
    Load a pre-trained CIFAR10 classifier (architecture selected via
    command-line args) and craft + save adversarial images with either
    FGSM or JSMA, depending on args.attack.  (Python 2 syntax.)
    """
    print('==> Preparing data..')
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print(
            "INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
            "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    # (cap GPU memory at 50% so the device can be shared)
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)
    print "==> Beginning Session"

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    assert Y_train.shape[1] == 10.
    # Label smoothing on the one-hot training targets.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Load model
    print "==> loading vgg model"
    args = load_args()

    # NOTE(review): if args.model matches none of these branches, `model`
    # is unbound and the model(x) call below raises NameError — confirm
    # the --model flag is validated upstream.
    if args.model == 'vgg6':
        model = vggbn(top=True, pool=args.pool)
    if args.model == 'vgg15':
        model = vgg15(top=True, pool=args.pool)
    if args.model == 'generic':
        model = generic(top=True, pool=args.pool)
    if args.model == 'resnet18':
        model = resnet.build_resnet_18(args.pool)

    predictions = model(x)
    model.load_weights(args.load)

    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                          args=eval_params)
    print '==> Accuracy : {}'.format(accuracy)

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an CIFAR10 model
    # NOTE(review): train_params is built but no training call is visible
    # in this function — presumably weights come solely from
    # model.load_weights above; confirm.
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }

    im_base = '/im_'
    model_name = args.model + '_p' + str(args.pool)

    if args.attack == 'fgsm' or args.attack == 'FGSM':
        result_dir = os.getcwd() + '/images/fgsm/'
        print "==> creating fgsm adversarial wrapper"
        adv_x = fgsm(x, predictions, eps=0.3)

        print "==> sending to batch evaluator to finalize adversarial images"
        eval_params = {'batch_size': FLAGS.batch_size}
        X_train_adv, = batch_eval(sess, [x], [adv_x], [X_train],
                                  args=eval_params)

        i = 0
        if not os.path.exists(result_dir + model_name):
            os.makedirs(result_dir + model_name)
        print "==> saving images to {}".format(result_dir + model_name)
        for ad in X_train_adv:
            scipy.misc.imsave(
                result_dir + model_name + im_base + str(i) + '.png', ad)
            i += 1
        sess.close()

    """ JSMA """
    if args.attack == 'jsma' or args.attack == 'JSMA':
        result_dir = os.getcwd() + '/images/jsma/trial_single_adv'
        print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
              str(FLAGS.nb_classes - 1) + ' adversarial examples')

        # Per (target, sample) success flags.
        results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                           dtype='i')
        # This array contains the fraction of perturbed features for each test set
        perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                                 dtype='f')

        # Define the TF graph for the model's Jacobian
        grads = jacobian_graph(predictions, x, FLAGS.nb_classes)

        # Initialize our array for grid visualization
        grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes, FLAGS.img_rows,
                      FLAGS.img_cols, FLAGS.nb_channels)
        grid_viz_data = np.zeros(grid_shape, dtype='f')

        i_saved = 0
        n_image = 0
        # Loop over the samples we want to perturb into adversarial examples
        # NOTE(review): the start index 7166 is hard-coded — presumably to
        # resume an earlier run; confirm before reuse.
        print "==> saving images to {}".format(result_dir + model_name)
        for sample_ind in xrange(7166, FLAGS.source_samples):
            # We want to find an adversarial example for each possible target class
            current_class = int(np.argmax(Y_train[sample_ind]))
            target_classes = other_classes(FLAGS.nb_classes, current_class)

            # For the grid visualization, keep original images along the diagonal
            grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
                X_train[sample_ind:(sample_ind + 1)],
                (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

            # Loop over all target classes
            adversarials = []
            for idx, target in enumerate(target_classes):
                print "image {}".format(sample_ind)
                # here we hold all successful adversarials for this iteration
                # since we dont want 500k images, we will uniformly sample an image to save after each target
                print('--------------------------------------')
                print('Creating adv. example for target class ' + str(target))

                # This call runs the Jacobian-based saliency map approach
                adv_x, res, percent_perturb = jsma(
                    sess, x, predictions, grads,
                    X_train[sample_ind:(sample_ind + 1)], target,
                    theta=1, gamma=0.1, increase=True, back='tf',
                    clip_min=0, clip_max=1)

                # Display the original and adversarial images side-by-side
                adversarial = np.reshape(
                    adv_x,
                    (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))
                original = np.reshape(
                    X_train[sample_ind:(sample_ind + 1)],
                    (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

                if FLAGS.viz_enabled:
                    if 'figure' not in vars():
                        figure = pair_visual(original, adversarial)
                    else:
                        figure = pair_visual(original, adversarial, figure)

                if not os.path.exists(result_dir + model_name):
                    os.makedirs(result_dir + model_name)

                if res == 1:
                    adversarials.append(adversarial)

                # On the last target for this sample, save one uniformly
                # chosen successful adversarial (if any).
                if idx == FLAGS.nb_classes - 2:
                    try:
                        if len(adversarials) == 1:
                            idx_uniform = 0
                        else:
                            idx_uniform = np.random.randint(
                                0, len(adversarials) - 1)
                        print idx_uniform
                        scipy.misc.imsave(
                            result_dir + model_name + im_base +
                            str(sample_ind) + '.png',
                            adversarials[idx_uniform])
                        i_saved += 1
                        print "==> images saved: {}".format(i_saved)
                    # NOTE(review): bare `except:` also swallows
                    # KeyboardInterrupt/SystemExit — consider narrowing to
                    # `except (IndexError, ValueError):`.
                    except:
                        print "No adversarials generated"

                # Add our adversarial example to our grid data
                grid_viz_data[target, current_class, :, :, :] = np.reshape(
                    adv_x,
                    (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

                # Update the arrays for later analysis
                results[target, sample_ind] = res
                perturbations[target, sample_ind] = percent_perturb
                n_image += 1

        # Compute the number of adversarial
        # examples that were successfuly found
        nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
        succ_rate = float(np.sum(results)) / nb_targets_tried
        print(
            'Avg. rate of successful adv. examples {0:.2f}'.format(succ_rate))

        # Compute the average distortion introduced by the algorithm
        percent_perturbed = np.mean(perturbations)
        print('Avg. rate of perturbed features {0:.2f}'.format(
            percent_perturbed))

        # Compute the average distortion introduced for successful samples only
        percent_perturb_succ = np.mean(perturbations * (results == 1))
        print(
            'Avg. rate of perturbed features for successful '
            'adversarial examples {0:.2f}'.format(percent_perturb_succ))

        # Close TF session
        sess.close()

        # Finally, block & display a grid of all the adversarial examples
        if FLAGS.viz_enabled:
            _ = grid_visual(grid_viz_data)
def main(argv=None):
    """
    MNIST cleverhans tutorial (TF ordering, FLAGS-configured): train a CNN,
    attack with FGSM, then adversarially train a second CNN and re-attack.
    :return:
    """
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Image dimensions ordering should follow the Theano convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()

    assert Y_train.shape[1] == 10.
    # Label smoothing: pull one-hot targets away from the 0/1 extremes.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder (TF ordering: channels last)
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an MNIST model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess, x, y, predictions, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    adv_x = fgsm(x, predictions, eps=0.3)
    eval_params = {'batch_size': FLAGS.batch_size}
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    # Evaluate the accuracy of the MNIST model on adversarial examples
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    print("Repeating the process, using adversarial training")
    # Redefine TF model graph (fresh weights)
    model_2 = cnn_model()
    predictions_2 = model_2(x)
    adv_x_2 = fgsm(x, predictions_2, eps=0.3)
    predictions_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Evaluate the accuracy of the adversarialy trained MNIST model on
        # legitimate test examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate test examples: ' + str(accuracy))
        # Evaluate the accuracy of the adversarially trained MNIST model on
        # adversarial examples
        accuracy_adv = model_eval(sess, x, y, predictions_2_adv, X_test,
                                  Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: ' + str(accuracy_adv))

    # Perform adversarial training
    model_train(sess, x, y, predictions_2, X_train, Y_train,
                predictions_adv=predictions_2_adv, evaluate=evaluate_2,
                args=train_params)
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial: train a CNN on CIFAR10, craft adversarial
    examples with the Momentum Iterative Method (MIM), and report the
    model's accuracy on them.
    :return:
    """
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get CIFAR10 data and smooth the one-hot training labels
    X_train, Y_train, X_test, Y_test = data_cifar10()
    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)
    # FIX: Python-2 print statements converted to print() calls for
    # consistency with the rest of this file.
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test
        # examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess, x, y, predictions, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Craft adversarial examples using the Momentum Iterative Method.
    # (Previously FGSM was used here: adv_x = fgsm(x, predictions, eps=0.3))
    mim = MIM(model, back='tf', sess=sess)
    mim_params = {
        'eps_iter': 0.06,
        'eps': 0.3,
        'nb_iter': 10,
        'ord': 2,
        'decay_factor': 1.0
    }
    adv_x = mim.generate(x, **mim_params)
    eval_params = {'batch_size': FLAGS.batch_size}
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape

    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    # Target directory for (currently disabled) image dumps.
    path = ('/home/neale/repos/adversarial-toolbox/images/adversarials/'
            'mim/cifar/symmetric/')
    # FIX: the unconditional `from scipy.misc import imsave` was moved
    # into the disabled code below -- scipy.misc.imsave was removed in
    # SciPy >= 1.2, so importing it crashed on modern installs even
    # though every call site is commented out.
    """
    from scipy.misc import imsave
    for i, (real, adv) in enumerate(zip(X_test, X_test_adv)):
        imsave(path+'adv/adv_{}.png'.format(i), adv)
    """

    # Hard-label predictions of the model on the adversarial set
    preds = model_argmax(sess, x, predictions, X_test_adv)
    print(Y_test.shape)
    print(preds.shape)

    # Count adversarial examples the model still classifies correctly.
    # The commented imsave hooks can be re-enabled to dump real/adv pairs
    # under `path` while counting.
    count = 0
    for i in range(len(preds)):
        if np.argmax(Y_test[i]) == preds[i]:
            # imsave(path+'real/im_{}.png'.format(i), X_test[i])
            # imsave(path+'adv/adv_{}.png'.format(i), X_test_adv[i])
            count += 1
    print("saved ", count)
# NOTE(review): this chunk begins mid-definition -- the opening of the
# dict closed by the brace below (training hyper-parameters passed to
# model_train as `train_params`) lies outside the visible region.
}
model_train(sess, x, y, predictions, X_train, Y_train, args=train_params)

# Evaluate the MNIST model
eval_params = {'batch_size': 128}
accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                      args=eval_params)
print('Test accuracy on legitimate test examples: ' + str(accuracy))
# Test accuracy on legitimate test examples: 0.9888

# Craft adversarial examples using the Fast Gradient Sign Method
from cleverhans.attacks_tf import fgsm
from cleverhans.utils_tf import batch_eval
adv_x = fgsm(x, predictions, eps=0.3)
X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                      args=eval_params)
print('Test accuracy on adversarial examples: ' + str(accuracy))
# Test accuracy on adversarial examples: 0.0837

# Adversarial training: a fresh model whose training loss also sees
# FGSM perturbations of each batch (via predictions_adv).
model_2 = cnn_model()
predictions_2 = model_2(x)
adv_x_2 = fgsm(x, predictions_2, eps=0.3)
predictions_2_adv = model_2(adv_x_2)
model_train(sess, x, y, predictions_2, X_train, Y_train,
            predictions_adv=predictions_2_adv, args=train_params)

# Evaluate the accuracy on legitimate examples
# NOTE(review): `eval_param` below is undefined -- almost certainly a
# typo for `eval_params`; this line would raise NameError at runtime.
accuracy = model_eval(sess, x, y, predictions_2, X_test, Y_test,
                      args=eval_param)
def calculate_signed_gradient_x(sess, x, predictions, X_test):
    """Batch-evaluate the signed-gradient op over every example in X_test.

    Builds the symbolic op returned by get_gradient_sign_tf for the
    (x, predictions) graph, then evaluates it on X_test in batches
    within the given session.

    :param sess: TF session
    :param x: input TF placeholder
    :param predictions: model output tensor built on `x`
    :param X_test: numpy array of input examples
    :return: numpy array of the evaluated signed gradients
    """
    grad_sign_op = get_gradient_sign_tf(x, predictions)
    (evaluated,) = batch_eval(sess, [x], [grad_sign_op], [X_test])
    return evaluated
def train_sub(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes,
              nb_epochs_s, batch_size, learning_rate, data_aug, lmbda,
              aug_batch_size, rng, img_rows=28, img_cols=28,
              nchannels=1):
    """
    This function creates the substitute by alternatively augmenting the
    training data and training the substitute.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param x_sub: initial substitute training data
    :param y_sub: initial substitute training labels (integer class ids)
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param aug_batch_size: batch size used by the Jacobian augmentation
    :param rng: numpy.random.RandomState instance
    :param img_rows: input image height -- not referenced in this body;
        presumably kept for interface compatibility (TODO confirm)
    :param img_cols: input image width -- not referenced in this body
    :param nchannels: number of input channels -- not referenced here
    :return: tuple (substitute model, symbolic substitute predictions)
    """
    # Define TF model graph (for the black-box model)
    model_sub = ModelSubstitute('model_s', nb_classes)
    preds_sub = model_sub.get_logits(x)
    # smoothing=0: the substitute trains on hard black-box labels
    loss_sub = CrossEntropy(model_sub, smoothing=0)
    print("Defined TensorFlow model graph for the substitute.")

    # Define the Jacobian symbolically using TensorFlow
    grads = jacobian_graph(preds_sub, x, nb_classes)

    # Train the substitute and augment dataset alternatively
    # NOTE: xrange is Python 2-only.
    for rho in xrange(data_aug):
        print("Substitute training epoch #" + str(rho))
        train_params = {
            'nb_epochs': nb_epochs_s,
            'batch_size': batch_size,
            'learning_rate': learning_rate
        }
        # Silence per-batch training logs from cleverhans during the
        # inner training loop.
        with TemporaryLogLevel(logging.WARNING, "cleverhans.utils.tf"):
            # init_all=False: keep the black-box model's variables intact;
            # only the substitute's own params (var_list) are trained.
            train(sess, loss_sub, x, y, x_sub,
                  to_categorical(y_sub, nb_classes),
                  init_all=False, args=train_params, rng=rng,
                  var_list=model_sub.get_params())

        # If we are not at last substitute training iteration, augment dataset
        if rho < data_aug - 1:
            print("Augmenting substitute training data.")
            # Perform the Jacobian augmentation.
            # Perturbation sign: -1 for the first 3 rounds (rho < 3),
            # +1 afterwards -- the periodic step size from the paper.
            lmbda_coef = 2 * int(int(rho / 3) != 0) - 1
            # Doubles the dataset: new synthetic points are appended as
            # the second half of x_sub.
            x_sub = jacobian_augmentation(sess, x, x_sub, y_sub, grads,
                                          lmbda_coef * lmbda,
                                          aug_batch_size)

            print("Labeling substitute training data.")
            # Label the newly generated synthetic points using the black-box.
            # First duplicate the labels as placeholders for the new half...
            y_sub = np.hstack([y_sub, y_sub])
            # ...then overwrite that second half (the freshly synthesised
            # points) with the black-box model's predictions.
            x_sub_prev = x_sub[int(len(x_sub) / 2):]
            eval_params = {'batch_size': batch_size}
            bbox_val = batch_eval(sess, [x], [bbox_preds], [x_sub_prev],
                                  args=eval_params)[0]
            # Note here that we take the argmax because the adversary
            # only has access to the label (not the probabilities) output
            # by the black-box model
            y_sub[int(len(x_sub) / 2):] = np.argmax(bbox_val, axis=1)

    return model_sub, preds_sub
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial
    :return:
    """
    # Reproducibility: pin TF's random seed.
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    # Keras must use TF's (rows, cols, channels) image layout.
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Single session shared by Keras and the cleverhans helpers.
    session = tf.Session()
    keras.backend.set_session(session)

    # CIFAR10 data, with label smoothing on the training targets.
    train_x, train_y, test_x, test_y = data_cifar10()
    assert train_y.shape[1] == 10.
    smoothing = .1
    train_y = train_y.clip(smoothing / 9., 1. - smoothing)

    # Symbolic inputs: 32x32 RGB images and 10-way one-hot labels.
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Build the model graph once; its output tensor is reused below.
    net = cnn_model(img_rows=32, img_cols=32, channels=3)
    net_preds = net(x)
    print("Defined TensorFlow model graph.")

    def report_clean():
        # Periodic callback: accuracy on the legitimate test set.
        eval_args = {'batch_size': FLAGS.batch_size}
        acc = model_eval(session, x, y, net_preds, test_x, test_y,
                         args=eval_args)
        assert test_x.shape[0] == 10000, test_x.shape
        print('Test accuracy on legitimate test examples: ' + str(acc))

    fit_args = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(session, x, y, net_preds, train_x, train_y,
                evaluate=report_clean, args=fit_args)

    # FGSM attack against the trained model; measure accuracy on the
    # crafted examples.
    fgsm = FastGradientMethod(net)
    adv_sym = fgsm.generate(x, eps=0.3)
    eval_args = {'batch_size': FLAGS.batch_size}
    adv_test_x, = batch_eval(session, [x], [adv_sym], [test_x],
                             args=eval_args)
    assert adv_test_x.shape[0] == 10000, adv_test_x.shape

    acc = model_eval(session, x, y, net_preds, adv_test_x, test_y,
                     args=eval_args)
    print('Test accuracy on adversarial examples: ' + str(acc))

    print("Repeating the process, using adversarial training")
    # Fresh model; its training loss will also see FGSM versions of
    # each batch.
    robust_net = cnn_model(img_rows=32, img_cols=32, channels=3)
    robust_preds = robust_net(x)
    fgsm_2 = FastGradientMethod(robust_net)
    adv_sym_2 = fgsm_2.generate(x, eps=0.3)
    robust_preds_on_adv = robust_net(adv_sym_2)

    def report_robust():
        # Periodic callback: clean accuracy of the adversarially
        # trained model...
        eval_args = {'batch_size': FLAGS.batch_size}
        acc = model_eval(session, x, y, robust_preds, test_x, test_y,
                         args=eval_args)
        print('Test accuracy on legitimate test examples: ' + str(acc))
        # ...and its accuracy on attacks crafted against itself.
        acc_adv = model_eval(session, x, y, robust_preds_on_adv, test_x,
                             test_y, args=eval_args)
        print('Test accuracy on adversarial examples: ' + str(acc_adv))

    # Adversarial training: predictions_adv mixes adversarial examples
    # into the loss.
    model_train(session, x, y, robust_preds, train_x, train_y,
                predictions_adv=robust_preds_on_adv,
                evaluate=report_robust, args=fit_args)

    # Final post-training check on adversarial examples.
    eval_args = {'batch_size': FLAGS.batch_size}
    acc = model_eval(session, x, y, robust_preds_on_adv, test_x, test_y,
                     args=eval_args)
    print('Test accuracy on adversarial examples: ' + str(acc))