def get_deepfool(self, sess, x, predictions, logits, sample, nb_candidate=10,
                 overshoot=0.03, max_iter=30, feed=None):
    """Craft DeepFool adversarial examples for a batch of inputs.

    Every iteration evaluates ``self.grads`` and ``predictions`` on the
    current adversarial batch and, for each sample whose label has not yet
    changed, adds the smallest step (among candidates ``1..nb_candidate-1``,
    measured against candidate ``0``) to the accumulated perturbation.

    :param sess: TF session used for all evaluations
    :param x: the input placeholder
    :param predictions: tensor whose value is indexed ``[sample, candidate]``
    :param logits: tensor handed to ``utils_tf.model_argmax`` to get labels
    :param sample: numpy array holding the batch to attack
    :param nb_candidate: number of candidate classes considered
    :param overshoot: extra scaling ``(1 + overshoot)`` applied to the final
                      accumulated perturbation
    :param max_iter: maximum number of iterations
    :param feed: optional dict of extra feed values; it is copied, never
                 modified
    :return: ``sample`` plus the scaled perturbation (no clipping is applied)
    """
    # ``feed`` may legitimately be None; work on a private copy so that the
    # caller's dictionary never receives the ``x`` entry we add below.
    feed_dict = {} if feed is None else dict(feed)

    def predicted_labels(batch, extra_feed):
        labels = utils_tf.model_argmax(sess, x, logits, batch,
                                       feed=extra_feed)
        # A 0-d result is wrapped so that it can be indexed per sample.
        if labels.shape == ():
            labels = np.array([labels])
        return labels

    adv_x = copy.copy(sample)
    current = predicted_labels(adv_x, feed)
    original = current  # the initial labels are the reference
    w = np.squeeze(np.zeros(sample.shape[1:4]))
    r_tot = np.zeros(sample.shape)
    iteration = 0

    # Iterate until every sample changed label or the budget is exhausted.
    while np.any(current == original) and iteration < max_iter:
        feed_dict[x] = adv_x
        gradients, predictions_val = sess.run([self.grads, predictions],
                                              feed_dict=feed_dict)
        for idx in range(sample.shape[0]):
            if current[idx] != original[idx]:
                continue  # this sample is already adversarial
            pert = np.inf
            for k in range(1, nb_candidate):
                w_k = gradients[k][idx, ...] - gradients[0][idx, ...]
                f_k = predictions_val[idx, k] - predictions_val[idx, 0]
                # The tiny 1e-30 offset keeps the step non-zero when f_k == 0.
                pert_k = (abs(f_k) + 1e-30) / np.linalg.norm(w_k.flatten())
                if pert_k < pert:
                    pert = pert_k
                    w = w_k
            r_i = pert * w / np.linalg.norm(w)
            r_tot[idx, ...] = r_tot[idx, ...] + r_i

        adv_x = r_tot + sample
        feed_dict[x] = adv_x
        current = predicted_labels(adv_x, feed_dict)
        iteration += 1

    # NOTE: the result is intentionally left unclipped; this function has no
    # clip_min / clip_max parameters.
    adv_x = (1 + overshoot) * r_tot + sample
    return adv_x
def independent_single(x_test_cc):
    """Score ``x_test_cc`` and the clean CIFAR-10 test set with each model head.

    Three ``lenet_v1`` branches are built on one shared input and their
    weights are loaded from the module-level ``filepath``. For each of the
    first ``N_numbers`` heads the function collects, on ``x_test_cc``, a
    cross-entropy value and the maximum output value, plus the arg-max
    predictions on both ``x_test_cc`` and the clean test images.

    :param x_test_cc: numpy array of inputs fed to the shared model input
    :return: ``(y_test, clean_pred_list, final_pred_list, entropy_list,
             confidence_list)``
    """
    (train_images, _), (clean_images, clean_labels) = cifar10.load_data()
    clean_images = clean_images.astype('float32') / 255
    train_images = train_images.astype('float32') / 255

    session = tf.Session()
    keras.backend.set_session(session)

    model_input = Input(shape=train_images.shape[1:])

    # One lenet_v1 branch per head. Element 3 of its return value is used as
    # the head output and element 2 as the logits (presumably softmax
    # probabilities and pre-softmax scores -- confirm against lenet_v1).
    head_outputs = []
    head_logits = []
    for _ in range(3):
        branch = lenet_v1(X_input=model_input, num_classes=10)
        head_outputs.append(branch[3])
        head_logits.append(branch[2])

    model = Model(input=model_input, output=head_outputs)
    model.load_weights(filepath)
    pred = model(model_input)

    final_pred_list, clean_pred_list = [], []
    confidence_list, entropy_list = [], []
    for head in range(N_numbers):
        # Cross-entropy of the head output against its own logits, and the
        # largest output value per sample.
        entropy_op = tf.nn.softmax_cross_entropy_with_logits(
            labels=head_outputs[head], logits=head_logits[head])
        confidence_op = tf.math.reduce_max(head_outputs[head], axis=-1)
        head_entropy, head_confidence = session.run(
            [entropy_op, confidence_op],
            feed_dict={model_input: x_test_cc})
        entropy_list.append(head_entropy)
        confidence_list.append(head_confidence)

        # Arg-max predictions on the supplied inputs, then on the clean set.
        supplied_labels = model_argmax(session, model_input, pred[head],
                                       samples=x_test_cc)
        clean_set_labels = model_argmax(session, model_input, pred[head],
                                        samples=clean_images)
        final_pred_list.append(supplied_labels)
        clean_pred_list.append(clean_set_labels)

    return (clean_labels, clean_pred_list, final_pred_list, entropy_list,
            confidence_list)
def generate_attacks(save_path, file_path, x_set, y_set, attack, gamma,
                     first_index, last_index):
    """
    Run a targeted saliency map attack on a range of samples and dump the
    results to CSV, one file per attacked sample.

    For every sample, each class other than the true one is used as a target.
    Each CSV column holds the flattened adversarial image followed by three
    values: number of changed features, fraction of changed features, and a
    0/1 success flag. A final column holds the flattened original image,
    zero-padded to the same length.

    Parameters
    ----------
    save_path: str
        Folder receiving the CSV files; created if it does not exist.
    file_path: str
        Path of the serialized model to attack.
    x_set: numpy.ndarray
        The dataset input array.
    y_set: numpy.ndarray
        The dataset output array (one row of class scores per sample).
    attack: str
        The type of used attack (either "jsma", "wjsma" or "tjsma").
    gamma: float
        Maximum percentage of perturbed features.
    first_index: int
        The index of the first image attacked.
    last_index: int
        The index one past the last image attacked.
    """
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    sess = tf.Session()

    nb_classes = y_set.shape[1]
    img_rows, img_cols, channels = x_set.shape[1:4]
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, channels))

    with sess.as_default():
        model = load(file_path)

    assert len(model.get_params()) > 0

    # Attack parameters. See SaliencyMapMethod for more information
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1,
        'gamma': gamma,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None,
        'attack': attack
    }

    preds = model(x)

    for sample_ind in range(first_index, last_index):
        print('Attacking input %i/%i' % (sample_ind + 1, last_index))

        results = pd.DataFrame()
        sample = x_set[sample_ind:(sample_ind + 1)]
        clean_flat = x_set[sample_ind].reshape(-1)
        current_class = int(np.argmax(y_set[sample_ind]))

        for target in other_classes(nb_classes, current_class):
            # One-hot encoding of the class we want the model to output.
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target

            adv_x, _ = jsma.generate_np(sample, **jsma_params)

            # 1 when the model now predicts the target class, else 0.
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            adv_flat = adv_x.reshape(-1)
            changed_positions = np.where(adv_flat != clean_flat)[0]
            nb_changed = changed_positions.shape[0]
            percent_perturb = float(nb_changed) / adv_flat.shape[0]

            column = ('number_' + str(sample_ind) + '_' + str(current_class)
                      + '_to_' + str(target))
            trailer = np.array([nb_changed, percent_perturb, res])
            results[column] = np.concatenate((adv_flat, trailer))

        # Pad the original image with zeros so it matches the column length
        # (image features plus the three trailing statistics).
        sample_flat = sample.reshape(-1)
        padding = np.zeros((results.shape[0] - sample_flat.shape[0],))
        results['original_image_' + str(sample_ind)] = \
            np.concatenate((sample_flat, padding))

        csv_name = attack + '_image_' + str(sample_ind) + '.csv'
        results.to_csv(save_path + '/' + csv_name, index=False)
def jsma(sess, x, predictions, grads, sample, target, theta, gamma, increase,
         nb_classes, clip_min, clip_max, verbose=False):
    """
    TensorFlow implementation of the jacobian-based saliency map method (JSMA).
    :param sess: TF session
    :param x: the input placeholder
    :param predictions: the model's symbolic output (linear output,
                        pre-softmax)
    :param grads: symbolic gradients passed through to ``jacobian``
    :param sample: numpy array with sample input
    :param target: target class for sample input
    :param theta: delta for each feature adjustment
    :param gamma: a float between 0 - 1 indicating the maximum distortion
                  percentage
    :param increase: boolean; true if we are increasing pixels, false otherwise
    :param nb_classes: integer indicating the number of classes in the model
    :param clip_min: minimum value for components of the example returned
    :param clip_max: maximum value for components of the example returned
    :param verbose: boolean; whether to print status updates or not
    :return: a tuple ``(adversarial sample, success flag (1 or 0), ratio of
             perturbed features)``
    """
    # Copy the source sample so the caller's array is left untouched.
    adv_x = copy.copy(sample)

    # Count the number of features. For MNIST, 1x28x28 = 784; for
    # CIFAR, 3x32x32 = 3072; etc.
    # np.prod replaces the np.product alias, which NumPy 2.0 removed.
    nb_features = np.prod(adv_x.shape[1:])

    # Flatten the sample for the sake of standardization.
    original_shape = adv_x.shape
    adv_x = np.reshape(adv_x, (1, nb_features))

    # Two features are perturbed per iteration, hence the division by 2.
    max_iters = np.floor(nb_features * gamma / 2)
    if verbose:
        print('Maximum number of iterations: {0}'.format(max_iters))

    # Initial search domain: drop every feature that is already saturated
    # (at clip_max when increasing, at clip_min when decreasing).
    if increase:
        search_domain = set(
            i for i in range(nb_features) if adv_x[0, i] < clip_max)
    else:
        search_domain = set(
            i for i in range(nb_features) if adv_x[0, i] > clip_min)

    # All model queries run with the Keras learning phase set to 0.
    inference_feed = {K.learning_phase(): 0}

    iteration = 0
    adv_x_original_shape = np.reshape(adv_x, original_shape)
    current = model_argmax(sess, x, predictions, adv_x_original_shape,
                           feed=inference_feed)

    # Repeat until the target class is reached, the iteration budget is
    # spent, or fewer than two candidate features remain.
    while (current != target and iteration < max_iters and
           len(search_domain) > 1):
        adv_x_original_shape = np.reshape(adv_x, original_shape)

        # Compute the Jacobian components
        grads_target, grads_others = jacobian(sess, x, grads, target,
                                              adv_x_original_shape,
                                              nb_features, nb_classes,
                                              feed=inference_feed)

        # Pick the two best candidate features from the saliency map.
        i, j, search_domain = saliency_map(grads_target, grads_others,
                                           search_domain, increase)

        # Apply the perturbation to the two selected features.
        adv_x = apply_perturbations(i, j, adv_x, increase, theta,
                                    clip_min, clip_max)

        # Rebuild the image-shaped array from the *updated* adv_x. Without
        # this the query below sees the new pixels only if
        # apply_perturbations mutated adv_x in place and the earlier reshape
        # happened to be a view.
        adv_x_original_shape = np.reshape(adv_x, original_shape)
        current = model_argmax(sess, x, predictions, adv_x_original_shape,
                               feed=inference_feed)

        iteration += 1

        # This process may take a while, so report progress regularly.
        if iteration % 5 == 0 and verbose:
            msg = 'Current iteration: {0} - Current Prediction: {1}'
            print(msg.format(iteration, current))

    # Two features are touched per iteration.
    percent_perturbed = float(iteration * 2) / nb_features

    # Success means the model now predicts the target class.
    succeeded = current == target
    if verbose:
        print('Successful' if succeeded else 'Unsuccesful')
    return (np.reshape(adv_x, original_shape), 1 if succeeded else 0,
            percent_perturbed)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False, testing=False):
    """Evaluate an entropy-adaptive filter against FGSM (eps=0.3) on MNIST.

    A CNN is trained (or restored from ``train_dir``), FGSM examples are
    crafted for the test samples from index 5500 onward, and each clean and
    adversarial pair is filtered with a quantizer selected by image entropy.
    Samples the model misclassifies, and attacks that fail, are skipped.

    Counters printed at the end:
      * TP  - filtering changed the label of the adversarial example
      * TTP - ... and the new label is the true class
      * FN  - filtering left the adversarial label unchanged
      * FP  - filtering changed the label of the clean image

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: directory searched for / receiving the checkpoint
    :param filename: checkpoint file name
    :param load_model: restore a checkpoint instead of training, if one exists
    :param testing: unused in this function
    :return: the AccuracyReport filled in by the evaluation callback
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, predictions, X_test, Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Train an MNIST model (or restore it from the latest checkpoint)
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True, rng=rng)

    # Adversarial examples are crafted for the test samples from this index.
    first_sample = 5500

    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[first_sample:]})

    # Normalize every adversarial sample that was actually generated (the
    # count used to be hard-coded to 4500).
    for i in range(len(adv_x)):
        normalization(adv_x[i:(i + 1)])

    def classify(batch):
        # Label plus the per-class values from my_model_argmax (indexed by
        # label below; presumably probabilities).
        label = model_argmax(sess, x, predictions, batch)
        probs = my_model_argmax(sess, x, predictions, batch)
        return label, probs

    def adaptive_filter(batch):
        # Pick the filter from the image entropy: coarse quantization for
        # low entropy, finer for medium, quantization plus cross mean filter
        # (whichever result chooseCloserFilter selects) for high entropy.
        image = np.reshape(batch, (28, 28))
        entropy = oneDEntropy(np.array(image))
        if entropy < 4:
            filtered = scalarQuantization(image, 128)
        elif entropy < 5:
            filtered = scalarQuantization(image, 64)
        else:
            quantized = scalarQuantization(image, 43)
            smoothed = crossMeanFilterOperations(quantized, 3, 25, 13)
            filtered = chooseCloserFilter(image, quantized, smoothed)
        return entropy, np.reshape(filtered, X_test[0:1].shape)

    original_classified_wrong_number = 0
    disturbed_failure_number = 0
    test_number = 0
    TTP = 0
    TP = 0
    FN = 0
    FP = 0

    for i in range(len(adv_x)):
        clean_batch = X_test[i + first_sample:(i + first_sample + 1)]
        adv_batch = adv_x[i:(i + 1)]
        current_class = int(np.argmax(Y_test[i + first_sample]))

        currentXLabel, currentXProbList = classify(clean_batch)
        if currentXLabel != current_class:
            # The model is already wrong on the clean image: skip.
            original_classified_wrong_number += 1
            continue

        currentAdvXLabel, currentAdvXProbList = classify(adv_batch)
        if currentAdvXLabel == currentXLabel:
            # The attack did not change the prediction: skip.
            disturbed_failure_number += 1
            continue

        test_number += 1
        print('probabilities = %.4f ; %.4f' %
              (currentXProbList[currentXLabel],
               currentAdvXProbList[currentAdvXLabel]))

        imageEntropy, current_x_res = adaptive_filter(clean_batch)
        current_x_res_label, current_x_res_prob = classify(current_x_res)

        imageEntropy2, current_adv_x_res = adaptive_filter(adv_batch)
        print('%d: %.2f------%.2f' % (i, imageEntropy, imageEntropy2))
        current_adv_x_res_label, current_adv_x_res_prob = \
            classify(current_adv_x_res)

        print('filtered Probs = %.4f ; %.4f' %
              (current_x_res_prob[current_x_res_label],
               current_adv_x_res_prob[current_adv_x_res_label]))

        if current_adv_x_res_label != currentAdvXLabel:
            TP += 1
            if current_adv_x_res_label == current_class:
                TTP += 1
        else:
            FN += 1
        if current_x_res_label != currentXLabel:
            FP += 1

    str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
        test_number, original_classified_wrong_number,
        disturbed_failure_number, TP, FN, FP, TTP)
    print(str1)

    # Force float division (correct on Python 2 as well) and report 0.0
    # instead of raising when no sample reached the filtering stage.
    Recall = float(TP) / (TP + FN) if (TP + FN) else 0.0
    Precision = float(TP) / (TP + FP) if (TP + FP) else 0.0

    tempStarStr = '********************************************************'
    recallStr = 'Recall = %.4f' % (Recall)
    precisionStr = 'Precision = %.4f' % (Precision)
    print(tempStarStr)
    print(recallStr)
    print(precisionStr)
    print(tempStarStr)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False, testing=False):
    """Sweep scalar-quantization interval sizes against FGSM (eps=0.2).

    A CNN is trained (or restored from ``train_dir``), FGSM examples are
    crafted for the first 4500 test samples, and for every interval size the
    first (up to) 1000 samples are quantized and re-classified. Samples the
    model misclassifies, and attacks that fail, are skipped.

    Counters printed per interval size:
      * TP  - quantization changed the label of the adversarial example
      * TTP - ... and the new label is the true class
      * FN  - quantization left the adversarial label unchanged
      * FP  - quantization changed the label of the clean image

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: directory searched for / receiving the checkpoint
    :param filename: checkpoint file name
    :param load_model: restore a checkpoint instead of training, if one exists
    :param testing: unused in this function
    :return: the AccuracyReport filled in by the evaluation callback
    """
    keras.layers.core.K.set_learning_phase(0)
    report = AccuracyReport()
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, predictions, X_test, Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Train an MNIST model (or restore it from the latest checkpoint)
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True, rng=rng)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[:4500]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    # Normalize every adversarial sample that was actually generated (the
    # count used to be hard-coded to 4500).
    for i in range(len(adv_x)):
        normalization(adv_x[i:(i + 1)])

    print('adversarial examples generation time = ', advGenTime, 'seconds')

    def quantized_label(batch, interval_size, batch_shape):
        # Quantize the 28x28 image and classify the result.
        image = np.reshape(batch, (28, 28))
        image = scalarQuantization(image, interval_size)
        image = np.reshape(image, batch_shape)
        return model_argmax(sess, x, predictions, image)

    # At most 1000 samples are evaluated; never index past the generated set.
    nb_samples = min(1000, len(adv_x))

    intervals = [128, 85, 64, 51, 43, 37, 32, 28, 26]
    for intervalIndex, interval_size in enumerate(intervals):
        startTime = time.time()
        print('NBinterval = ', intervalIndex + 2, '; interval size = ',
              interval_size)

        original_classified_wrong_number = 0
        disturbed_failure_number = 0
        test_number = 0
        TTP = 0
        TP = 0
        FN = 0
        FP = 0

        for i in range(nb_samples):
            clean_batch = X_test[i:(i + 1)]
            adv_batch = adv_x[i:(i + 1)]
            current_class = int(np.argmax(Y_test[i]))

            currentXLabel = model_argmax(sess, x, predictions, clean_batch)
            if currentXLabel != current_class:
                # The model is already wrong on the clean image: skip.
                original_classified_wrong_number += 1
                continue

            currentAdvXLabel = model_argmax(sess, x, predictions, adv_batch)
            if currentAdvXLabel == currentXLabel:
                # The attack did not change the prediction: skip.
                disturbed_failure_number += 1
                continue

            test_number += 1

            currentXFilteredLabel = quantized_label(
                clean_batch, interval_size, clean_batch.shape)
            currentAdvXFilteredLabel = quantized_label(
                adv_batch, interval_size, clean_batch.shape)

            if currentAdvXFilteredLabel != currentAdvXLabel:
                TP += 1
                if currentAdvXFilteredLabel == current_class:
                    TTP += 1
            else:
                FN += 1
            if currentXFilteredLabel != currentXLabel:
                FP += 1

            # Progress report every 1000 samples.
            if (i + 1) % 1000 == 0:
                str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
                    test_number, original_classified_wrong_number,
                    disturbed_failure_number, TP, FN, FP, TTP)
                print(str1)

        str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
            test_number, original_classified_wrong_number,
            disturbed_failure_number, TP, FN, FP, TTP)
        print(str1)

        endTime = time.time()
        print('lasting ', endTime - startTime, 'seconds')

        # Force float division (correct on Python 2 as well) and report 0.0
        # instead of raising when no sample reached the filtering stage.
        Recall = float(TP) / (TP + FN) if (TP + FN) else 0.0
        Precision = float(TP) / (TP + FP) if (TP + FP) else 0.0

        tempStarStr = '********************************************************'
        recallStr = 'Recall = %.4f' % (Recall)
        precisionStr = 'Precision = %.4f' % (Precision)
        print(tempStarStr)
        print(recallStr)
        print(precisionStr)
        print(tempStarStr)

    return report
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial.

    Trains a CNN on CIFAR10, crafts adversarial examples for the whole test
    set with the ``MIM`` attack, reports the accuracy on them, and prints how
    many adversarial examples are still classified as their true label.

    :param argv: unused; present for ``tf.app.run``-style entry points
    :return: None
    """
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test
        # examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess, x, y, predictions, X_test, Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train an CIFAR10 model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess, x, y, predictions, X_train, Y_train,
                evaluate=evaluate, args=train_params)

    # Craft adversarial examples with the MIM attack
    mim = MIM(model, back='tf', sess=sess)
    mim_params = {
        'eps_iter': 0.06,
        'eps': 0.3,
        'nb_iter': 10,
        'ord': 2,
        'decay_factor': 1.0
    }
    adv_x = mim.generate(x, **mim_params)

    eval_params = {'batch_size': FLAGS.batch_size}
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape
    accuracy = model_eval(sess, x, y, predictions, X_test_adv, Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    # NOTE: image dumping via scipy.misc.imsave was already disabled here.
    # The leftover import is dropped because scipy.misc.imsave no longer
    # exists in SciPy >= 1.2 and would abort the run before the summary.
    preds = model_argmax(sess, x, predictions, X_test_adv)
    print(Y_test.shape)
    print(preds.shape)

    # Count adversarial examples still classified as their true label.
    true_labels = np.argmax(Y_test[:len(preds)], axis=1)
    count = int(np.sum(true_labels == preds))
    print("saved  " + str(count))
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False, testing=False):
    """Per-entropy-bucket statistics of scalar quantization vs FGSM (eps=0.2).

    A CNN is trained (or restored from ``train_dir``), FGSM examples are
    crafted for test samples 4500..5499, and each clean and adversarial pair
    is quantized with an interval size chosen by image entropy
    (<4: 128, <5: 64, otherwise 43). Samples the model misclassifies, and
    attacks that fail, are skipped.

    Counters kept per bucket (low / mid / high entropy):
      * count - images (clean and adversarial) that fell in the bucket
      * TP    - quantization changed the label of the adversarial example
      * FN    - quantization left the adversarial label unchanged
      * FP    - the quantized clean image is no longer classified correctly

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: directory searched for / receiving the checkpoint
    :param filename: checkpoint file name
    :param load_model: restore a checkpoint instead of training, if one exists
    :param testing: unused in this function
    :return: the AccuracyReport filled in by the evaluation callback
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, predictions, X_test, Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Train an MNIST model (or restore it from the latest checkpoint)
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path
    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True, rng=rng)

    # Slice of the test set that is attacked.
    first_sample, last_sample = 4500, 5500

    wrap = KerasModelWrapper(model)
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x,
                     feed_dict={x: X_test[first_sample:last_sample]})

    # Normalize every adversarial sample that was actually generated (the
    # count used to be hard-coded to 1000).
    for i in range(len(adv_x)):
        normalization(adv_x[i:(i + 1)])

    bucket_names = ('low', 'mid', 'high')
    stats = dict((name, {'count': 0, 'TP': 0, 'FN': 0, 'FP': 0})
                 for name in bucket_names)

    def entropy_bucket(entropy):
        # Map an image entropy to (bucket name, quantization interval size).
        if entropy < 4:
            return 'low', 128
        if entropy < 5:
            return 'mid', 64
        return 'high', 43

    def quantize_and_classify(batch):
        # Returns (entropy, bucket name, label of the quantized image).
        image = np.reshape(batch, (28, 28))
        entropy = oneDEntropy(np.array(image))
        bucket, interval_size = entropy_bucket(entropy)
        stats[bucket]['count'] += 1
        filtered = scalarQuantization(image, interval_size)
        filtered = np.reshape(filtered, X_test[0:1].shape)
        label = model_argmax(sess, x, predictions, filtered)
        return entropy, bucket, label

    original_classified_wrong_number = 0
    disturbed_failure_number = 0

    for i in range(len(adv_x)):
        clean_batch = X_test[i + first_sample:(i + first_sample + 1)]
        adv_batch = adv_x[i:(i + 1)]
        current_class = int(np.argmax(Y_test[first_sample + i]))

        currentXLabel = model_argmax(sess, x, predictions, clean_batch)
        if currentXLabel != current_class:
            # The model is already wrong on the clean image: skip.
            original_classified_wrong_number += 1
            continue

        currentAdvXLabel = model_argmax(sess, x, predictions, adv_batch)
        if currentAdvXLabel == currentXLabel:
            # The attack did not change the prediction: skip.
            disturbed_failure_number += 1
            continue

        # Clean image: a changed label after quantization is a false positive.
        imageEntropy, clean_bucket, current_x_res_label = \
            quantize_and_classify(clean_batch)
        if current_x_res_label != current_class:
            stats[clean_bucket]['FP'] += 1

        # Adversarial image: a changed label is a true positive, an unchanged
        # one a false negative -- counted in the image's OWN bucket (the mid
        # bucket used to add its misses to the high bucket).
        imageEntropy2, adv_bucket, current_adv_x_res_label = \
            quantize_and_classify(adv_batch)
        print('%d: %.2f------%.2f' % (i, imageEntropy, imageEntropy2))
        if current_adv_x_res_label != currentAdvXLabel:
            stats[adv_bucket]['TP'] += 1
        else:
            stats[adv_bucket]['FN'] += 1

    print('%d-%d' % (original_classified_wrong_number,
                     disturbed_failure_number))
    for name in bucket_names:
        bucket = stats[name]
        print('%d : %sTP = %d; %sFN = %d; %sFP = %d' %
              (bucket['count'], name, bucket['TP'], name, bucket['FN'],
               name, bucket['FP']))

    def ratio(numerator, denominator):
        # Float division that yields 0.0 for an empty bucket instead of
        # raising ZeroDivisionError.
        return float(numerator) / denominator if denominator else 0.0

    for name in bucket_names:
        bucket = stats[name]
        recall = ratio(bucket['TP'], bucket['TP'] + bucket['FN'])
        precision = ratio(bucket['TP'], bucket['TP'] + bucket['FP'])
        print(name + "Recall: ", recall)
        print(name + "Precision: ", precision)

    return report
def deepfool_attack_L2(sess, x, predictions, logits, grads, sample,
                       nb_candidate, overshoot, max_iter, clip_min, clip_max,
                       feed=None):
    """
    TensorFlow implementation of DeepFool (L2 version).
    Paper link: see https://arxiv.org/pdf/1511.04599.pdf
    ("On Detecting Adversarial Perturbations" reports that DeepFool has
    both L2 and L_infinity versions.)

    :param sess: TF session
    :param x: The input placeholder
    :param predictions: The model's sorted symbolic output of logits, only
                        the top nb_candidate classes are contained
    :param logits: The model's unnormalized output tensor (the input to
                   the softmax layer)
    :param grads: Symbolic gradients of the top nb_candidate classes,
                  produced from gradient_graph
    :param sample: Numpy array with sample input
    :param nb_candidate: The number of classes to test against, i.e.,
                         deepfool only considers nb_candidate classes when
                         attacking (thus accelerating the attack). The
                         nb_candidate classes are chosen according to the
                         prediction confidence during implementation.
    :param overshoot: A termination criterion to prevent vanishing updates
    :param max_iter: Maximum number of iterations for DeepFool
    :param clip_min: Minimum value for components of the example returned
    :param clip_max: Maximum value for components of the example returned
    :param feed: Optional dict of extra feed entries; it is now used for
                 the gradient/prediction evaluation as well as for the
                 label queries.
    :return: Adversarial examples
    """
    adv_x = copy.copy(sample)

    # Initialize the loop variables
    iteration = 0
    current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
    if current.shape == ():
        current = np.array([current])
    r_tot = np.zeros(sample.shape)
    original = current  # use original label as the reference

    _logger.debug(
        "Starting DeepFool attack up to %s iterations", max_iter)

    # Repeat this main loop until we have achieved misclassification
    while np.any(current == original) and iteration < max_iter:
        # Honour the caller-supplied feed (previously ignored here) and
        # fetch gradients and predictions in a single session run.
        feed_dict = dict(feed) if feed is not None else {}
        feed_dict[x] = adv_x
        gradients, predictions_val = sess.run([grads, predictions],
                                              feed_dict=feed_dict)

        for idx in range(sample.shape[0]):
            # Samples that already changed label are left untouched.
            if current[idx] != original[idx]:
                continue

            pert = np.inf
            w = None
            for k in range(1, nb_candidate):
                w_k = gradients[idx, k, ...] - gradients[idx, 0, ...]
                f_k = predictions_val[idx, k] - predictions_val[idx, 0]
                # adding value 0.00001 to prevent f_k = 0
                pert_k = (abs(f_k) + 0.00001) / np.linalg.norm(w_k.flatten())
                if pert_k < pert:
                    pert = pert_k
                    w = w_k

            # No usable direction (zero gradient difference for every
            # candidate, or nb_candidate < 2): skip instead of injecting
            # inf/NaN or reusing a direction from another sample.
            if w is None or not np.isfinite(pert):
                continue

            r_i = pert * w / np.linalg.norm(w.flatten())
            r_tot[idx, ...] = r_tot[idx, ...] + r_i

        adv_x = np.clip(r_tot + sample, clip_min, clip_max)
        current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
        if current.shape == ():
            current = np.array([current])
        # Update loop variables
        iteration = iteration + 1

    # need more revision, including info like how many succeed
    _logger.info("%s out of %s become adversarial examples at iteration %s",
                 sum(current != original), sample.shape[0], iteration)
    # need to clip this image into the given range
    adv_x = np.clip((1 + overshoot) * r_tot + sample, clip_min, clip_max)
    return adv_x
def do_jsma():
    """
    Run a targeted JSMA attack on the first `source_samples` test inputs.

    Every sample is attacked once per class other than its true label.
    Success flags and the fraction of modified features are recorded per
    (target, sample) pair, summary statistics are printed, and
    `report.clean_train_adv_eval` is set to 1 - success rate.

    Relies on names from the enclosing scope (model, sess, x, preds,
    X_test, Y_test, source_samples, nb_classes, img_rows, img_cols,
    channels, FLAGS, report).

    :return: the `report` object after updating it
    """
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    # (builtin range works on both Python 2 and 3)
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the
        # diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(
                adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols)),
                    np.reshape(adv_x, (img_rows, img_cols)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(
        succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm. Only the
    # (target, sample) pairs that were actually attacked count: the slot of
    # each sample's own class is never filled and used to dilute the mean.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(
        percent_perturbed))

    # Compute the average distortion introduced for successful samples only.
    # The previous np.mean(perturbations * (results == 1)) divided by the
    # number of all slots, not by the number of successes.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = float(np.mean(perturbations[succeeded]))
    else:
        percent_perturb_succ = 0.0
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    if FLAGS.viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def attack(self, x_val, targets):
    """
    Perform the attack on the given instance for the given targets.

    For each binary-search step, L-BFGS-B minimises `self.loss` inside the
    box [self.clip_min, self.clip_max]; the per-example constant is then
    tightened on success and enlarged on failure.

    :param x_val: numpy array of inputs; clipped to the box before use
    :param targets: per-example target scores; the target label of
                    example e is np.argmax(targets[e])
    :return: numpy array shaped like x_val holding, per example, the
             successful adversarial input with the smallest squared L2
             distortion found (the clipped original if none succeeded)
    """

    def lbfgs_objective(adv_x, self, targets, oimgs, CONST):
        # returns the function value and the gradient for fmin_l_bfgs_b;
        # both are fetched in one session run instead of two identical ones
        loss, grad = self.sess.run(
            [self.loss, self.grad],
            feed_dict={
                self.x: adv_x.reshape(oimgs.shape),
                self.targeted_label: targets,
                self.ori_img: oimgs,
                self.const: CONST
            })
        return loss, grad.flatten().astype(float)

    # begin the main part for the attack
    from scipy.optimize import fmin_l_bfgs_b
    oimgs = np.clip(x_val, self.clip_min, self.clip_max)
    CONST = np.ones(self.batch_size) * self.initial_const

    # set the lower and upper bounds accordingly
    lower_bound = np.zeros(self.batch_size)
    upper_bound = np.ones(self.batch_size) * 1e10

    # set the box constraints for the optimization function
    clip_min = self.clip_min * np.ones(oimgs.shape[:])
    clip_max = self.clip_max * np.ones(oimgs.shape[:])
    clip_bound = list(zip(clip_min.flatten(), clip_max.flatten()))

    # placeholders for the best l2 and instance attack found so far
    o_bestl2 = np.full(self.batch_size, 1e10)
    o_bestattack = np.copy(oimgs)

    # target label of every example, computed once
    target_labels = [np.argmax(targets[e]) for e in range(self.batch_size)]

    for outer_step in range(self.binary_search_steps):
        _logger.debug(" Binary search step %s of %s",
                      outer_step, self.binary_search_steps)

        # The last iteration (if we run many steps) repeat the search once.
        if self.repeat and outer_step == self.binary_search_steps - 1:
            # copy: CONST is updated in place below and must not alias
            # upper_bound
            CONST = upper_bound.copy()

        # optimization function
        adv_x, _, __ = fmin_l_bfgs_b(
            lbfgs_objective,
            oimgs.flatten().astype(float),
            args=(self, targets, oimgs, CONST),
            bounds=clip_bound,
            maxiter=self.max_iterations,
            iprint=0)

        adv_x = adv_x.reshape(oimgs.shape)
        assert np.amax(adv_x) <= self.clip_max and \
            np.amin(adv_x) >= self.clip_min, \
            'fmin_l_bfgs_b returns are invalid'

        # adjust the best result (i.e., the adversarial example with the
        # smallest perturbation in terms of L_2 norm) found so far
        preds = np.atleast_1d(
            utils_tf.model_argmax(self.sess, self.x, self.logits, adv_x))
        _logger.debug("predicted labels are %s", preds)

        l2s = np.zeros(self.batch_size)
        for i in range(self.batch_size):
            l2s[i] = np.sum(np.square(adv_x[i] - oimgs[i]))

        for e, (l2, pred, ii) in enumerate(zip(l2s, preds, adv_x)):
            if l2 < o_bestl2[e] and pred == target_labels[e]:
                o_bestl2[e] = l2
                o_bestattack[e] = ii

        # adjust the constant as needed
        for e in range(self.batch_size):
            if preds[e] == target_labels[e]:
                # success, divide const by two
                upper_bound[e] = min(upper_bound[e], CONST[e])
                if upper_bound[e] < 1e9:
                    CONST[e] = (lower_bound[e] + upper_bound[e]) / 2
            else:
                # failure, either multiply by 10 if no solution found yet
                # or do binary search with the known upper bound
                lower_bound[e] = max(lower_bound[e], CONST[e])
                if upper_bound[e] < 1e9:
                    CONST[e] = (lower_bound[e] + upper_bound[e]) / 2
                else:
                    CONST[e] *= 10

        _logger.debug(" Successfully generated adversarial examples "
                      "on %s of %s instances.",
                      sum(upper_bound < 1e9), self.batch_size)
        successful_l2 = o_bestl2[o_bestl2 < 1e9]
        # same value as before (nan when nothing succeeded yet), without
        # numpy's empty-slice warning
        if successful_l2.size:
            mean = np.mean(np.sqrt(successful_l2))
        else:
            mean = float('nan')
        _logger.debug(" Mean successful distortion: {:.4g}".format(mean))

    # return the best solution found
    return o_bestattack
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64):
    """
    MNIST cleverhans tutorial

    Trains a clean model (optional), crafts and saves FGSM adversarial
    examples, adversarially trains a second model, then runs a targeted
    JSMA attack on the second model and saves the successful examples.
    Files written: 'adversarial_fgsm' and 'adv_test_fgsm_data.npz' (only
    when clean_train is true) and 'adversarial_jsma_actual_full'.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: passed to make_basic_cnn for both models
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        print(adv_x)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}

        # Boolean mask (one entry per example) of inputs whose adversarial
        # prediction differs from the true label, i.e. successful attacks
        if LooseVersion(tf.__version__) >= LooseVersion('1.0.0'):
            misclassified = tf.not_equal(tf.argmax(y, axis=-1),
                                         tf.argmax(preds_adv, axis=-1))
        else:
            misclassified = tf.not_equal(
                tf.argmax(y, axis=tf.rank(y) - 1),
                tf.argmax(preds_adv, axis=tf.rank(preds_adv) - 1))

        success_adv_x = tf.boolean_mask(adv_x, misclassified)
        success_clean_x = tf.boolean_mask(x, misclassified)
        success_clean_y = tf.boolean_mask(y, misclassified)
        fgsm_adv_x, fgsm_clean_x, fgsm_clean_y = sess.run(
            [success_adv_x, success_clean_x, success_clean_y],
            feed_dict={x: X_test, y: Y_test})
        np.savez('adversarial_fgsm', adv_examples=fgsm_adv_x,
                 adv_clean_labels=fgsm_clean_y,
                 adv_clean_examples=fgsm_clean_x)
        print("the shape of adversarial examples we save is ",
              np.shape(fgsm_adv_x))
        print("the shape of clean targets we save is ",
              np.shape(fgsm_clean_y))

        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test,
                         args=eval_par)
        print('Test accuracy on adversarial examples fgsm: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        adv_x_test_for_save = sess.run(adv_x, {x: X_test})
        np.savez("adv_test_fgsm_data.npz",
                 adv_examples=adv_x_test_for_save,
                 adv_clean_labels=Y_test, adv_clean_examples=X_test)

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train,
                             Y_train, args=eval_par)
            report.train_clean_train_adv_eval = acc

        print("Repeating the process, using adversarial training")

    # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    adv_x_2 = fgsm2.generate(x, **fgsm_params)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial
        # examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    print("pred_adv", preds_2_adv.get_shape())
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    # Never index past the loaded test set when test_start/test_end select
    # fewer than 10000 examples.
    source_samples = min(10000, X_test.shape[0])
    nb_classes = 10
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model_2, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    # NOTE(review): JSMA attacks model_2, but success has always been judged
    # with the clean model's `preds` -- confirm this is intended. `preds`
    # does not exist when clean_train is False (previously a NameError), so
    # the attacked model's predictions are used in that case.
    success_preds = preds if clean_train else preds_2

    # Successful examples are collected in lists and concatenated once at
    # the end; np.append in the loop re-copied every array on each success.
    adv_examples = []        # adversarial inputs
    adv_targets = []         # one-hot target labels
    adv_clean_labels = []    # corresponding clean/correct labels
    adv_clean_examples = []  # corresponding clean inputs

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind+1)]  # from testing data

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, success_preds, adv_x) == target)

            if res == 1:
                # The clean sample is stored once per successful
                # adversarial example so that all four arrays stay aligned.
                adv_examples.append(adv_x)
                adv_targets.append(one_hot_target)
                adv_clean_labels.append(
                    np.expand_dims(Y_test[sample_ind], axis=0))
                adv_clean_examples.append(sample)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(
                adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # The leading zero-row float64 array keeps the saved arrays' dtype
    # (float64) and shapes, including the "no success" case, identical to
    # the former np.empty + np.append + [1:] construction.
    adv_examples = np.concatenate(
        [np.empty([0, 28, 28, 1])] + adv_examples, axis=0)
    adv_targets = np.concatenate(
        [np.empty([0, 10])] + adv_targets, axis=0)
    adv_clean_labels = np.concatenate(
        [np.empty([0, 10])] + adv_clean_labels, axis=0)
    adv_clean_examples = np.concatenate(
        [np.empty([0, 28, 28, 1])] + adv_clean_examples, axis=0)
    np.savez('adversarial_jsma_actual_full', adv_examples=adv_examples,
             adv_targets=adv_targets, adv_clean_labels=adv_clean_labels,
             adv_clean_examples=adv_clean_examples)
    print(np.shape(adv_targets)[0], "adversarial examples have been saved.")
    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_test_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm, over the
    # (target, sample) pairs actually attacked; the never-filled slot of
    # each sample's own class used to dilute this mean.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only.
    # The previous np.mean(perturbations * (results == 1)) divided by the
    # number of all slots, not by the number of successes.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = float(np.mean(perturbations[succeeded]))
    else:
        percent_perturb_succ = 0.0
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    return report
def deepfool_attack(sess, x, predictions, logits, sample, nb_candidate=10,
                    overshoot=0.02, max_iter=50, clip_min=0.0, clip_max=1.0,
                    feed=None):
    """
    TensorFlow implementation of DeepFool.
    Paper link: see https://arxiv.org/pdf/1511.04599.pdf

    The gradient graph is built inside this function with
    jacobian_graph(predictions, x, nb_candidate).

    :param sess: TF session
    :param x: The input placeholder
    :param predictions: The model's sorted symbolic output of logits, only
                        the top nb_candidate classes are contained
    :param logits: The model's unnormalized output tensor (the input to
                   the softmax layer)
    :param sample: Numpy array with sample input
    :param nb_candidate: The number of classes to test against, i.e.,
                         deepfool only considers nb_candidate classes when
                         attacking (thus accelerating the attack)
    :param overshoot: A termination criterion to prevent vanishing updates
    :param max_iter: Maximum number of iterations for DeepFool
    :param clip_min: Minimum value for components of the example returned
                     (currently unused: clipping is disabled below)
    :param clip_max: Maximum value for components of the example returned
                     (currently unused: clipping is disabled below)
    :param feed: Optional dict of extra feed entries. May be None (the
                 default); the caller's dict is no longer modified.
    :return: an adversarial sample (not clipped)
    """
    import copy

    adv_x = copy.copy(sample)

    # Initialize the loop variables
    iteration = 0
    current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
    if current.shape == ():
        current = np.array([current])
    r_tot = np.zeros(sample.shape)
    original = current  # use original label as the reference

    # NOTE(review): this adds gradient ops to the graph on every call;
    # presumably acceptable for occasional use -- confirm for batch loops.
    grads = jacobian_graph(predictions, x, nb_candidate)

    # Work on a private copy: the default feed=None used to crash on
    # feed.update(), and a caller-supplied dict used to be mutated.
    feed_dict = dict(feed) if feed is not None else {}

    # Repeat this main loop until we have achieved misclassification
    while np.any(current == original) and iteration < max_iter:
        feed_dict[x] = adv_x
        # One session run fetches both gradients and predictions.
        gradients, predictions_val = sess.run([grads, predictions],
                                              feed_dict=feed_dict)

        for idx in range(sample.shape[0]):
            # Samples that already changed label are left untouched.
            if current[idx] != original[idx]:
                continue

            pert = np.inf
            w = None
            for k in range(1, nb_candidate):
                w_k = gradients[k][idx, ...] - gradients[0][idx, ...]
                f_k = predictions_val[idx, k] - predictions_val[idx, 0]
                # adding value 1e-30 to prevent f_k = 0
                pert_k = (abs(f_k) + 1e-30) / np.linalg.norm(w_k.flatten())
                if pert_k < pert:
                    pert = pert_k
                    w = w_k

            # No usable direction (zero gradient difference for every
            # candidate, or nb_candidate < 2): skip instead of injecting
            # inf/NaN or reusing a direction from another sample.
            if w is None or not np.isfinite(pert):
                continue

            r_i = pert * w / np.linalg.norm(w)
            r_tot[idx, ...] = r_tot[idx, ...] + r_i

        # adv_x = np.clip(r_tot + sample, clip_min, clip_max)
        adv_x = r_tot + sample
        feed_dict[x] = adv_x
        current = utils_tf.model_argmax(sess, x, logits, adv_x,
                                        feed=feed_dict)
        if current.shape == ():
            current = np.array([current])
        # Update loop variables
        iteration = iteration + 1

    # need to clip this image into the given range
    # adv_x = np.clip((1+overshoot)*r_tot + sample, clip_min, clip_max)
    adv_x = (1 + overshoot) * r_tot + sample
    return adv_x
def jsma(self, sess, x, predictions, grads, sample, target, theta, gamma,
         clip_min, clip_max, feed=None):
    """
    TensorFlow implementation of the JSMA (see https://arxiv.org/abs/1511.07528
    for details about the algorithm design choices).

    :param sess: TF session
    :param x: the input placeholder
    :param predictions: the model's symbolic output (the attack expects the
                        probabilities, i.e., the output of the softmax, but
                        will also work with logits typically)
    :param grads: symbolic gradients
    :param sample: numpy array with sample input
    :param target: target class for sample input
    :param theta: delta for each feature adjustment
    :param gamma: a float between 0 - 1 indicating the maximum distortion
                  percentage
    :param clip_min: minimum value for components of the example returned
    :param clip_max: maximum value for components of the example returned
    :param feed: optional feed passed through to jacobian and
                 utils_tf.model_argmax
    :return: a tuple (adv_x, percent_perturbed, confused_at, success_at,
             orig_label, current): the adversarial sample in the original
             shape; the fraction of features perturbed; the fraction of
             features perturbed when the prediction first left the
             original label (0 if never); the same when the target class
             was reached (0 if never); the initial predicted label; the
             final predicted label
    """

    # Copy the source sample and define the maximum number of features
    # (i.e. the maximum number of iterations) that we may perturb
    adv_x = copy.copy(sample)
    # count the number of features. For MNIST, 1x28x28 = 784; for
    # CIFAR, 3x32x32 = 3072; etc.
    # (np.prod: the np.product alias was removed in NumPy 2.0)
    nb_features = np.prod(adv_x.shape[1:])
    # reshape sample for sake of standardization
    original_shape = adv_x.shape
    adv_x = np.reshape(adv_x, (1, nb_features))
    # compute maximum number of iterations
    max_iters = np.floor(nb_features * gamma / 2)

    # Find number of classes based on grads
    nb_classes = len(grads)

    increase = bool(theta > 0)

    # Compute our initial search domain. We optimize the initial search domain
    # by removing all features that are already at their maximum values (if
    # increasing input features---otherwise, at their minimum value).
    if increase:
        search_domain = set([i for i in range(nb_features)
                             if adv_x[0, i] < clip_max])
    else:
        search_domain = set([i for i in range(nb_features)
                             if adv_x[0, i] > clip_min])

    # Initialize the loop variables
    iteration = 0
    adv_x_original_shape = np.reshape(adv_x, original_shape)
    current = utils_tf.model_argmax(sess, x, predictions,
                                    adv_x_original_shape, feed=feed)

    # charlee: Used to log when the model gets confused
    orig_label = current
    confused_at = 0
    success_at = 0

    # Consecutive-hit counters. The thresholds below are currently 1, so
    # the first confused / successful prediction is recorded immediately.
    confuse_count = 0
    success_count = 0

    # Progress is logged roughly five times per attack; for very small
    # max_iters the interval is 0 and nothing is logged (this used to be a
    # modulo by zero that evaluated to NaN).
    log_every = (max_iters + 1) // 5

    logger.debug("Starting JSMA attack up to {} iterations".format(max_iters))
    # Repeat this main loop until we have achieved misclassification
    while (success_at == 0 and iteration < max_iters and
           len(search_domain) > 1):
        # Reshape the adversarial example
        adv_x_original_shape = np.reshape(adv_x, original_shape)

        # Compute the Jacobian components
        grads_target, grads_others = jacobian(sess, x, grads, target,
                                              adv_x_original_shape,
                                              nb_features, nb_classes,
                                              feed=feed)

        if log_every > 0 and iteration > 0 and iteration % log_every == 0:
            logger.debug("Iteration {} of {}".format(iteration,
                                                     int(max_iters)))
        # Compute the saliency map for each of our target classes
        # and return the two best candidate features for perturbation
        i, j, search_domain = saliency_map(
            grads_target, grads_others, search_domain, increase)

        # Apply the perturbation to the two input features selected previously
        adv_x = apply_perturbations(
            i, j, adv_x, increase, theta, clip_min, clip_max)

        # Update our current prediction by querying the model on the
        # perturbed input. Reshape again so this does not depend on
        # apply_perturbations modifying the earlier view in place.
        adv_x_original_shape = np.reshape(adv_x, original_shape)
        current = utils_tf.model_argmax(sess, x, predictions,
                                        adv_x_original_shape, feed=feed)

        # Update loop variables
        iteration = iteration + 1

        # charlee: Record the iteration when model gets confused
        if current != orig_label and confused_at == 0:
            confuse_count += 1
            if confuse_count >= 1:
                confused_at = iteration
        else:
            confuse_count = 0

        if current == target:
            success_count += 1
            if success_count >= 1:
                logger.info("Attack succeeded using {} iterations".format(
                    iteration))
                success_at = iteration
        else:
            success_count = 0

    if success_at == 0:
        logger.info(("Failed to find adversarial example " +
                     "after {} iterations").format(iteration))

    # Compute the ratio of pixels perturbed by the algorithm (two features
    # are modified per iteration)
    percent_perturbed = float(iteration * 2) / nb_features
    confused_at = float(confused_at * 2) / nb_features
    success_at = float(success_at * 2) / nb_features

    # Report success when the adversarial example is misclassified in the
    # target class
    return (np.reshape(adv_x, original_shape), percent_perturbed,
            confused_at, success_at, orig_label, current)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA).

    This variant does not train: it reloads the checkpoint
    ``./clean_trained_mnist_model``, evaluates it on the test split stored
    in ``mnist.npz``, shuffles the test set with a fixed seed and attacks
    the first ``source_samples`` inputs once per target class returned by
    ``other_classes``. It prints a per-(true class, target) success-rate
    table plus aggregate success / misclassification / distortion rates.

    :param train_start: unused by this variant (kept for compatibility)
    :param train_end: unused by this variant (kept for compatibility)
    :param test_start: index of first test set example (sanity check only)
    :param test_end: index of last test set example (sanity check only)
    :param viz_enabled: unused by this variant (plots are disabled)
    :param nb_epochs: unused by this variant (no training is performed)
    :param batch_size: size of evaluation batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: unused by this variant (no training is performed)
    :return: an AccuracyReport object (this variant leaves it unpopulated)
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(4254264)
    set_log_level(logging.DEBUG)

    # Only the test split is needed: the model is reloaded, not trained.
    with np.load("mnist.npz") as data:
        X_test, Y_test = data['X_test'], data['Y_test']

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    # Define TF model graph and restore the trained weights
    model_path = "./"
    model_name = "clean_trained_mnist_model"
    model = make_basic_cnn(nb_classes=nb_classes)
    if tf_model_load(sess, file_path=os.path.join(model_path, model_name)):
        print(model_name, " reloaded.")
    preds = model.get_probs(x)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Success: adversarial example classified as the requested target
    results = np.zeros((nb_classes, source_samples), dtype='i')
    # Misclassification: adversarial example no longer in its true class
    results2 = np.zeros((nb_classes, source_samples), dtype='i')
    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')
    # Which (target, sample) cells were actually attacked. Cells that were
    # never attacked stay zero above and must not enter the averages.
    attempted = np.zeros((nb_classes, source_samples), dtype=bool)

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None,
    }

    # Shuffle the test set deterministically so the attacked samples are a
    # reproducible random subset rather than the first rows of the file.
    rng = np.random.RandomState([1358, 23, 234])
    index_shuf = list(range(len(X_test)))
    rng.shuffle(index_shuf)
    X_test = X_test[index_shuf]
    Y_test = Y_test[index_shuf]

    # Number of attacked samples per true class, and per (true class,
    # target) count of successful attacks (normalised to a rate below).
    occurence = {cls: 0 for cls in range(nb_classes)}
    rate_table = np.zeros((nb_classes, nb_classes), dtype='f')

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the dataset label)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)
        occurence[current_class] += 1

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Query the model once and derive both outcomes from the label
            adv_label = model_argmax(sess, x, preds, adv_x)
            res = int(adv_label == target)
            res2 = int(adv_label != current_class)

            # If success, add one to the success-rate table
            if res == 1:
                rate_table[current_class, target] += 1.

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            results2[target, sample_ind] = res2
            perturbations[target, sample_ind] = percent_perturb
            attempted[target, sample_ind] = True

    print('--------------------------------------')

    # Close TF session
    sess.close()

    # Turn per-class success counts into rates
    for cur in range(nb_classes):
        if occurence[cur] != 0:
            rate_table[cur, :] /= float(occurence[cur])
    print("The table of rate of successful attacking is shown below")
    print(rate_table)
    print("the number of occurrence of each class is ", occurence)

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    succ_rate2 = float(np.sum(results2)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    print(
        'Avg. rate of misclassified adv. examples {0:.4f}'.format(succ_rate2))

    # Average distortion over the attacks that were actually run
    if attempted.any():
        percent_perturbed = perturbations[attempted].mean()
    else:
        percent_perturbed = 0.0
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Average distortion over successful attacks only: select the
    # successful cells instead of zeroing failures and dividing by all.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = perturbations[succeeded].mean()
    else:
        percent_perturb_succ = 0.0
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    return report
def mnist_tutorial_jsma(
    train_start=0,
    train_end=60000,
    test_start=0,
    test_end=10000,
    viz_enabled=VIZ_ENABLED,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    source_samples=SOURCE_SAMPLES,
    learning_rate=LEARNING_RATE,
):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA).

    Trains a basic CNN on MNIST, evaluates it on clean test data, then
    attacks the first ``source_samples`` test inputs once per target class
    returned by ``other_classes`` and reports success and distortion
    rates. The number of classes is taken from the label width of the
    training set.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST data
    mnist = MNIST(
        train_start=train_start,
        train_end=train_end,
        test_start=test_start,
        test_end=test_end,
    )
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Obtain image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN("model1", nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {"batch_size": batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print("Test accuracy on legitimate test examples: {0}".format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print("Crafting " + str(source_samples) + " * " + str(nb_classes - 1) +
          " adversarial examples")

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype="i")

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype="f")

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype="f")

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        "theta": 1.0,
        "gamma": 0.1,
        "clip_min": 0.0,
        "clip_max": 1.0,
        "y_target": None,
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print("--------------------------------------")
        print("Attacking input %i/%i" % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the dataset label)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print("Generating adv. example for target class %i" % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params["y_target"] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure,
                )

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print("--------------------------------------")

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = (nb_classes - 1) * source_samples
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print("Avg. rate of successful adv. examples {0:.4f}".format(succ_rate))
    report.clean_train_adv_eval = 1.0 - succ_rate

    # Average distortion over the cells where at least one feature changed.
    # Guard the empty case: np.mean of an empty selection is NaN.
    perturbed = perturbations != 0
    if perturbed.any():
        percent_perturbed = perturbations[perturbed].mean()
    else:
        percent_perturbed = 0.0
    print("Avg. rate of perturbed features {0:.4f}".format(percent_perturbed))

    # Average distortion over successful attacks only: select the
    # successful cells rather than zeroing failures and still counting
    # them in the denominator.
    succeeded = perturbed & (results == 1)
    if succeeded.any():
        percent_perturb_succ = perturbations[succeeded].mean()
    else:
        percent_perturb_succ = 0.0
    print("Avg. rate of perturbed features for successful "
          "adversarial examples {0:.4f}".format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt

        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def minist_fgsm_saliency(
    train_start=0,
    train_end=10,
    test_start=0,
    test_end=5,
    nb_epochs=2,
    batch_size=128,
    learning_rate=0.001,
    clean_train=True,
    testing=False,
    backprop_through_attack=False,
    nb_filters=64,
    nb_classes=10,
    source_samples=10,
):
    """
    MNIST cleverhans tutorial combining FGSM and saliency-map (JSMA) attacks.

    With ``clean_train`` set, a base CNN is trained on clean data, evaluated
    against FGSM, attacked with JSMA on the first ``source_samples`` training
    inputs, and a second CNN (``model_3``) is trained on the samples gathered
    during that attack. Afterwards a further CNN (``model_2``) is always
    trained with FGSM adversarial training.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: passed to ``make_basic_cnn`` for every model built
    :param nb_classes: number of output classes targeted by JSMA
    :param source_samples: number of training inputs attacked with JSMA
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing: the one-bit becomes 0.9 and each of the nine
    # zeroes becomes 0.1 / 9.
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    # Placeholder for a symbolic JSMA target. Currently unused, but kept so
    # the graph holds the same ops in the same order as before.
    y_target = tf.placeholder(tf.float32, shape=(None, 10))

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    ###########################################################################
    # Training the CNN model using TensorFlow: model --> base model
    ###########################################################################
    model = make_basic_cnn(nb_filters=nb_filters)
    preds = model.get_probs(x)

    # NOTE(review): everything that needs the trained base model is kept
    # under this guard, since the base model is only trained here. Confirm
    # this matches the intended scope of `clean_train`.
    if clean_train:

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_test, Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        # Train the base model using train_params
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds, X_train, Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        #######################################################################
        # Generate FGSM adversarial examples on the base model and
        # compute the base model accuracy on them
        #######################################################################
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params_y)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on FGSM adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train, Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

        #######################################################################
        # Generate saliency-map adversarial examples and compute the
        # base model's robustness against them
        #######################################################################
        print("Saliency Map Attack On The Base Model")
        print('Crafting ' + str(source_samples) + ' * ' +
              str(nb_classes - 1) + ' adversarial examples')

        # Instantiate a SaliencyMapMethod attack object; y_target is
        # replaced for every (sample, target) pair below.
        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        jsma_params = {
            'theta': 1.,
            'gamma': 0.1,
            'clip_min': 0.,
            'clip_max': 1.,
            'y_target': None
        }

        # Keep track of success (adversarial example classified in target);
        # needed to compute the success rate.
        results = np.zeros((nb_classes, source_samples), dtype='i')

        # Adversarial examples and the one-hot targets they were crafted
        # for, stacked in the same order.
        adv_x_set = None
        adv_y_target = None

        # Clean sample and its label, repeated once per attacked target.
        x_train_saliency = None
        y_train_saliency = None

        for sample_ind in range(0, source_samples):
            print('--------------------------------------')
            print('Saliency Attacking input %i/%i' %
                  (sample_ind + 1, source_samples))
            sample = X_train[sample_ind:(sample_ind + 1)]
            y_sample = Y_train[sample_ind:(sample_ind + 1)]

            current_class = int(np.argmax(Y_train[sample_ind]))
            target_classes = other_classes(nb_classes, current_class)

            # Loop over all target classes
            for target in target_classes:
                print('Generating adv. example for target class %i' % target)

                # Grow x_train_saliency and the matching y_train_saliency
                if x_train_saliency is not None:
                    x_train_saliency = np.concatenate(
                        (x_train_saliency, sample), axis=0)
                    y_train_saliency = np.concatenate(
                        (y_train_saliency, y_sample), axis=0)
                else:
                    x_train_saliency = sample
                    y_train_saliency = y_sample
                print("sample shape: ", x_train_saliency.shape)
                print("y_sample shape: ", y_train_saliency.shape)

                # This call runs the Jacobian-based saliency map approach
                one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
                one_hot_target[0, target] = 1
                jsma_params['y_target'] = one_hot_target
                adv_x_np = jsma.generate_np(sample, **jsma_params)

                # Append to both stacks so row i of adv_x_set pairs with
                # row i of adv_y_target.
                if adv_x_set is not None:
                    adv_y_target = np.concatenate(
                        (adv_y_target, one_hot_target), axis=0)
                    adv_x_set = np.concatenate((adv_x_set, adv_x_np), axis=0)
                else:
                    adv_y_target = one_hot_target
                    adv_x_set = adv_x_np
                print("adv_y_target shape(one-hot-encoding): ",
                      adv_y_target.shape)
                print("adv_x_set(np) shape: ", adv_x_np.shape)

                # Check if success was achieved
                res = int(model_argmax(sess, x, preds, adv_x_np) == target)

                # Update the arrays for later analysis
                results[target, sample_ind] = res

        print('--------------------------------------')

        # Compute the number of adversarial examples successfully found
        nb_targets_tried = ((nb_classes - 1) * source_samples)
        succ_rate = float(np.sum(results)) / nb_targets_tried
        print('Avg. rate of successful Saliency adv. examples {0:.4f}'.format(
            succ_rate))
        report.clean_train_adv_eval = 1. - succ_rate

        # The stacked sets are now available for training another model
        print("\n\n\n*****************************")
        print("Checking x_adv_set shape: ", adv_x_set.shape)
        print("Checking correct_y_set shape: ", adv_y_target.shape)
        print("x_training_saliency shape:", x_train_saliency.shape)
        print("y_training_saliency shape:", y_train_saliency.shape)

        # Construct model 3 and its predictions tensor for placeholder x
        model_3 = make_basic_cnn(nb_filters=nb_filters)
        preds_3 = model_3(x)

        def evaluate_saliency():
            # Accuracy of model 3 on the saliency training set
            eval_params = {'batch_size': batch_size}
            accuracy = model_eval(sess, x, y, preds_3, x_train_saliency,
                                  y_train_saliency, args=eval_params)
            print('Test accuracy on legitimate examples: %0.4f' % accuracy)
            report.adv_train_clean_eval = accuracy

        #######################################################################
        # MODEL train for saliency map
        #######################################################################
        # Pass the predictions tensor (not the model object) and the
        # callback itself (not its result) so it runs during training.
        # NOTE(review): this trains on the clean samples repeated per
        # target; adv_x_set is collected above but not used for training.
        model_train(sess, x, y, preds_3, x_train_saliency, y_train_saliency,
                    evaluate=evaluate_saliency, args=train_params, rng=rng)

    # Redefine TF model for FGSM adversarial training
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    adv_x_2 = fgsm2.generate(x, **fgsm_params_y)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    ###########################################################################
    # MODEL train for FGSM
    ###########################################################################
    # Perform and evaluate adversarial training with the FGSM model
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial extended with a cross-filter detection test.

    Trains (or reloads) a Keras CNN, crafts FGSM adversarial examples for up
    to the first 4500 test inputs, then for cross-shaped kernels of size
    3, 5, 7 and 9 filters both the clean and the adversarial image and
    counts how often filtering changes the predicted label. Recall and
    precision of "label changed under filtering" as an adversarial detector
    are printed for each kernel.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: accepted for compatibility; not read by this function
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # Create TF session (growing GPU memory on demand) and set it as the
    # Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholders
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, predictions, X_test, Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Train an MNIST model, or restore it when a checkpoint is available
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, predictions, X_train, Y_train,
                    evaluate=evaluate, args=train_params, save=True, rng=rng)

    # Never index past the test split when it holds fewer than 4500 rows
    nb_eval = min(4500, X_test.shape[0])

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[:nb_eval]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    # The return value is ignored here.
    # NOTE(review): presumably normalization() works in place; confirm.
    for i in range(nb_eval):
        normalization(adv_x[i:(i + 1)])

    print('adversarial examples generation time = ', advGenTime, 'seconds')

    # Cross filter test, kernel size: 3, 5, 7, 9. Each coefficient equals
    # the number of ones in the matching cross (2 * size - 1).
    coefficient = [5, 9, 13, 17]
    for kernelSize, kernelCoefficient in zip(range(3, 10, 2), coefficient):
        startTime = time.time()

        # Cross kernel: ones on the middle row and middle column
        start = (kernelSize - 1) // 2
        end = 28 - start
        cross = np.zeros((kernelSize, kernelSize), dtype=int)
        cross[start, :] = 1
        cross[:, start] = 1

        original_classified_wrong_number = 0
        disturbed_failure_number = 0
        test_number = 0
        TTP = 0  # detected AND filtering restored the true class
        TP = 0   # adversarial input whose label changed under filtering
        FN = 0   # adversarial input whose label survived filtering
        FP = 0   # clean input whose label changed under filtering

        print('cross filter')
        print(cross)

        for i in range(nb_eval):
            current_class = int(np.argmax(Y_test[i]))
            clean_input = X_test[i:(i + 1)]
            adv_input = adv_x[i:(i + 1)]

            # Skip inputs the model already gets wrong
            currentXLabel = model_argmax(sess, x, predictions, clean_input)
            if currentXLabel != current_class:
                original_classified_wrong_number += 1
                continue

            # Skip inputs on which the attack failed
            currentAdvXLabel = model_argmax(sess, x, predictions, adv_input)
            if currentAdvXLabel == currentXLabel:
                disturbed_failure_number += 1
                continue

            test_number += 1

            # Filter the clean image and re-classify it
            currentX = np.reshape(clean_input, (28, 28))
            currentX = diamondAndCrossFilterOperations(
                currentX, cross, start, end, kernelCoefficient)
            currentX = np.reshape(currentX, clean_input.shape)
            currentXFilteredLabel = model_argmax(sess, x, predictions,
                                                 currentX)

            # Filter the adversarial image and re-classify it
            currentAdvX = np.reshape(adv_input, (28, 28))
            currentAdvX = diamondAndCrossFilterOperations(
                currentAdvX, cross, start, end, kernelCoefficient)
            currentAdvX = np.reshape(currentAdvX, clean_input.shape)
            currentAdvXFilteredLabel = model_argmax(sess, x, predictions,
                                                    currentAdvX)

            if currentAdvXFilteredLabel != currentAdvXLabel:
                TP += 1
                if currentAdvXFilteredLabel == current_class:
                    TTP += 1
            else:
                FN += 1
            if currentXFilteredLabel != currentXLabel:
                FP += 1

            # Progress line roughly every 1000 inputs
            if (i + 1) % 1000 == 0:
                str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
                    test_number, original_classified_wrong_number,
                    disturbed_failure_number, TP, FN, FP, TTP)
                print(str1)

        str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
            test_number, original_classified_wrong_number,
            disturbed_failure_number, TP, FN, FP, TTP)
        print(str1)

        endTime = time.time()
        print('lasting ', endTime - startTime, 'seconds')

        # Force true division and report 0 when a denominator is empty
        # (no usable samples, or nothing flagged).
        Recall = float(TP) / (TP + FN) if (TP + FN) else 0.0
        Precision = float(TP) / (TP + FP) if (TP + FP) else 0.0

        tempStarStr = '********************************************************'
        recallStr = 'Recall = %.4f' % (Recall)
        precisionStr = 'Precision = %.4f' % (Precision)
        print(tempStarStr)
        print(recallStr)
        print(precisionStr)
        print(tempStarStr)

    return report
# Only target the normal class for target in [0]: if current_class == 0: break print('Generating adv. example for target class {} for sample {}'.format(target, sample_ind), end='\r') # Run the Jacobian-based saliency map approach one_hot_target = np.zeros((1, FLAGS.nb_classes), dtype=np.float32) one_hot_target[0, target] = 1 jsma_params['y_target'] = one_hot_target adv_x = jsma.generate_np(sample, **jsma_params) # Check if success was achieved res = int(model_argmax(sess, x, predictions, adv_x) == target) # Compute number of modified features adv_x_reshape = adv_x.reshape(-1) test_in_reshape = X_test_scaled[sample_ind].reshape(-1) nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0] percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0] X_adv[sample_ind] = adv_x results[target, sample_ind] = res perturbations[target, sample_ind] = percent_perturb print() print(X_adv.shape) print("=========================== Evaluation of MLP Performance ==============================")
def _count_modified_neighbors(mask):
    """
    Count modified pixels and their modified 8-connected neighbours.

    :param mask: 2-D boolean array, True where a pixel was modified
    :return: tuple ``(modified, neighbours)``; ``modified`` is the number of
             True pixels, ``neighbours`` is the sum over all True pixels of
             how many of their (up to 8) adjacent pixels are also True
    """
    mask = np.asarray(mask, dtype=bool)
    rows, cols = mask.shape
    # Zero-pad by one pixel so that border and corner pixels need no special
    # casing: every pixel is treated with the same 8-neighbourhood.
    padded = np.zeros((rows + 2, cols + 2), dtype=np.int64)
    padded[1:-1, 1:-1] = mask
    neighbour_counts = np.zeros((rows, cols), dtype=np.int64)
    for row_shift in (0, 1, 2):
        for col_shift in (0, 1, 2):
            if row_shift == 1 and col_shift == 1:
                continue  # the pixel itself is not its own neighbour
            neighbour_counts += padded[row_shift:row_shift + rows,
                                       col_shift:col_shift + cols]
    return int(mask.sum()), int(neighbour_counts[mask].sum())


def _min_max_or_nan(values):
    """
    Return ``(min, max)`` of an array, or ``(nan, nan)`` when it is empty.

    :param values: numpy array of numbers (any shape)
    :return: tuple of two Python floats
    """
    if values.size == 0:
        return float('nan'), float('nan')
    return float(np.min(values)), float(np.max(values))


def mnist_tutorial_jsma(train_start=0,
                        train_end=60000,
                        test_start=0,
                        test_end=10000,
                        viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS,
                        batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA).

    Trains a picklable CNN (or loads it from "test.joblib" when the
    module-level TRAIN_NEW flag is not 1), attacks randomly drawn test
    images towards every other class, and prints success / perturbation
    statistics together with how clustered the modified pixels are.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session. `num_threads` is a manual switch: set it to a truthy
    # value to restrict TensorFlow to a single intra-op thread.
    num_threads = None
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    image_shape = (img_rows, img_cols, nchannels)
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = make_basic_picklable_cnn()
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    # NOTE: the unused tf.data datasets that used to be built here were
    # removed; they hard-coded 60000/10000 examples and therefore failed for
    # any non-default train/test range.
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])

    if TRAIN_NEW == 1:
        with sess.as_default():
            train(sess, loss, x_train, y_train, args=train_params, rng=rng)
            save("test.joblib", model)
    else:
        with sess.as_default():
            model = load("test.joblib")
            assert len(model.get_params()) > 0
            # The graph tensors must be rebuilt from the loaded model
            preds = model.get_logits(x)
            loss = CrossEntropy(model, smoothing=0.1)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes) + image_shape
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    # Neighbour statistics; reset for every attacked sample below, so the
    # value printed after the loops describes the last sample only.
    # NOTE(review): the original layout is ambiguous here -- confirm whether a
    # per-sample or an overall average was intended.
    total_neighbors = 0
    total_modified = 0

    # Loop over the samples we want to perturb into adversarial examples
    seed(SEED)
    nb_test = x_test.shape[0]
    for sample_ind in range(source_samples):
        # Draw a test image at random. The upper bound is nb_test - 1 so the
        # index is valid whether `randint` treats it as inclusive or
        # exclusive (the import is not visible from this function).
        img = randint(0, nb_test - 1)
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[img:(img + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[img]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, image_shape)

        total_neighbors = 0
        total_modified = 0

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features, measured against the image
            # that was actually attacked (x_test[img], not x_test[sample_ind])
            adv_flat = adv_x.reshape(-1)
            nb_changed = int(np.count_nonzero(adv_flat != sample.reshape(-1)))
            percent_perturb = float(nb_changed) / adv_flat.shape[0]

            # Pixels count as modified when the change survives rounding to
            # 8-bit intensities. This reproduces in memory the uint8
            # quantisation the former PNG write/read round trip was used for.
            diff = np.reshape(adv_x - sample, image_shape) * 255
            modified_mask = np.any(np.rint(diff) > 0, axis=-1)
            modified, neighbors = _count_modified_neighbors(modified_mask)
            total_modified += modified
            total_neighbors += neighbors

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(np.reshape(sample, image_shape),
                                     np.reshape(adv_x, image_shape), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, image_shape)

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')
    if total_modified:
        avg_neighbors = float(total_neighbors) / total_modified
    else:
        # Nothing was modified: report nan instead of dividing by zero
        avg_neighbors = float('nan')
    print("average neighbors per modified pixel ", avg_neighbors)

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.8f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    perturbed = perturbations > 0
    min_perturbed, max_perturbed = _min_max_or_nan(perturbations[perturbed])
    print('Avg. rate of perturbed features {0:.8f}'.format(percent_perturbed))
    print('MIN of perturbed features {0:.8f}'.format(min_perturbed))
    print('MAX of perturbed features {0:.8f}'.format(max_perturbed))

    # Compute the average distortion introduced for successful samples only.
    # NOTE(review): this mean is taken over every (target, sample) slot, with
    # zeros for failed or untried ones -- confirm that is the intended figure.
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    min_perturb_succ, max_perturb_succ = _min_max_or_nan(
        perturbations[perturbed & (results > 0)])
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(percent_perturb_succ))
    print('Min of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(min_perturb_succ))
    print('Max of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(max_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    Train a basic CNN on MNIST, then attack it with the Jacobian-based
    saliency map approach (JSMA), targeting every other class for each of
    the first `source_samples` test images.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Holds the accuracies that are handed back to the caller
    report = AccuracyReport()

    # Fixed graph-level seed for reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Load the requested slices of MNIST
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Image geometry and label count come from the data itself
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    image_shape = (img_rows, img_cols, nchannels)
    nb_classes = y_train.shape[1]

    # Graph inputs
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Model, logits and training loss
    nb_filters = 64
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    sess.run(tf.global_variables_initializer())
    train(sess, loss, x, y, x_train, y_train,
          args={
              'nb_epochs': nb_epochs,
              'batch_size': batch_size,
              'learning_rate': learning_rate
          },
          rng=np.random.RandomState([2017, 8, 30]))

    # Accuracy on unperturbed test data
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args={'batch_size': batch_size})
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # results[t, s] is 1 when sample s was pushed into target class t
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # perturbations[t, s] is the fraction of features changed for that pair
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # grid_viz_data[t, c] holds the image of true class c attacked towards t
    grid_viz_data = np.zeros((nb_classes, nb_classes) + image_shape,
                             dtype='f')

    # Attack object and the parameters shared by every call
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    for sample_ind in xrange(source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind+1)]
        sample_image = np.reshape(sample, image_shape)

        # Every class other than the dataset label is a target
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # The clean image sits on the diagonal of the grid
        grid_viz_data[current_class, current_class, :, :, :] = sample_image

        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # Run JSMA towards a one-hot encoded target
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)
            adv_image = np.reshape(adv_x, image_shape)

            # Success means the model now predicts the target class
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Fraction of input features that differ from the clean image
            changed = adv_x.reshape(-1) != x_test[sample_ind].reshape(-1)
            percent_perturb = float(np.count_nonzero(changed)) / adv_x.size

            # Show clean and adversarial image next to each other
            if viz_enabled:
                figure = pair_visual(sample_image, adv_image, figure)

            # Store the adversarial image and the bookkeeping for this pair
            grid_viz_data[target, current_class, :, :, :] = adv_image
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Share of attempted targets that were reached
    nb_targets_tried = (nb_classes - 1) * source_samples
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Mean perturbation rate over the whole bookkeeping array
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Same mean with the entries of unsuccessful attempts zeroed out
    successful = results == 1
    percent_perturb_succ = np.mean(perturbations * successful)
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial: restore a CNN from FLAGS.model_path and
    attack the first FLAGS.nb_samples test images with JSMA, once for every
    class other than the true label.

    :param argv: unused
    :return: None
    """
    # Collects the accuracies computed below
    report = AccuracyReport()

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3
    image_shape = (img_rows, img_cols, channels)
    nb_classes = 10

    # Fixed graph-level seed for reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()

    set_log_level(logging.DEBUG)

    # Load CIFAR10
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Labels are expected to be one-hot over 10 classes
    assert Y_train.shape[1] == 10.

    # Graph inputs
    x = tf.placeholder(tf.float32, shape=(None,) + image_shape)
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = FLAGS.model_path
    nb_samples = FLAGS.nb_samples

    from cnn_models import make_basic_cnn
    model = make_basic_cnn('fp_',
                           input_shape=(None,) + image_shape,
                           nb_filters=FLAGS.nb_filters)
    preds = model(x)
    print("Defined TensorFlow model graph with %d parameters" % model.n_params)

    rng = np.random.RandomState([2017, 8, 30])

    # Restore the trained weights, then score the model on clean test data
    model_load(sess, model_path)
    print('Restored model from %s' % model_path)
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args={'batch_size': FLAGS.batch_size})
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # results[t, s] is 1 when sample s was pushed into target class t
    results = np.zeros((nb_classes, nb_samples), dtype='i')

    # perturbations[t, s] is the fraction of features changed for that pair
    perturbations = np.zeros((nb_classes, nb_samples), dtype='f')

    # grid_viz_data[t, c] holds the image of true class c attacked towards t
    grid_viz_data = np.zeros((nb_classes, nb_classes) + image_shape,
                             dtype='f')

    from cleverhans.attacks import SaliencyMapMethod
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'gamma': FLAGS.gamma,
        'theta': 1.,
        'symbolic_impl': True,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    for sample_ind in range(nb_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, nb_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]
        sample_image = np.reshape(sample, image_shape)

        # Every class other than the dataset label is a target
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # The clean image sits on the diagonal of the grid
        grid_viz_data[current_class, current_class, :, :, :] = sample_image

        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # Run JSMA towards a one-hot encoded target
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)
            adv_image = np.reshape(adv_x, image_shape)

            # Success means the model now predicts the target class
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Fraction of input features that differ from the clean image
            changed = adv_x.reshape(-1) != X_test[sample_ind].reshape(-1)
            percent_perturb = float(np.count_nonzero(changed)) / adv_x.size

            # Show clean and adversarial image next to each other
            if FLAGS.viz_enabled:
                figure = pair_visual(sample_image, adv_image, figure)

            # Store the adversarial image and the bookkeeping for this pair
            grid_viz_data[target, current_class, :, :, :] = adv_image
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Share of attempted targets that were reached
    nb_targets_tried = (nb_classes - 1) * nb_samples
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Mean perturbation rate over the whole bookkeeping array
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Same mean with the entries of unsuccessful attempts zeroed out
    successful = results == 1
    percent_perturb_succ = np.mean(perturbations * successful)
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)
def _chw_to_hwc(image, nchannels, img_rows, img_cols):
    """
    Convert one channels-first image to channels-last layout for plotting.

    :param image: array holding exactly nchannels * img_rows * img_cols values
                  in (channel, row, col) order, e.g. a batch of one image
    :return: array of shape (img_rows, img_cols, nchannels)
    """
    chw = np.reshape(image, (nchannels, img_rows, img_cols))
    # A plain reshape to (rows, cols, channels) would scramble the pixels;
    # the axes have to be permuted instead.
    return np.transpose(chw, (1, 2, 0))


def gtsrb_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS, holdout=HOLDOUT, data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S, lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
    Black-box attack from arxiv.org/abs/1602.02697, run on the dataset
    returned by read_gtsrb_dataset().

    :param train_start: unused (the dataset reader takes no range)
    :param train_end: unused
    :param test_start: unused
    :param test_end: unused
    :param nb_classes: ignored; overwritten with the label width of the data
    :param batch_size: batch size for training and evaluation
    :param learning_rate: learning rate for oracle and substitute training
    :param nb_epochs: number of epochs passed to prep_bbox
    :param holdout: number of leading test examples removed from the test set
    :param data_aug: passed through to train_sub
    :param nb_epochs_s: passed through to train_sub
    :param lmbda: passed through to train_sub
    :param aug_batch_size: passed through to train_sub
    :return: a dictionary with:
             * 'bbox': black-box model accuracy reported by prep_bbox
             * 'oracle on noise': black-box accuracy on Gaussian-noised
               test examples
             * 'sub': substitute model accuracy
             * 'bbox_on_sub_adv_ex': black-box model accuracy on adversarial
               examples transferred from the substitute model
    """
    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    t1 = time.time()
    x_train, y_train, x_VAL, y_VAL, x_test, y_test = read_gtsrb_dataset()
    print('Data reading time :', time.time()-t1, 'seconds')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]
    savefigfromarray(x_sub[0], filename='my2.ppm')
    y_sub = y_test[:holdout]
    print(x_sub.shape)
    print(y_sub.shape)
    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters (the data is channels-first)
    nchannels, img_rows, img_cols = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, nchannels, img_rows, img_cols))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    print("Loading the black-box model.")
    t1 = time.time()
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out
    print(bbox_preds.shape)
    print('Oracle loading time :', time.time()-t1, 'seconds')

    # Evaluate oracle on random noised test samples. A cached copy is used
    # when present; a missing, unreadable or wrongly-shaped cache is rebuilt.
    rand_y_test = y_test
    try:
        rand_x_test = np.load('rand_x_test.npy')
    except (IOError, OSError, ValueError):
        rand_x_test = None
    if rand_x_test is None or rand_x_test.shape != x_test.shape:
        rand_x_test = np.array([add_gaussian_noise(x_test[itest], std=0.1)
                                for itest in range(len(x_test))])
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, bbox_preds, rand_x_test, rand_y_test,
                     args=eval_params)
    accuracies['oracle on noise'] = acc

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    t1 = time.time()
    train_sub_out = train_sub(sess, x, y, bbox_preds, x_train, y_train,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)
    print('Substitute training time :', time.time()-t1, 'seconds')
    model_sub, preds_sub = train_sub_out
    print(preds_sub.shape)

    # Evaluate the substitute model.
    # NOTE(review): this scores the substitute on x_train/y_train although
    # the message below says "clean test" -- confirm which set is intended.
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_train, y_train,
                     args=eval_params)
    accuracies['sub'] = acc
    print('sub on clean test {0}'.format(acc))

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    t1 = time.time()
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)
    print('Adversarial example crafting time :', time.time()-t1, 'seconds')

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    # Visualize one example: feed the first test image as a batch of one
    x_adv_sub_0 = x_adv_sub.eval(session=sess, feed_dict={x: x_test[:1]})
    print('ONE EXMAPLE: shape = {0}'.format(x_adv_sub_0.shape))
    print('symbolic x_adv_sub: shape = {0}'.format(x_adv_sub.shape))
    np.save('x_adv_sub_0', x_adv_sub_0)

    ###########################################################################
    # Visualize adversarial examples as a grid of pictures.
    ###########################################################################
    source_samples = 10
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization (channels-last images)
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    fig1 = None
    fig2 = None

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]
        sample_image = _chw_to_hwc(sample, nchannels, img_rows, img_cols)

        # Each sample is paired with (up to) three other classes
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = sample_image

        # Loop over the first three target classes
        for target in target_classes[:3]:
            print('Generating adv. example for target class %i' % target)

            # FGSM is run without a target, so the same perturbation is
            # produced for every `target`; the class only selects the grid
            # and bookkeeping slot.
            adv_x = fgsm.generate_np(sample, **fgsm_par)
            adv_image = _chw_to_hwc(adv_x, nchannels, img_rows, img_cols)

            # NOTE(review): with an untargeted attack this only records
            # whether the substitute happens to predict `target`.
            res = int(model_argmax(sess, x, preds_sub, adv_x) == target)

            # Compute number of modified features
            changed = adv_x.reshape(-1) != x_test[sample_ind].reshape(-1)
            percent_perturb = float(np.count_nonzero(changed)) / adv_x.size

            # Display the original and adversarial images side-by-side,
            # reusing one figure instead of opening a new one per pair
            fig1 = pair_visual(sample_image, adv_image, fig1)

            # Add our adversarial example to our grid data
            fig2 = adv_image
            grid_viz_data[target, current_class, :, :, :] = adv_image

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    # Persist the last pair that was generated, if any.
    # np.save appends ".npy", so the array lands in "fig2.png.npy".
    if fig1 is not None:
        fig1.savefig('fig1.png')
        np.save('fig2.png', fig2)

    print('--------------------------------------')

    return accuracies
def main(argv=None):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    Restores an MNIST model from ``FLAGS.train_dir/FLAGS.filename`` if that
    file exists (otherwise trains and saves one), evaluates it on the clean
    test set, then crafts one targeted JSMA adversarial example per
    (test sample, wrong class) pair and prints success / distortion
    statistics.

    :param argv: unused by this function (presumably supplied by the app
                 runner that invokes ``main`` -- confirm against the caller)
    :return: None
    """
    # Disable Keras learning phase since we will be serving through tensorflow
    keras.layers.core.K.set_learning_phase(0)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimensions ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model if it does not exist in the train_dir folder
    saver = tf.train.Saver()
    save_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.isfile(save_path):
        saver.restore(sess, save_path)
    else:
        train_params = {
            'nb_epochs': FLAGS.nb_epochs,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess, x, y, preds, X_train, Y_train, args=train_params)
        saver.save(sess, save_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
          str(FLAGS.nb_classes-1) + ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                             dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (FLAGS.nb_classes, FLAGS.nb_classes,
                  FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Define the SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)

    # Handle of the side-by-side figure; None until the first pair is drawn,
    # after which the same figure is passed back in to be reused.
    figure = None

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, FLAGS.source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i'
              % (sample_ind + 1, FLAGS.source_samples))
        sample = X_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(FLAGS.nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample,
            (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, FLAGS.nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params = {'theta': 1., 'gamma': 0.1,
                           'nb_classes': FLAGS.nb_classes,
                           'clip_min': 0., 'clip_max': 1.,
                           'targets': y, 'y_val': one_hot_target}
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (FLAGS.img_rows, FLAGS.img_cols)),
                    np.reshape(adv_x, (FLAGS.img_rows, FLAGS.img_cols)),
                    figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x,
                (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only.
    # Average over the successful cells themselves: taking the mean of the
    # masked array would also divide by failed / never-attempted cells and
    # under-report the distortion.
    succ_mask = results == 1
    nb_succ = int(np.sum(succ_mask))
    if nb_succ > 0:
        percent_perturb_succ = np.sum(perturbations[succ_mask]) / nb_succ
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        _ = grid_visual(grid_viz_data)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    Trains a basic CNN on MNIST, evaluates it on clean test data, then crafts
    one targeted JSMA adversarial example per (test sample, wrong class) pair
    and prints success / distortion statistics.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    # Handle of the side-by-side figure, reused across pair_visual calls
    figure = None

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, channels)),
                    np.reshape(adv_x, (img_rows, img_cols, channels)),
                    figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only.
    # Average over the successful cells themselves: taking the mean of the
    # masked array would also divide by failed / never-attempted cells and
    # under-report the distortion.
    succ_mask = results == 1
    nb_succ = int(np.sum(succ_mask))
    if nb_succ > 0:
        percent_perturb_succ = np.sum(perturbations[succ_mask]) / nb_succ
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def cifar10_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                          test_end=10000, viz_enabled=VIZ_ENABLED,
                          nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                          source_samples=SOURCE_SAMPLES,
                          learning_rate=LEARNING_RATE,
                          model_path=MODEL_PATH,
                          noise_output=NOISE_OUTPUT):
    """
    CIFAR10 tutorial for the Jacobian-based saliency map approach (JSMA)

    Trains an all-convolutional model on CIFAR10, evaluates it on clean test
    data, crafts one targeted JSMA adversarial example per (test sample,
    wrong class) pair, prints success / distortion statistics and, when
    visualization is enabled, saves a one-row image grid under ``output/``.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) save a grid image of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param model_path: path whose final component is passed to training as
                       the ``filename`` argument
    :param noise_output: (boolean) if true the grid shows the perturbation
                         (adversarial minus original) instead of the
                         adversarial image
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get CIFAR10 test data
    cifar10 = CIFAR10(train_start=train_start, train_end=train_end,
                      test_start=test_start, test_end=test_end)
    x_train, y_train = cifar10.get_set('train')
    x_test, y_test = cifar10.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64

    # Define TF model graph; the input shape follows the loaded data so it
    # always agrees with the placeholder above
    model = ModelAllConvolutional('model1', nb_classes, nb_filters,
                                  input_shape=[img_rows, img_cols, nchannels])
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization (one row, one cell per class)
    grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    # Per-class buffers: each slot keeps the most recent adversarial example
    # and clean sample seen for that true class
    adv_all = np.zeros((nb_classes, img_rows, img_cols, nchannels), dtype='f')
    sample_all = np.zeros((nb_classes, img_rows, img_cols, nchannels),
                          dtype='f')

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)
            adv_all[current_class] = adv_x
            sample_all[current_class] = sample

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only.
    # Average over the successful cells themselves: taking the mean of the
    # masked array would also divide by failed / never-attempted cells and
    # under-report the distortion.
    succ_mask = results == 1
    nb_succ = int(np.sum(succ_mask))
    if nb_succ > 0:
        percent_perturb_succ = np.sum(perturbations[succ_mask]) / nb_succ
    else:
        percent_perturb_succ = 0.
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Compute the average L_2 norm of the stored per-class perturbations
    l2_norm = np.mean(np.sum((adv_all - sample_all)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(l2_norm))

    # Fill the single-row grid with either the noise or the adversarial image
    for i in range(nb_classes):
        if noise_output:
            image = adv_all[i] - sample_all[i]
        else:
            image = adv_all[i]
        grid_viz_data[i, 0] = image

    # Close TF session
    sess.close()

    def save_visual(data, path):
        """
        Modified version of cleverhans.plot.pyplot

        Lays out ``data[col, row]`` images on a grid and writes the figure to
        ``path`` instead of displaying it.
        """
        import matplotlib.pyplot as plt

        figure = plt.figure()

        # Add the images to the plot
        num_cols = data.shape[0]
        num_rows = data.shape[1]
        num_channels = data.shape[4]
        for row in range(num_rows):
            for col in range(num_cols):
                figure.add_subplot(num_rows, num_cols,
                                   (col + 1) + (row * num_cols))
                plt.axis('off')

                if num_channels == 1:
                    plt.imshow(data[col, row, :, :, 0], cmap='gray')
                else:
                    plt.imshow(data[col, row, :, :, :])

        # Write the plot to disk, then release the figure
        plt.savefig(path)
        plt.close(figure)

    # Finally, save a grid of all the adversarial examples
    if viz_enabled:
        if noise_output:
            image_name = "output/jsma_cifar10_noise.png"
        else:
            image_name = "output/jsma_cifar10.png"

        # savefig does not create missing directories
        out_dir = os.path.dirname(image_name)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        _ = save_visual(grid_viz_data, image_name)

    return report