def fgm(self, x, labels, targeted=False):
    """
    TensorFlow Eager implementation of the Fast Gradient Method.

    The cross-entropy loss of ``self.model`` is differentiated with respect
    to ``x``; ``attacks.optimize_linear`` turns that gradient into a
    perturbation using ``self.eps`` and ``self.ord``.

    :param x: the input variable
    :param labels: the labels passed to the cross-entropy loss as ``y``
    :param targeted: Is the attack targeted or untargeted? Untargeted, the
                     default, will try to make the label incorrect.
                     Targeted negates the loss so the step moves towards
                     ``labels`` instead.
    :return: a tensor for the adversarial example
    """
    with tf.GradientTape() as tape:
        # The input has to be watched explicitly: it may be a combination
        # of trainable and non-trainable variables.
        tape.watch(x)
        xent = LossCrossEntropy(self.model, smoothing=0.0)
        objective = xent.fprop(x=x, y=labels)
        if targeted:
            objective = -objective

    # Gradient of the (possibly negated) loss with respect to the input
    input_grad = tape.gradient(objective, x)
    perturbation = attacks.optimize_linear(input_grad, self.eps, self.ord)

    # Adversarial example = original example + perturbation
    adv_x = x + perturbation

    # Clipping is applied only when both bounds are configured
    if self.clip_min is None or self.clip_max is None:
        return adv_x
    return tf.clip_by_value(adv_x, self.clip_min, self.clip_max)
def test_xe(self):
    """Cross-entropy without smoothing is reproducible across two runs."""
    expected = [2.210599660, 1.53666997]
    xent = LossCrossEntropy(self.model, smoothing=0.)
    loss_op = xent.fprop(self.x, self.y)
    with tf.Session() as sess:
        # Evaluate the same op twice with identical feeds
        first = sess.run(loss_op, feed_dict={self.x: self.vx, self.y: self.vy})
        second = sess.run(loss_op,
                          feed_dict={self.x: self.vx, self.y: self.vy})
    self.assertClose(first, expected, atol=1e-6)
    self.assertClose(second, expected, atol=1e-6)
def test_xe_smoothing(self):
    """Cross-entropy with smoothing=0.1 is reproducible across two runs."""
    expected = [2.10587597, 1.47194624]
    xent = LossCrossEntropy(self.model, smoothing=0.1)
    loss_op = xent.fprop(self.x, self.y)
    with tf.Session() as sess:
        # Evaluate the same op twice with identical feeds
        first = sess.run(loss_op, feed_dict={self.x: self.vx, self.y: self.vy})
        second = sess.run(loss_op,
                          feed_dict={self.x: self.vx, self.y: self.vy})
    self.assertClose(first, expected, atol=1e-6)
    self.assertClose(second, expected, atol=1e-6)
def fgm(self, x, labels, targeted=False):
    """
    TensorFlow Eager implementation of the Fast Gradient Method.

    :param x: the input variable
    :param labels: the labels passed to the cross-entropy loss as ``y``
    :param targeted: Is the attack targeted or untargeted? Untargeted, the
                     default, will try to make the label incorrect.
                     Targeted will instead try to move in the direction
                     of being more like y.
    :return: a tensor for the adversarial example
    :raises NotImplementedError: if ``self.ord`` is not ``np.inf``, 1 or 2
    """
    # Compute loss
    with tf.GradientTape() as tape:
        # The tape only records variables automatically. Without an
        # explicit watch, the gradient with respect to a plain input
        # tensor would come back as None.
        tape.watch(x)
        loss_obj = LossCrossEntropy(self.model, smoothing=0.)
        loss = loss_obj.fprop(x=x, y=labels)
        if targeted:
            loss = -loss

    # Define gradient of loss wrt input
    grad = tape.gradient(loss, x)

    if self.ord == np.inf:
        # Take sign of gradient
        normalized_grad = tf.sign(grad)
        # The following line should not change the numerical results.
        # It applies only because `normalized_grad` is the output of
        # a `sign` op, which has zero derivative anyway.
        # It should not be applied for the other norms, where the
        # perturbation has a non-zero derivative.
        normalized_grad = tf.stop_gradient(normalized_grad)
    elif self.ord == 1:
        # Reduce over every axis except the first one
        red_ind = list(range(1, len(x.get_shape())))
        normalized_grad = grad / tf.reduce_sum(
            tf.abs(grad), reduction_indices=red_ind, keep_dims=True)
    elif self.ord == 2:
        red_ind = list(range(1, len(x.get_shape())))
        square = tf.reduce_sum(
            tf.square(grad), reduction_indices=red_ind, keep_dims=True)
        normalized_grad = grad / tf.sqrt(square)
    else:
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    # Multiply by constant epsilon
    scaled_grad = self.eps * normalized_grad

    # Add perturbation to original example to obtain adversarial example
    adv_x = x + scaled_grad

    # If clipping is needed
    # reset all values outside of [clip_min, clip_max]
    if (self.clip_min is not None) and (self.clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)
    return adv_x
def prep_bbox(sess, x, y, X_train, Y_train, X_test, Y_test,
              nb_epochs, batch_size, learning_rate,
              rng, nb_classes=10, img_rows=28, img_cols=28, nchannels=1):
    """
    Build and train the model that plays the role of the "remote"
    black-box oracle described in the original paper.

    :param sess: the TF session
    :param x: the input placeholder for MNIST
    :param y: the output placeholder for MNIST
    :param X_train: the training data for the oracle
    :param Y_train: the training labels for the oracle
    :param X_test: the testing data for the oracle
    :param Y_test: the testing labels for the oracle
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param rng: numpy.random.RandomState
    :param nb_classes: number of classes handed to the model constructor
    :param img_rows: not referenced in this function body
    :param img_cols: not referenced in this function body
    :param nchannels: not referenced in this function body
    :return: tuple ``(model, predictions, accuracy)``
    """
    # Black-box model graph: a basic CNN with 64 filters
    oracle = ModelBasicCNN('model1', nb_classes, 64)
    oracle_loss = LossCrossEntropy(oracle, smoothing=0.1)
    logits = oracle.get_logits(x)
    print("Defined TensorFlow model graph.")

    # Train the oracle on the provided data
    train(sess, oracle_loss, x, y, X_train, Y_train,
          args=dict(nb_epochs=nb_epochs,
                    batch_size=batch_size,
                    learning_rate=learning_rate),
          rng=rng)

    # Report accuracy on legitimate test data
    test_accuracy = model_eval(sess, x, y, logits, X_test, Y_test,
                               args=dict(batch_size=batch_size))
    print('Test accuracy of black-box on legitimate test examples: '
          + str(test_accuracy))

    return oracle, logits, test_accuracy
def train_mnist_cnn(datadir, train_start, train_end, test_start, test_end,
                    num_epochs, batch_size, learning_rate):
    """
    Train an MNIST CNN, save a checkpoint and print its test accuracy.

    The checkpoint is written to ``./runs/ckpt/mnist_cnn_attacked.ckpt``.

    :param datadir: forwarded to ``data_mnist`` as ``datadir``
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param num_epochs: number of epochs to train the model
    :param batch_size: batch size used for both training and evaluation
    :param learning_rate: learning rate for training
    :return: None
    """
    X_train, Y_train, X_test, Y_test = data_mnist(datadir=datadir,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    # Always release the session, even if training, saving or evaluation
    # raises part-way through.
    try:
        sess.run(tf.global_variables_initializer())

        # Training and evaluating params.
        train_params = {"nb_epochs": num_epochs,
                        "batch_size": batch_size,
                        "learning_rate": learning_rate}
        eval_params = {"batch_size": batch_size}

        # Define the model.
        Model = mnist_cnn_model(input_shape=(None,) + X_train.shape[1:])
        loss = LossCrossEntropy(Model, smoothing=0.1)
        saver = tf.train.Saver(max_to_keep=1)

        x = tf.placeholder(tf.float32, shape=(None,) + X_train.shape[1:])
        y = tf.placeholder(tf.float32, shape=(None,) + Y_train.shape[1:])
        preds_x = Model.get_probs(x)

        train(sess, loss, x, y, X_train, Y_train, args=train_params)
        saver.save(sess, "./runs/ckpt/mnist_cnn_attacked.ckpt")

        test_accuracy = model_eval(sess, x, y, preds_x, X_test, Y_test,
                                   args=eval_params)
        print("Test accuracy: %0.4f" % test_accuracy)
    finally:
        sess.close()
def reset(self):
    """(Re)build placeholders, model, logits and loss under ``self.scope``.

    The variable scope is opened with ``reuse=tf.AUTO_REUSE``; the current
    default session is stored on ``self.sess``.
    """
    nb_classes = 10
    nb_filters = 64
    image_shape = (None, 28, 28, 1)

    with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
        self.sess = tf.get_default_session()

        # Input and label placeholders
        self.x = tf.placeholder(tf.float32, shape=image_shape)
        self.y = tf.placeholder(tf.float32, shape=(None, nb_classes))

        self.model = ModelBasicCNN('model1', nb_classes, nb_filters)
        self.preds = self.model.get_logits(self.x)
        self.loss = LossCrossEntropy(self.model, smoothing=0.1)
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64, num_threads=None,
                   label_smoothing=0.1):
    """
    MNIST cleverhans tutorial

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :param nb_filters: not referenced in this function body
    :param num_threads: if truthy, the session is configured with
                        ``intra_op_parallelism_threads=1`` (the numeric
                        value itself is not used)
    :param label_smoothing: float, amount of label smoothing for cross
                            entropy
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session. This is the only session of the tutorial: it must
    # not be replaced later, otherwise the thread configuration is lost and
    # the configured session is leaked.
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {
        'eps': 0.3,
        'clip_min': 0.,
        'clip_max': 1.
    }
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        # Evaluate `preds` on the given set, store the accuracy on the
        # report under `report_key`, and print it when `is_adv` is given.
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = make_basic_picklable_cnn()
        preds = model.get_logits(x)
        assert len(model.get_params()) > 0
        loss = LossCrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        with sess.as_default():
            save("clean_model.joblib", model)
            # Now that the model has been saved, you can evaluate it in a
            # separate process using `evaluate_pickled_model.py`.
            # You should get exactly the same result for both clean and
            # adversarial accuracy as you get within this program.

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train,
                    'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = make_basic_picklable_cnn()
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = LossCrossEntropy(model2, smoothing=label_smoothing,
                             attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial
        # examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess, loss2, x, y, x_train, y_train, evaluate=evaluate2,
          args=train_params, rng=rng, var_list=model2.get_params())

    with sess.as_default():
        save("adv_model.joblib", model2)
        # Now that the model has been saved, you can evaluate it in a
        # separate process using `evaluate_pickled_model.py`.
        # You should get exactly the same result for both clean and
        # adversarial accuracy as you get within this program.

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples;
                        when set, ``source_samples`` must equal the number
                        of classes
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: passed to the attack as ``max_iterations``
    :param model_path: path to the model file
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Fixed TF seed to improve reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Load MNIST and derive the image geometry from the data itself
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Input placeholders
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Model graph
    nb_filters = 64
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # The checkpoint location is derived from model_path
    model_dir, model_file = os.path.split(model_path)
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': model_dir,
        'filename': model_file
    }

    rng = np.random.RandomState([2017, 8, 30])
    # Reuse a previously trained model when its checkpoint exists
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x, y, x_train, y_train, args=train_params,
              save=os.path.exists("models"), rng=rng)

    # Accuracy of the model on legitimate test examples
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args={'batch_size': batch_size})
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    # Pick the seed examples: for visualization, the first test example of
    # each class; otherwise simply the first `source_samples` test examples.
    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [np.where(np.argmax(y_test, axis=1) == i)[0][0]
                for i in range(nb_classes)]
        seed_inputs = x_test[idxs]
        seed_labels = y_test[idxs]
        # Grid for visualization: one column per target class when
        # targeted, otherwise an (original, adversarial) pair per class.
        grid_cols = nb_classes if targeted else 2
        grid_viz_data = np.zeros(
            (nb_classes, grid_cols, img_rows, img_cols, nchannels),
            dtype='f')
    else:
        seed_inputs = x_test[:source_samples]
        seed_labels = y_test[:source_samples]

    if targeted:
        # Every seed is repeated once per class and paired with the
        # one-hot encoding of that class as the target.
        repeated = np.array([[instance] * nb_classes
                             for instance in seed_inputs],
                            dtype=np.float32)
        adv_inputs = repeated.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        adv_inputs = seed_inputs
        adv_ys = None
        yname = "y"

    if targeted:
        attack_batch_size = source_samples * nb_classes
    else:
        attack_batch_size = source_samples
    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size': attack_batch_size,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        # Success = classified as the requested target
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=eval_params)
    else:
        # Success = no longer classified as the true label
        adv_accuracy = 1 - model_eval(sess, x, y, preds, adv, seed_labels,
                                      args=eval_params)

    if viz_enabled:
        if targeted:
            for row in range(nb_classes):
                for col in range(nb_classes):
                    grid_viz_data[row, col] = adv[row * nb_classes + col]
        else:
            for k in range(nb_classes):
                grid_viz_data[k, 0] = adv_inputs[k]
                grid_viz_data[k, 1] = adv[k]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Rate of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Average L2 distortion introduced by the algorithm
    squared_diff = (adv - adv_inputs)**2
    l2_norms = np.sum(squared_diff, axis=(1, 2, 3))**.5
    percent_perturbed = np.mean(l2_norms)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report
def train_cifar10_classifier(model_name, nb_epochs, data_augmentation=False):
    """
    Train a CIFAR-10 classifier and save it under ``../tfmodels``.

    :param model_name: ``'simple'`` or ``'resnet'``; selects the model
                       builder and the training/evaluation batch sizes
    :param nb_epochs: number of epochs to train the model
    :param data_augmentation: if True, train on batches drawn from the
                              augmenting data generator and refresh the
                              training arrays in place after every
                              evaluation
    :return: None
    :raises ValueError: if ``model_name`` is not a supported model
    """
    # Fail fast on an unsupported model instead of hitting an unbound
    # `model` after the data has been loaded and the graph partly built.
    if model_name not in ('simple', 'resnet'):
        raise ValueError("Unknown model_name %r; expected 'simple' or "
                         "'resnet'." % (model_name,))

    rng = np.random.RandomState([2018, 8, 7])

    if data_augmentation:
        datagen, (x_train, y_train), (x_test, y_test) = \
            load_cifar10(augmented=True)
        x_t = x_train.copy()
        y_t = y_train.copy()
        datagen.fit(x_t)
        dataflow = datagen.flow(x_t, y_t, batch_size=50000)
        x_train, y_train = dataflow.next()
    else:
        (x_train, y_train), (x_test, y_test) = load_cifar10()

    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    keep_prob = tf.placeholder(tf.float32, ())
    is_training = tf.placeholder(tf.bool, ())

    if model_name == 'simple':
        model = make_simple_cnn(keep_prob=keep_prob)
        train_params = {
            'nb_epochs': nb_epochs,
            'batch_size': 128,
            'learning_rate': 1e-3}
        eval_params = {'batch_size': 128}
    else:  # model_name == 'resnet', validated above
        model = make_resnet(is_training=is_training, depth=32)
        train_params = {
            'nb_epochs': nb_epochs,
            'batch_size': 32,
            'learning_rate': 3e-4}
        eval_params = {'batch_size': 32}

    # assert len(model.get_params()) == len(tf.trainable_variables())

    preds = model.get_probs(x)
    loss = LossCrossEntropy(model, 0)

    def evaluate():
        # `sess` is bound below, before `train` first invokes this callback
        acc = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params,
                         feed={keep_prob: 1.0, is_training: False})
        print('Test accuracy on legitimate examples: %0.4f' % acc)
        if data_augmentation:
            # Overwrite the training arrays in place with a fresh
            # augmented batch.
            x_aug, y_aug = dataflow.next()
            x_train[...] = x_aug
            y_train[...] = y_aug

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
          args=train_params, feed={keep_prob: 0.5, is_training: True},
          rng=rng, var_list=model.get_params())

    savedir = '../tfmodels'
    if not os.path.isdir(savedir):
        os.makedirs(savedir)
    saver = tf.train.Saver(var_list=tf.global_variables())
    model_savename = 'cifar10_%s_model_epoch%d' % (model_name, nb_epochs)
    if data_augmentation:
        model_savename += '_aug'
    saver.save(sess, os.path.join(savedir, model_savename))
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = ModelBasicCNN('model1', 10, 64)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test,
                          args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target
        # class (i.e. all classes that differ from the label given in the
        # dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the
        # diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(
                adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x_reshape.shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, channels)),
                    np.reshape(adv_x, (img_rows, img_cols, channels)),
                    figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm.
    # Only (target, sample) pairs that were actually attacked count: the
    # cells where target == true class are never filled and stay at zero,
    # so a plain mean over the whole array would understate the rate.
    percent_perturbed = float(np.sum(perturbations)) / nb_targets_tried
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only.
    # The mean is taken over the successful attacks themselves, not over
    # every cell with failures zeroed out.
    succeeded = results == 1
    if succeeded.any():
        percent_perturb_succ = float(np.mean(perturbations[succeeded]))
    else:
        percent_perturb_succ = 0.0
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="train_dir",
                   filename="mnist.ckpt", load_model=False,
                   testing=False, label_smoothing=0.1):
    """
    MNIST CleverHans tutorial (Keras model)

    Trains (or restores) a Keras CNN, evaluates it on clean and FGSM
    examples, then repeats the process with adversarial training.

    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, accuracies on the training set are
                    recorded on the report as well
    :param label_smoothing: float, amount of label smoothing for cross
                            entropy
    :return: an AccuracyReport object
    :raises RuntimeError: if ``backend`` has no ``tf`` attribute
    """
    keras.layers.core.K.set_learning_phase(0)

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Fixed TF seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    # One TF session, shared with the Keras backend
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Load MNIST and derive the image geometry from the data itself
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Input placeholders
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    def accuracy_of(predictions, inputs, targets):
        # Accuracy of `predictions` on the given set, using the tutorial's
        # batch size.
        return model_eval(sess, x, y, predictions, inputs, targets,
                          args={'batch_size': batch_size})

    # Model graph
    model = cnn_model(img_rows=img_rows, img_cols=img_cols,
                      channels=nchannels, nb_filters=64,
                      nb_classes=nb_classes)
    preds = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Accuracy of the MNIST model on legitimate test examples
        clean_acc = accuracy_of(preds, x_test, y_test)
        report.clean_train_clean_eval = clean_acc
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc)

    # Training configuration
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    rng = np.random.RandomState([2017, 8, 30])
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)

    ckpt = tf.train.get_checkpoint_state(train_dir)
    print(train_dir, ckpt)
    ckpt_path = ckpt.model_checkpoint_path if ckpt is not None else False
    wrap = KerasModelWrapper(model)

    if load_model and ckpt_path:
        # Restore the previously trained model instead of training
        saver = tf.train.Saver()
        print(ckpt_path)
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
        evaluate()
    else:
        print("Model was not loaded, training from scratch.")
        loss = LossCrossEntropy(wrap, smoothing=label_smoothing)
        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, save=True, rng=rng)

    # Training-set accuracy of the clean model
    if testing:
        report.train_clean_train_clean_eval = accuracy_of(
            preds, x_train, y_train)

    # Fast Gradient Sign Method (FGSM) attack object and graph
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    # The attack is treated as a constant: no gradient flows through it
    adv_x = tf.stop_gradient(fgsm.generate(x, **fgsm_params))
    preds_adv = model(adv_x)

    # Accuracy of the MNIST model on adversarial examples
    adv_acc = accuracy_of(preds_adv, x_test, y_test)
    print('Test accuracy on adversarial examples: %0.4f\n' % adv_acc)
    report.clean_train_adv_eval = adv_acc

    # Training-set accuracy on adversarial examples
    if testing:
        report.train_clean_train_adv_eval = accuracy_of(
            preds_adv, x_train, y_train)

    print("Repeating the process, using adversarial training")

    # Second model graph, trained adversarially
    model_2 = cnn_model(img_rows=img_rows, img_cols=img_cols,
                        channels=nchannels, nb_filters=64,
                        nb_classes=nb_classes)
    wrap_2 = KerasModelWrapper(model_2)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(wrap_2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    preds_2_adv = model_2(attack(x))
    loss_2 = LossCrossEntropy(wrap_2, smoothing=label_smoothing,
                              attack=attack)

    def evaluate_2():
        # Adversarially trained model on legitimate test inputs
        clean_acc_2 = accuracy_of(preds_2, x_test, y_test)
        print('Test accuracy on legitimate examples: %0.4f' % clean_acc_2)
        report.adv_train_clean_eval = clean_acc_2

        # Adversarially trained model on adversarial examples
        adv_acc_2 = accuracy_of(preds_2_adv, x_test, y_test)
        print('Test accuracy on adversarial examples: %0.4f' % adv_acc_2)
        report.adv_train_adv_eval = adv_acc_2

    # Perform and evaluate adversarial training
    train(sess, loss_2, x, y, x_train, y_train, evaluate=evaluate_2,
          args=train_params, save=False, rng=rng)

    # Training-set accuracies of the adversarially trained model
    if testing:
        report.train_adv_train_clean_eval = accuracy_of(
            preds_2, x_train, y_train)
        report.train_adv_train_adv_eval = accuracy_of(
            preds_2_adv, x_train, y_train)

    return report
def train_sub(sess, x, y, bbox_preds, X_sub, Y_sub, nb_classes, nb_epochs_s,
              batch_size, learning_rate, data_aug, lmbda, aug_batch_size, rng,
              img_rows=28, img_cols=28, nchannels=1):
    """
    Build a substitute model by alternating between fitting it on the
    current substitute dataset and enlarging that dataset through
    Jacobian-based augmentation labelled by the black-box model.
    :param sess: TF session
    :param x: input TF placeholder
    :param y: output TF placeholder
    :param bbox_preds: output of black-box model predictions
    :param X_sub: initial substitute training data
    :param Y_sub: initial substitute training labels
    :param nb_classes: number of output classes
    :param nb_epochs_s: number of epochs to train substitute model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param data_aug: number of times substitute training data is augmented
    :param lmbda: lambda from arxiv.org/abs/1602.02697
    :param aug_batch_size: batch size handed to `jacobian_augmentation`
    :param rng: numpy.random.RandomState instance
    :param img_rows: accepted but not read by this function
    :param img_cols: accepted but not read by this function
    :param nchannels: accepted but not read by this function
    :return: tuple `(model, logits)` - the substitute model and the tensor
             returned by its `get_logits(x)`
    """
    # Graph for the substitute model and its training loss
    substitute = ModelSubstitute('model_s', nb_classes)
    sub_logits = substitute.get_logits(x)
    sub_loss = LossCrossEntropy(substitute, smoothing=0)
    print("Defined TensorFlow model graph for the substitute.")

    # Symbolic Jacobian of the substitute logits with respect to the input
    jacobian = jacobian_graph(sub_logits, x, nb_classes)

    final_round = data_aug - 1
    for aug_round in xrange(data_aug):
        print("Substitute training epoch #" + str(aug_round))
        fit_args = {
            'nb_epochs': nb_epochs_s,
            'batch_size': batch_size,
            'learning_rate': learning_rate
        }
        # Keep the training helper quiet while the substitute is fitted
        with TemporaryLogLevel(logging.WARNING, "cleverhans.utils.tf"):
            train(sess, sub_loss, x, y, X_sub,
                  to_categorical(Y_sub, nb_classes),
                  init_all=False, args=fit_args, rng=rng,
                  var_list=substitute.get_params())

        # Nothing left to augment once the last round has been trained
        if aug_round >= final_round:
            continue

        print("Augmenting substitute training data.")
        # The step direction is -1 for the first three rounds, +1 afterwards
        step_sign = 1 if aug_round >= 3 else -1
        X_sub = jacobian_augmentation(sess, x, X_sub, Y_sub, jacobian,
                                      step_sign * lmbda, aug_batch_size)

        print("Labeling substitute training data.")
        # Duplicate the labels so the array matches the grown dataset; the
        # second half is overwritten with black-box labels just below
        Y_sub = np.hstack([Y_sub, Y_sub])
        half = len(X_sub) // 2
        fresh_points = X_sub[half:]
        oracle_out = batch_eval(sess, [x], [bbox_preds], [fresh_points],
                                args={'batch_size': batch_size})[0]
        # Only the argmax is kept: the adversary sees labels, not the
        # probabilities output by the black-box model
        Y_sub[half:] = np.argmax(oracle_out, axis=1)

    return substitute, sub_logits
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64,
                   num_threads=None, label_smoothing=0.1):
    """
    MNIST cleverhans tutorial extended with an ensemble of small "colour"
    (pixel re-drawing) models that are trained or loaded before the MNIST
    classifier is trained or loaded and evaluated against FGSM.
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, build, train (or load) and evaluate the
                        classifier on clean examples
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: accepted but not read by this function
    :param nb_filters: number of filters passed to ModelBasicCNN
    :param num_threads: only used to build `config_args`, which is
                        currently not passed to the session
    :param label_smoothing: float, amount of label smoothing for cross
                            entropy
    :return: an AccuracyReport object
    """
    # NOTE(review): this function reads several names it does not define
    # (`file`, `numColorOutput`, `n_input`, `save_dir`, `filename`,
    # `save_dir2`, `filename2`, `model_path2`, `my_tf_round`) - presumably
    # module-level globals; confirm against the rest of the file.

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    # NOTE(review): config_args is built but the session below does not use
    # it, so num_threads currently has no effect.
    sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 1}))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(file,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    rng = np.random.RandomState([2017, 8, 30])

    ################ color training initialization ####################
    # Overwritten per model with a random epoch count when training below
    color_training_epochs = 5000
    color_learning_rate = 0.1
    # Overlapping intensity intervals, one per colour class
    colorCategory = [
        [0.0, 0.4],  # Black
        [0.3, 0.7],  # Grey
        [0.6, 1.0]  # White
    ]

    numOfPRModel = 20
    minColorEpoch = 300
    maxColorEpoch = 3000

    numColorInput = 1
    # NOTE(review): the local definition `numColorOutput = len(colorCategory)`
    # is commented out in the original, so numColorOutput must come from
    # module scope - confirm it equals len(colorCategory).

    # One scalar intensity per row in, one-hot colour class per row out
    color_x = tf.placeholder(tf.float32, [None, numColorInput])
    color_y = tf.placeholder(tf.float32, [None, numColorOutput])

    # Per-model graph pieces, keyed by "<prefix><model index>"
    color_W = {}
    color_b = {}
    color_pred_out = {}
    color_cost = {}
    color_optimizer = {}
    color_argmax = {}
    color_correct_prediction = {}
    color_accuracy = {}

    # The op creation order in this loop is kept as-is: with a graph-level
    # seed set, reordering ops would change the random initial values.
    for i in range(numOfPRModel):
        color_W["w" + str(i)] = tf.Variable(
            tf.random_normal([numColorInput, numColorOutput]))
        color_b["b" + str(i)] = tf.Variable(
            tf.random_normal([numColorOutput]))
        # Linear logits for the softmax classifier
        color_pred_out["out" + str(i)] = tf.matmul(
            color_x, color_W["w" + str(i)]) + color_b["b" + str(i)]
        color_cost["cost" + str(i)] = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=color_pred_out["out" + str(i)], labels=color_y))
        # Gradient Descent
        color_optimizer["opt" + str(i)] = tf.train.GradientDescentOptimizer(
            color_learning_rate).minimize(color_cost["cost" + str(i)])

        # Test model
        color_argmax["argmax" + str(i)] = tf.argmax(
            color_pred_out["out" + str(i)], 1)
        color_correct_prediction["pred" + str(i)] = tf.equal(
            tf.argmax(color_pred_out["out" + str(i)], 1),
            tf.argmax(color_y, 1))
        # Calculate accuracy
        color_accuracy["acc" + str(i)] = tf.reduce_mean(
            tf.cast(color_correct_prediction["pred" + str(i)], tf.float32))

    # Graph for re-generating the original image into a new image by using
    # the trained color model
    pr_model_x = tf.placeholder(tf.float32, [None, n_input, numColorInput])
    pr_model_W = tf.placeholder(tf.float32,
                                [None, numColorInput, numColorOutput])
    pr_model_b = tf.placeholder(tf.float32,
                                [None, numColorInput, numColorOutput])
    # One-hot colour class chosen for every pixel
    pr_model_output = tf.one_hot(
        tf.argmax((tf.matmul(pr_model_x, pr_model_W) + pr_model_b), 2),
        numColorOutput)

    # Merge the random generated output for new image based on the
    # colorCategory
    randomColorCategory = []
    for i in range(len(colorCategory)):
        tmp = []
        tmpRandomColorCategory = my_tf_round(
            tf.random_uniform(tf.shape(pr_model_x),
                              colorCategory[i][0],
                              colorCategory[i][1],
                              dtype=tf.float32), 2)
        tmp.append(tmpRandomColorCategory)
        randomColorCategory.append(tf.concat(tmp, 1))
    random_merge = tf.reshape(tf.concat(randomColorCategory, -1),
                              [-1, n_input, numColorOutput])
    # Keep, per pixel, only the random value of the selected colour class
    random_color_set = tf.reduce_sum(
        tf.multiply(pr_model_output, random_merge), 2)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    print(random_color_set)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': save_dir,
        'filename': filename,
        'numColorOutput': numColorOutput
    }
    eval_params = {'batch_size': batch_size, 'numColorOutput': numColorOutput}
    # NOTE(review): 8 / 256 is 0 under Python 2 integer division unless the
    # module imports `division` from __future__ - confirm.
    fgsm_params = {'eps': 8 / 256, 'clip_min': 0., 'clip_max': 1.}

    def do_eval(preds, x_set, y_set, report_key, is_adv=None, pred2=None,
                c_w=None, c_b=None, pr_model_x=None, random_color_set=None,
                pr_model_W=None, pr_model_b=None, pr_model_output=None,
                ae=None):
        """Run model_eval, store the accuracy on `report` under
        `report_key`, and print it when `is_adv` is True or False."""
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params,
                         pred2=pred2, c_w=c_w, c_b=c_b,
                         pr_model_x=pr_model_x,
                         random_color_set=random_color_set,
                         pr_model_W=pr_model_W, pr_model_b=pr_model_b,
                         pr_model_output=pr_model_output,
                         is_adv=is_adv, ae=ae)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    with sess.as_default():
        if hasattr(tf, "global_variables_initializer"):
            tf.global_variables_initializer().run()
        else:
            warnings.warn("Update your copy of tensorflow; future versions of "
                          "CleverHans may drop support for this version.")
            sess.run(tf.initialize_all_variables())

        ################# color training ####################
        print("Trying to load pr model from: " + model_path2)
        if os.path.exists(model_path2 + ".meta"):
            tf_model_load(sess, model_path2)
            # Fetch every model's weights/biases as numpy values, keyed
            # like the color_W / color_b dicts
            c_w, c_b = sess.run([color_W, color_b])
            print("Load color trained model in training")
        else:
            # Training the PR model
            c_w = {}
            c_b = {}
            for modelcount in range(numOfPRModel):
                # Each model is trained for its own random number of epochs
                color_training_epochs = np.random.randint(
                    minColorEpoch, maxColorEpoch)
                for epoch in range(color_training_epochs):
                    # Training batch: 100 random intensities, each labelled
                    # with one of the categories whose interval contains it
                    outputColorY = []
                    p1 = np.random.random(100)
                    for i in range(len(p1)):
                        outputOverlapColorY = []
                        for j in range(len(colorCategory)):
                            if p1[i] >= colorCategory[j][0] and p1[
                                    i] <= colorCategory[j][1]:
                                colorIndexSeq = []
                                for k in range(len(colorCategory)):
                                    if j == k:
                                        colorIndexSeq.append(1)
                                    else:
                                        colorIndexSeq.append(0)
                                outputOverlapColorY.append(colorIndexSeq)

                        # Randomly choose the output for color Y if the
                        # outputOverlapColorY has more than 1 item
                        outputColorY.append(
                            outputOverlapColorY[np.random.randint(
                                0, len(outputOverlapColorY))])

                    inputColorX = p1.reshape(100, 1)
                    _, c, _c_w, _c_b = sess.run([
                        color_optimizer["opt" + str(modelcount)],
                        color_cost["cost" + str(modelcount)],
                        color_W["w" + str(modelcount)],
                        color_b["b" + str(modelcount)]
                    ], feed_dict={
                        color_x: inputColorX,
                        color_y: outputColorY
                    })
                    avg_cost = c

                    # Evaluating color model
                    outputColorY = []
                    p1 = np.random.random(100)
                    # Generate output for random color inputs (test case).
                    # The list is reset for every category and the loop
                    # stops at the first match, so the first matching
                    # category is always the label here.
                    for i in range(len(p1)):
                        for j in range(len(colorCategory)):
                            outputOverlapColorY = []
                            if p1[i] >= colorCategory[j][0] and p1[
                                    i] <= colorCategory[j][1]:
                                colorIndexSeq = []
                                for k in range(len(colorCategory)):
                                    if j == k:
                                        colorIndexSeq.append(1)
                                    else:
                                        colorIndexSeq.append(0)
                                outputOverlapColorY.append(colorIndexSeq)
                                break

                        # Kept as a random draw so the np.random call
                        # sequence matches the training half of the loop
                        outputColorY.append(
                            outputOverlapColorY[np.random.randint(
                                0, len(outputOverlapColorY))])

                    inputColorX = p1.reshape(100, 1)
                    acc, argmax = sess.run([
                        color_accuracy["acc" + str(modelcount)],
                        color_argmax["argmax" + str(modelcount)]
                    ], feed_dict={
                        color_x: inputColorX,
                        color_y: outputColorY
                    })
                    print(str(modelcount + 1) + ") Epoch:",
                          '%04d' % (epoch + 1) + "/" +
                          str(color_training_epochs) + ", Cost= " +
                          "{:.9f}".format(avg_cost) +
                          ", Training Accuracy= " +
                          "{:.5f}".format(acc) + " ")

                # Values fetched in the model's last training step
                c_w["w" + str(modelcount)] = _c_w
                c_b["b" + str(modelcount)] = _c_b

            save_path = os.path.join(save_dir2, filename2)
            saver = tf.train.Saver()
            saver.save(sess, save_path)
        ##################### end of color training -----------------------

        ################# model training ####################
        if clean_train:
            model = ModelBasicCNN('model1', nb_classes, nb_filters)
            preds = model.get_logits(x)
            loss = LossCrossEntropy(model, smoothing=label_smoothing)

            # Checkpoint suffix of the classifier to look for
            saveFileNum = 50
            model_path = os.path.join(save_dir,
                                      filename + "-" + str(saveFileNum))

            # Initialize the Fast Gradient Sign Method (FGSM) attack object
            # and graph
            fgsm = FastGradientMethod(model)
            adv_x = fgsm.generate(x, **fgsm_params)
            preds_adv = model.get_logits(adv_x)

            def evaluate():
                """Report accuracy on legitimate test examples."""
                do_eval(preds, x_test, y_test, 'clean_train_clean_eval',
                        False, pred2=preds, c_w=c_w, c_b=c_b,
                        pr_model_x=pr_model_x,
                        random_color_set=random_color_set,
                        pr_model_W=pr_model_W, pr_model_b=pr_model_b)

            print("Trying to load trained model from: " + model_path)
            if os.path.exists(model_path + ".meta"):
                tf_model_load(sess, model_path)
                print("Load trained model")
            else:
                train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
                      args=train_params, rng=rng,
                      var_list=model.get_params(), save=True,
                      c_w=c_w, c_b=c_b, pr_model_x=pr_model_x,
                      random_color_set=random_color_set,
                      pr_model_W=pr_model_W, pr_model_b=pr_model_b)

            # Calculate training error
            if testing:
                do_eval(preds, x_train, y_train,
                        'train_clean_train_clean_eval')

            # Evaluate the accuracy of the MNIST model on adversarial
            # examples
            do_eval(preds, x_test, y_test, 'clean_train_adv_eval', True,
                    pred2=preds, c_w=c_w, c_b=c_b, ae=adv_x,
                    pr_model_x=pr_model_x,
                    random_color_set=random_color_set,
                    pr_model_W=pr_model_W, pr_model_b=pr_model_b)

            # Calculate training error
            if testing:
                do_eval(preds_adv, x_train, y_train,
                        'train_clean_train_adv_eval')

    # Hand the collected accuracies back, as the docstring promises
    return report
def mnist_tutorial(train_start=0, train_end=1000, test_start=0,
                   test_end=1666, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, clean_train=True, testing=False,
                   backprop_through_attack=False, nb_filters=64,
                   num_threads=None):
    """
    Train a basic CNN on a slice of MNIST, then attack it with FGSM and
    print the logits for, and the accuracy on, the first adversarial test
    example.
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, train on clean examples and run the attack;
                        if false, nothing is trained or evaluated
    :param testing: if true, also record training-set accuracies on the
                    report
    :param backprop_through_attack: accepted but not read by this function
    :param nb_filters: number of filters passed to ModelBasicCNN
    :param num_threads: if truthy, the session is created with
                        `intra_op_parallelism_threads=1`
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create the one TF session used throughout. The original created a
    # second, unconfigured session afterwards, which discarded this
    # configuration and left the first session open.
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        """Run model_eval, store the accuracy on `report` under
        `report_key`, and print it when `is_adv` is True or False."""
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelBasicCNN('model1', 10, nb_filters)
        preds = model.get_logits(x)
        loss = LossCrossEntropy(model, smoothing=0.1)

        def evaluate():
            """Report accuracy on legitimate test examples."""
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess, loss, x, y, x_train, y_train, evaluate=evaluate,
              args=train_params, rng=rng, var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)
        print('adv_x shape:', adv_x.shape)

        # Show the model output for the first adversarial test example
        # next to its true label
        feed_dict = {x: x_test[:1]}
        probabilities = sess.run(preds_adv, feed_dict)
        print(probabilities)
        print('original answer :', y_test[:1])

        # Evaluate the accuracy of the MNIST model on adversarial examples.
        # NOTE(review): only the first test example is evaluated here -
        # confirm this is intended rather than the full test set.
        do_eval(preds_adv, x_test[:1], y_test[:1], 'clean_train_adv_eval',
                True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train,
                    'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Hand the collected accuracies back to the caller
    return report
def train(model, X_train=None, Y_train=None, save=False,
          predictions_adv=None, evaluate=None, args=None, rng=None,
          var_list=None, attack=None, attack_args=None):
    """
    Train a TF Eager model
    :param model: instance of cleverhans model, takes in input batch,
                  gives out probs(softmax layer).
    :param X_train: numpy array with training inputs
    :param Y_train: numpy array with training outputs
    :param save: boolean controlling the save operation
    :param predictions_adv: accepted for interface compatibility; not read
                            by this function (adversarial training is
                            driven by `attack` / `attack_args`)
    :param evaluate: function that is run after each training epoch
                     (typically to display the test/validation accuracy).
    :param args: dict or argparse `Namespace` object.
                 Should contain `nb_epochs`, `learning_rate`,
                 `batch_size`
                 If save is True, should also contain 'train_dir'
                 and 'filename'
    :param rng: Instance of numpy.random.RandomState
    :param var_list: accepted for interface compatibility; not read by this
                     function, which always trains `model.get_params()`
    :param attack: Instance of the class cleverhans.attacks.attacks_eager
    :param attack_args: Parameters required for the attack.
    :return: True if model trained
    :raises ValueError: if only one of `attack` / `attack_args` is given,
                        or if `X_train` or `Y_train` is missing
    """
    args = _ArgsWrapper(args or {})

    if (attack is None) != (attack_args is None):
        raise ValueError("attack and attack_args must be "
                         "passed together.")
    if X_train is None or Y_train is None:
        raise ValueError("X_train argument and Y_train argument "
                         "must be supplied.")

    # Check that necessary arguments were given (see doc above)
    assert args.nb_epochs, "Number of epochs was not given in args dict"
    assert args.learning_rate, "Learning rate was not given in args dict"
    assert args.batch_size, "Batch size was not given in args dict"

    if save:
        assert args.train_dir, "Directory for save was not given in args dict"
        assert args.filename, "Filename for save was not given in args dict"

    if rng is None:
        rng = np.random.RandomState()

    # Optimizer
    tfe = tf.contrib.eager
    optimizer = tf.train.AdamOptimizer(learning_rate=args.learning_rate)

    # Variables that hold the current batch; refilled before every step
    batch_x = tfe.Variable(X_train[0:args.batch_size], dtype=tf.float32)
    batch_y = tfe.Variable(Y_train[0:args.batch_size], dtype=tf.float32)

    # One epoch of training.
    for epoch in xrange(args.nb_epochs):
        # Compute number of batches
        nb_batches = int(math.ceil(float(len(X_train)) / args.batch_size))
        assert nb_batches * args.batch_size >= len(X_train)

        # Indices to shuffle training set
        index_shuf = list(range(len(X_train)))
        rng.shuffle(index_shuf)

        prev = time.time()
        for batch in range(nb_batches):
            # Compute batch start and end indices
            start, end = batch_indices(batch, len(X_train), args.batch_size)

            # Load the shuffled slice into the batch variables
            tf.assign(batch_x, X_train[index_shuf[start:end]])
            tf.assign(batch_y, Y_train[index_shuf[start:end]])

            # Record the forward pass so gradients can be taken afterwards
            with tf.GradientTape() as tape:
                # Define loss
                loss_clean_obj = LossCrossEntropy(model, smoothing=0.)
                loss_clean = loss_clean_obj.fprop(x=batch_x, y=batch_y)
                loss = loss_clean
                # Adversarial training: average clean and adversarial loss
                if attack is not None:
                    batch_adv_x = attack.generate(batch_x, **attack_args)
                    loss_adv_obj = LossCrossEntropy(model, smoothing=0.)
                    loss_adv = loss_adv_obj.fprop(x=batch_adv_x, y=batch_y)
                    loss = (loss_clean + loss_adv) / 2.0

            # Apply grads
            model_variables = model.get_params()
            grads = tape.gradient(loss, model_variables)
            optimizer.apply_gradients(zip(grads, model_variables))

        assert end >= len(X_train)  # Check that all examples were used
        cur = time.time()
        _logger.info("Epoch " + str(epoch) + " took " +
                     str(cur - prev) + " seconds")
        if evaluate is not None:
            evaluate()

    if save:
        save_path = os.path.join(args.train_dir, args.filename)
        # The eager Saver takes the variables at construction time and the
        # checkpoint prefix in save(). The original call,
        # tf.train.Saver().save(save_path, model_variables), passed the
        # path where a session is expected and the variables where the
        # path is expected.
        saver = tfe.Saver(model.get_params())
        saver.save(save_path)
        _logger.info("Completed model training and saved at: " +
                     str(save_path))
    else:
        _logger.info("Completed model training.")

    return True
def mnist_tutorial_cw(train_start=0, train_end=60000, test_start=0,
                      test_end=10000, viz_enabled=True, nb_epochs=6,
                      batch_size=128, source_samples=10,
                      learning_rate=0.001, attack_iterations=100,
                      model_path=os.path.join("models", "mnist"),
                      targeted=True):
    """
    MNIST tutorial for Carlini and Wagner's attack, extended with a small
    "colour" (pixel re-drawing) model that is trained or loaded before the
    MNIST classifier is trained or loaded and attacked.
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :param attack_iterations: value used as the attack's `max_iterations`
    :param model_path: path to the model file; note that this name is
                       reassigned inside the function before it is read,
                       so the argument value is not used
    :param targeted: should we run a targeted attack? or untargeted?
    :return: an AccuracyReport object
    """
    # NOTE(review): this function reads several names it does not define
    # (`file`, `numColorOutput`, `n_input`, `save_dir`, `filename`,
    # `save_dir2`, `filename2`, `model_path2`, `my_tf_round`) - presumably
    # module-level globals; confirm against the rest of the file.

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(file,
                                                  train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    nb_filters = 64

    ################ color training initialization ####################
    color_training_epochs = 5000
    color_learning_rate = 0.1
    # Overlapping intensity intervals, one per colour class
    colorCategory = [
        [0.0, 0.4],  # Black
        [0.3, 0.7],  # Grey
        [0.6, 1.0]  # White
    ]

    numColorInput = 1

    # One scalar intensity per row in, one-hot colour class per row out
    color_x = tf.placeholder(tf.float32, [None, numColorInput])
    color_y = tf.placeholder(tf.float32, [None, numColorOutput])

    # Set model weights
    color_W = tf.Variable(tf.zeros([numColorInput, numColorOutput]))
    color_b = tf.Variable(tf.zeros([numColorOutput]))

    # Linear logits for the softmax classifier
    color_pred_out = tf.matmul(color_x, color_W) + color_b
    color_cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=color_pred_out,
                                                labels=color_y))
    # Gradient Descent
    color_optimizer = tf.train.GradientDescentOptimizer(
        color_learning_rate).minimize(color_cost)

    # Test model
    color_argmax = tf.argmax(color_pred_out, 1)
    color_correct_prediction = tf.equal(tf.argmax(color_pred_out, 1),
                                        tf.argmax(color_y, 1))
    # Calculate accuracy
    color_accuracy = tf.reduce_mean(
        tf.cast(color_correct_prediction, tf.float32))

    # Graph for re-generating the original image into a new image by using
    # trained color model
    pr_model_x = tf.placeholder(tf.float32, [None, n_input, numColorInput])
    pr_model_W = tf.placeholder(tf.float32,
                                [None, numColorInput, numColorOutput])
    pr_model_b = tf.placeholder(tf.float32,
                                [None, numColorInput, numColorOutput])
    # One-hot colour class chosen for every pixel
    pr_model_output = tf.one_hot(
        tf.argmax((tf.matmul(pr_model_x, pr_model_W) + pr_model_b), 2),
        numColorOutput)

    # Merge the random generated output for new image based on the
    # colorCategory
    randomColorCategory = []
    for i in range(len(colorCategory)):
        tmp = []
        tmpRandomColorCategory = my_tf_round(
            tf.random_uniform(tf.shape(pr_model_x),
                              colorCategory[i][0],
                              colorCategory[i][1],
                              dtype=tf.float32), 2)
        tmp.append(tmpRandomColorCategory)
        randomColorCategory.append(tf.concat(tmp, 1))
    random_merge = tf.reshape(tf.concat(randomColorCategory, -1),
                              [-1, n_input, numColorOutput])
    # Keep, per pixel, only the random value of the selected colour class
    random_color_set = tf.reduce_sum(
        tf.multiply(pr_model_output, random_merge), 2)

    # Define input TF placeholder.
    # NOTE: the placeholder bound to x is immediately replaced by a reshape
    # of random_color_set, so the classifier graph below is built on the
    # re-drawn image tensor rather than on a directly fed placeholder.
    x = tf.placeholder(tf.float32,
                       shape=(None, img_rows, img_cols, nchannels))
    x = tf.reshape(random_color_set,
                   shape=(-1, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        # Checkpoint location comes from save_dir / filename rather than
        # from the model_path argument
        'train_dir': save_dir,
        'filename': filename,
        'numColorOutput': numColorOutput
    }

    with sess.as_default():
        if hasattr(tf, "global_variables_initializer"):
            tf.global_variables_initializer().run()
        else:
            warnings.warn("Update your copy of tensorflow; future versions of "
                          "CleverHans may drop support for this version.")
            sess.run(tf.initialize_all_variables())

        ################# color training ####################
        print("Trying to load pr model from: " + model_path2)
        if os.path.exists(model_path2 + ".meta"):
            tf_model_load(sess, model_path2)
            # Fetch the trained weights/biases as numpy values
            c_w, c_b = sess.run([color_W, color_b])
            print("Load color trained model in training")
        else:
            # Training the color
            for epoch in range(color_training_epochs):
                # Training batch: 100 random intensities, each labelled
                # with one of the categories whose interval contains it
                outputColorY = []
                p1 = np.random.random(100)
                for i in range(len(p1)):
                    outputOverlapColorY = []
                    for j in range(len(colorCategory)):
                        if p1[i] >= colorCategory[j][0] and p1[
                                i] <= colorCategory[j][1]:
                            colorIndexSeq = []
                            for k in range(len(colorCategory)):
                                if j == k:
                                    colorIndexSeq.append(1)
                                else:
                                    colorIndexSeq.append(0)
                            outputOverlapColorY.append(colorIndexSeq)

                    # Randomly choose the output for color Y if the
                    # outputOverlapColorY has more than 1 item
                    outputColorY.append(outputOverlapColorY[np.random.randint(
                        0, len(outputOverlapColorY))])

                inputColorX = p1.reshape(100, 1)
                # One optimisation step; also fetch the current parameters
                _, c, c_w, c_b = sess.run(
                    [color_optimizer, color_cost, color_W, color_b],
                    feed_dict={
                        color_x: inputColorX,
                        color_y: outputColorY
                    })
                avg_cost = c

                # Evaluating color model
                outputColorY = []
                p1 = np.random.random(100)
                # Generate output for random color inputs (test case).
                # The list is reset for every category and the loop stops
                # at the first match, so the first matching category is
                # always the label here.
                for i in range(len(p1)):
                    for j in range(len(colorCategory)):
                        outputOverlapColorY = []
                        if p1[i] >= colorCategory[j][0] and p1[
                                i] <= colorCategory[j][1]:
                            colorIndexSeq = []
                            for k in range(len(colorCategory)):
                                if j == k:
                                    colorIndexSeq.append(1)
                                else:
                                    colorIndexSeq.append(0)
                            outputOverlapColorY.append(colorIndexSeq)
                            break

                    # Randomly choose the output for color Y if the
                    # outputOverlapColorY has more than 1 item
                    outputColorY.append(outputOverlapColorY[np.random.randint(
                        0, len(outputOverlapColorY))])

                inputColorX = p1.reshape(100, 1)
                acc, argmax = sess.run([color_accuracy, color_argmax],
                                       feed_dict={
                                           color_x: inputColorX,
                                           color_y: outputColorY
                                       })
                print("Epoch:", '%04d' % (epoch + 1) + "/" + str(color_training_epochs) + ", Cost= " + \
                      "{:.9f}".format(avg_cost) + ", Training Accuracy= " + \
                      "{:.5f}".format(acc) + " ")

            with tf.device('/CPU:0'):
                saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)
                # Since training PR model is fast, we do not have to save
                # multiple sessions for this
                save_path = os.path.join(save_dir2, filename2)
                saver.save(sess, save_path)
        ##################### end of color training -----------------------

        ################# model training ####################
        rng = np.random.RandomState([2017, 8, 30])

        # Checkpoint suffix of the classifier to load; the second
        # assignment is the one that takes effect
        saveFileNum = 50
        saveFileNum = 500
        # This rebinding shadows the model_path argument
        model_path = os.path.join(save_dir, filename + "-" + str(saveFileNum))

        # check if we've trained before, and if we have, use that
        # pre-trained model
        print("Trying to load trained model from: " + model_path)
        if os.path.exists(model_path + ".meta"):
            tf_model_load(sess, model_path)
            print("Load trained model")
        else:
            train(sess, loss, x, y, x_train, y_train, args=train_params,
                  rng=rng, save=True, c_w=c_w, c_b=c_b,
                  pr_model_x=pr_model_x, random_color_set=random_color_set,
                  pr_model_W=pr_model_W, pr_model_b=pr_model_b)

    # Evaluation parameters; the clean-accuracy evaluation itself is
    # disabled in this version, and this dict is rebuilt before it is read
    eval_params = {'batch_size': batch_size, 'numColorOutput': numColorOutput}

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, back='tf', sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        # Index of the first test example of each class
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            # Repeat every source image once per target class
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        # Identity matrix: row i is the one-hot target for class i
        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            # The per-class selection is overridden: the whole test set
            # is attacked
            adv_inputs = x_test[idxs]
            adv_inputs = x_test
        else:
            # Likewise overridden to the whole test set
            adv_inputs = x_test[:source_samples]
            adv_inputs = x_test

        adv_ys = None
        yname = "y"

    cw_params = {
        'binary_search_steps': 1,
        'max_iterations': attack_iterations,
        'learning_rate': 0.1,
        'batch_size':
        source_samples * nb_classes if targeted else source_samples,
        'initial_const': 10
    }

    # Symbolic adversarial tensor, built before the label entry is added to
    # cw_params; passed to model_eval as `ae` in the untargeted branch
    adv2 = cw.generate(x, **cw_params)
    cw_params[yname] = adv_ys
    adv = None
    # Concrete adversarial examples as a numpy array
    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {
        'batch_size': np.minimum(nb_classes, source_samples),
        'numColorOutput': numColorOutput
    }
    if targeted:
        adv_accuracy = model_eval(sess, x, y, preds, adv, adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            # NOTE(review): adv is built from the whole test set above while
            # the labels here are y_test[idxs] - confirm the lengths are
            # meant to differ.
            adv_accuracy = model_eval(sess, x, y, preds, adv, y_test[idxs],
                                      args=eval_params)
        else:
            adv_accuracy = model_eval(sess, x, y, preds, adv, y_test,
                                      args=eval_params, pred2=preds,
                                      c_w=c_w, c_b=c_b,
                                      pr_model_x=pr_model_x,
                                      random_color_set=random_color_set,
                                      pr_model_W=pr_model_W,
                                      pr_model_b=pr_model_b,
                                      is_adv=True, ae=adv2)

    if viz_enabled:
        # Fill the visualization grid
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                # Column 0: original input, column 1: adversarial example
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')
    print("load save file: ", saveFileNum)

    # Report the accuracy returned by model_eval and store its complement
    print('Test with adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    # (mean L2 norm of adv - adv_inputs, despite the variable name)
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        _ = grid_visual(grid_viz_data)

    return report