def createAttack(model, sess, x, y, X_test, y_test, eps=0.02):
    import sys
    import time

    import numpy as np
    from cleverhans.attacks import MadryEtAl

    print("Beginning PGD attack")
    pgd = MadryEtAl(model, back='tf', sess=sess)  # `back` only exists in older cleverhans releases
    preds = model(x)

    t0 = time.time()
    batch_size = 64

    # Iterate over X_test in fixed-size batches; any remainder that does not
    # fill a whole batch is attacked separately below.
    X_test_adv_pgd = np.zeros(X_test.shape)
    num_batches = X_test.shape[0] // batch_size
    pgd_params = {'eps': eps,
                  'eps_iter': 0.01,
                  'clip_min': -1.,
                  'clip_max': 1.,
                  'nb_iter': 20}
    for i in range(num_batches):
        batch_start = batch_size * i
        batch_end = batch_size * (i + 1)
        batch = X_test[batch_start:batch_end]
        if i % 20 == 0:
            print("attacking batch", i, "from", batch_start, "to", batch_end,
                  file=sys.stderr)
        # Target the opposite class by flipping the one-hot (binary) labels.
        pgd_params['y_target'] = 1 - y_test[batch_start:batch_end]
        X_test_adv_pgd[batch_start:batch_end] = pgd.generate_np(batch,
                                                                **pgd_params)
    if X_test.shape[0] % batch_size:
        batch_start = num_batches * batch_size
        batch_end = X_test.shape[0]
        batch = X_test[batch_start:batch_end].reshape((-1, 224, 224, 3))
        print("attacking residual batch from", batch_start, "to", batch_end,
              file=sys.stderr)
        pgd_params['y_target'] = 1 - y_test[batch_start:batch_end].reshape((-1, 2))
        X_test_adv_pgd[batch_start:batch_end] = pgd.generate_np(batch,
                                                                **pgd_params)

    # Report on timing
    t1 = time.time()
    total = t1-t0
    m, s = divmod(total, 60)
    h, m = divmod(m, 60)
    print ("Completed attack in %d:%02d:%02d" % (h, m, s))
    
    return X_test_adv_pgd
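For context, a hedged sketch of how createAttack might be invoked and its output checked; `model` (a cleverhans-compatible callable), `sess`, `x`, `y`, `X_test` and `y_test` are assumed to exist in the calling script and are not part of the snippet above.

import numpy as np
import tensorflow as tf

# Hypothetical caller: craft adversarial examples for the whole test set.
X_test_adv = createAttack(model, sess, x, y, X_test, y_test, eps=0.02)

# The attack targets the flipped binary labels, so targeted success means
# the model now predicts the opposite class.
preds_adv = sess.run(model(tf.constant(X_test_adv, tf.float32)))
success = np.mean(np.argmax(preds_adv, axis=1) == np.argmax(1 - y_test, axis=1))
print("targeted success rate:", success)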
Example #2
import numpy as np
import tensorflow as tf

from cleverhans.attacks import MadryEtAl
# CleverHansTest lives in the library's test utilities; SimpleModel is defined
# in the same test file (see the sketch after this class).
from cleverhans.devtools.checks import CleverHansTest


class TestMadryEtAl(CleverHansTest):
    def setUp(self):
        super(TestMadryEtAl, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = MadryEtAl(self.model, sess=self.sess)

    def test_attack_strength(self):
        """
        If clipping is not done at each iteration (i.e. without clip_min and
        clip_max), this attack fails, with
        np.mean(orig_labels == new_labels) ending up around .5.
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val,
                                        eps=1.0,
                                        eps_iter=0.05,
                                        clip_min=0.5,
                                        clip_max=0.7,
                                        nb_iter=5,
                                        sanity_checks=False)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertLess(np.mean(orig_labs == new_labs), 0.1)

    def test_clip_eta(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val,
                                        eps=1.0,
                                        eps_iter=0.1,
                                        nb_iter=5)

        delta = np.max(np.abs(x_adv - x_val), axis=1)
        self.assertTrue(np.all(delta <= 1.))

    def test_generate_np_gives_clipped_adversarial_examples(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val,
                                        eps=1.0,
                                        eps_iter=0.1,
                                        nb_iter=5,
                                        clip_min=-0.2,
                                        clip_max=0.3,
                                        sanity_checks=False)

        self.assertLess(-0.201, np.min(x_adv))
        self.assertLess(np.max(x_adv), .301)

    def test_multiple_initial_random_step(self):
        """
        This test repeatedly generates adversarial examples until one is
        produced whose label differs from the original label. This is the
        procedure suggested in Madry et al. (2017).

        This test will fail if an initial random step is not taken
        (error > 0.5).
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs_multi = orig_labs.copy()

        # Generate multiple adversarial examples
        for i in range(10):
            x_adv = self.attack.generate_np(x_val,
                                            eps=.5,
                                            eps_iter=0.05,
                                            clip_min=0.5,
                                            clip_max=0.7,
                                            nb_iter=2,
                                            sanity_checks=False)
            new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

            # Examples for which we have not found adversarial examples
            I = (orig_labs == new_labs_multi)
            new_labs_multi[I] = new_labs[I]

        self.assertLess(np.mean(orig_labs == new_labs_multi), 0.5)
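SimpleModel is not shown in this excerpt (nor in Example #8, which uses it too). In the cleverhans test suite it is a tiny fixed-weight two-class network, apparently the same one that Example #4 below inlines as my_model. A reconstruction sketch, assuming the cleverhans Model base class:

import tensorflow as tf
from cleverhans.model import Model

class SimpleModel(Model):
    """Sketch of the fixed-weight toy classifier used by these tests
    (reconstructed from Example #4's my_model; an assumption, not the
    verbatim library code)."""
    def fprop(self, x, **kwargs):
        w1 = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32)
        h1 = tf.nn.sigmoid(tf.matmul(x, w1))
        w2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
        logits = tf.matmul(h1, w2)
        return {self.O_LOGITS: logits, self.O_PROBS: tf.nn.softmax(logits)}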
Example #3
# #              'max_iterations': attack_iterations,
# #              'learning_rate': 0.1,
# #              'batch_size': n_attack,
# #              'initial_const': 10}
# cw = CarliniWagnerL2(wrap_clf, back='tf', sess=sess)
# adv = cw.generate_np(X_atk, **cw_params)

from cleverhans.attacks import MadryEtAl
pgd_params = {'eps': 0.3,
              'eps_iter': 0.01,
              'nb_iter': 40,
              'clip_min': 0.,
              'clip_max': 1.,
              'rand_init': True}
pgd = MadryEtAl(wrap_clf, sess=sess)
adv = pgd.generate_np(X_atk, **pgd_params)

# adv_x = cw.generate(x, **cw_params)
# preds_adv = clf(adv_x)
# acc = model_eval(sess, x, y, preds_adv, X_test[:n_attack],
#                  y_test[:n_attack], args={'batch_size': n_attack})
# print('Test accuracy on CW adversarial examples: %0.4f\n' % acc)

pred = clf.predict(adv)
# print(np.sum(np.argmax(pred, axis=1) != np.argmax(y_test[:n_attack], axis=1)))
# pred_orig = clf.predict(X_atk)
# print(np.sum(np.argmax(pred, axis=1) != np.argmax(pred_orig, axis=1)))
# Count how many adversarial examples landed on the intended target class.
print(np.sum(np.argmax(pred, axis=1) == np.argmax(y_target, axis=1)))

# Save some images
import scipy.misc
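The excerpt is truncated at this point. A hedged sketch of how the images might be saved with scipy.misc (`adv` is the batch crafted above and is assumed to be image-shaped; note scipy.misc.imsave was deprecated and removed in SciPy 1.2+):

for idx in range(min(5, len(adv))):
    # Squeeze in case of a trailing single-channel axis; shape is an assumption.
    scipy.misc.imsave('adv_%d.png' % idx, adv[idx].squeeze())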
Example #4
class TestMadryEtAl(CleverHansTest):
    def setUp(self):
        super(TestMadryEtAl, self).setUp()
        import tensorflow as tf

        # The world's simplest neural network
        def my_model(x):
            W1 = tf.constant([[1.5, .3], [-2, 0.3]], dtype=tf.float32)
            h1 = tf.nn.sigmoid(tf.matmul(x, W1))
            W2 = tf.constant([[-2.4, 1.2], [0.5, -2.3]], dtype=tf.float32)
            res = tf.matmul(h1, W2)
            return res

        self.sess = tf.Session()
        self.model = my_model
        self.attack = MadryEtAl(self.model, sess=self.sess)

    def test_attack_strength(self):
        """
        If clipping is not done at each iteration (i.e. without clip_min and
        clip_max), this attack fails, with
        np.mean(orig_labels == new_labels) ending up around .5.
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.05,
                                        clip_min=0.5, clip_max=0.7,
                                        nb_iter=5)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        print(np.mean(orig_labs == new_labs))
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_clip_eta(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5)

        delta = np.max(np.abs(x_adv - x_val), axis=1)
        self.assertTrue(np.all(delta <= 1.))

    def test_generate_np_gives_clipped_adversarial_examples(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5,
                                        clip_min=-0.2, clip_max=0.3)

        self.assertTrue(-0.201 < np.min(x_adv))
        self.assertTrue(np.max(x_adv) < .301)

    def test_multiple_initial_random_step(self):
        """
        This test repeatedly generates adversarial examples until one is
        produced whose label differs from the original label. This is the
        procedure suggested in Madry et al. (2017).

        This test will fail if an initial random step is not taken
        (error > 0.5).
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs_multi = orig_labs.copy()

        # Generate multiple adversarial examples
        for i in range(10):
            x_adv = self.attack.generate_np(x_val, eps=.5, eps_iter=0.05,
                                            clip_min=0.5, clip_max=0.7,
                                            nb_iter=2)
            new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

            # Examples for which we have not found adversarial examples
            I = (orig_labs == new_labs_multi)
            new_labs_multi[I] = new_labs[I]

        self.assertTrue(np.mean(orig_labs == new_labs_multi) < 0.1)
Example #5
def prepare_image(image, target=IMAGE_DIMS):  # header reconstructed; the excerpt began mid-function
    # resize the input image and preprocess it
    image = image.resize(target)
    image = keras.preprocessing.image.img_to_array(image)
    image = np.expand_dims(image, axis=0)
    image = keras.applications.mobilenet.preprocess_input(image)
    # return the processed image
    return image


model = keras.models.load_model("./model.h5")
#model.summary()
src = Image.open("./trixi.png").resize(IMAGE_DIMS)
src = prepare_image(src)
score = model.predict(src)
print(score[0][TREE_FROG_IDX])
print(np.argmax(score))
# Older cleverhans auto-wraps a bare Keras model (with a warning); newer
# versions require a cleverhans Model such as utils_keras.KerasModelWrapper.
attack = MadryEtAl(model, sess=sess)
attack_params = {
    'eps': 0.2,
    'nb_iter': 10,
    'eps_iter': 0.2,
    'y_target': np.expand_dims(np.eye(num_labels)[TREE_FROG_IDX], axis=0)
}
adv_x = attack.generate_np(src, **attack_params)
adversarial = adv_x.reshape((224, 224, 3))
img = keras.preprocessing.image.array_to_img(adversarial)
score = model.predict(adv_x)
print(score[0][TREE_FROG_IDX])
print(np.argmax(score))
img.save("./solution.png")
assert is_similar_img("./trixi.png", "./solution.png")
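A quick, hedged sanity check that can follow generate_np: MadryEtAl constrains the perturbation to the eps ball in the L-infinity norm, so the crafted input should differ from the preprocessed source by at most eps per pixel.

# Sanity-check sketch (assumes src and adv_x from the snippet above).
max_perturbation = np.max(np.abs(adv_x - src))
assert max_perturbation <= attack_params['eps'] + 1e-6, max_perturbation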
def train_mnist(model_dir,
                next_batch_fn,
                total_batches,
                train_mode,
                save_every=1000,
                print_every=100):
    x_input = tf.placeholder(tf.float32, (None, 28, 28, 1))
    y_input = tf.placeholder(tf.float32, [None, 10])

    model = mnist_convnet.Model()
    logits = model(x_input)

    loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_input,
                                                   logits=logits)
    loss = tf.reduce_mean(loss)
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(tf.argmax(logits, axis=1), tf.argmax(y_input,
                                                              axis=1)),
                dtype=tf.float32))

    global_step = tf.contrib.framework.get_or_create_global_step()
    train_step = tf.train.AdamOptimizer(1e-4).minimize(loss,
                                                       global_step=global_step)

    saver = tf.train.Saver(max_to_keep=3)
    a = tf.summary.scalar('accuracy adv train', accuracy)
    b = tf.summary.scalar('xent adv train', loss)
    c = tf.summary.image('images adv train', x_input)
    adv_summaries = tf.summary.merge([a, b, c])

    a = tf.summary.scalar('accuracy nat train', accuracy)
    b = tf.summary.scalar('xent nat train', loss)
    c = tf.summary.image('images nat train', x_input)
    nat_summaries = tf.summary.merge([a, b, c])

    with tf.Session() as sess:
        attack = MadryEtAl(model, sess=sess)

        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
        sess.run(tf.global_variables_initializer())

        for batch_num in range(total_batches):
            x_batch, y_batch = next_batch_fn()
            x_batch = np.reshape(x_batch, (-1, 28, 28, 1))

            if train_mode == "adversarial" and batch_num > 1000:
                x_batch_adv = attack.generate_np(x_batch,
                                                 y=y_batch,
                                                 eps=.3,
                                                 nb_iter=40,
                                                 eps_iter=.01,
                                                 rand_init=True,
                                                 clip_min=0,
                                                 clip_max=1)

            else:
                x_batch_adv = x_batch

            nat_dict = {x_input: x_batch, y_input: y_batch}

            adv_dict = {x_input: x_batch_adv, y_input: y_batch}

            if batch_num % print_every == 0:
                a, l, s = sess.run((accuracy, loss, nat_summaries), nat_dict)
                summary_writer.add_summary(s, sess.run(global_step))
                print(batch_num, "Clean accuracy", a, "loss", l)
                if train_mode == "adversarial":
                    a, l, s = sess.run((accuracy, loss, adv_summaries),
                                       adv_dict)
                    summary_writer.add_summary(s, sess.run(global_step))
                    print(batch_num, "Adv accuracy", a, "loss", l)

            if batch_num % save_every == 0:
                saver.save(sess,
                           os.path.join(model_dir, "checkpoint"),
                           global_step=global_step)

            # One step on the clean batch and one on the adversarial batch
            # (identical when not in adversarial mode or during the first
            # 1000 warm-up batches).
            sess.run(train_step, nat_dict)
            sess.run(train_step, adv_dict)
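A hedged sketch of how the checkpoints written above might be restored for evaluation; the graph construction mirrors train_mnist, while eval_mnist, x_test and y_test are hypothetical names, and mnist_convnet is the same module the snippet uses.

import numpy as np
import tensorflow as tf

def eval_mnist(model_dir, x_test, y_test):
    # Rebuild the same graph as train_mnist, then load the latest weights.
    tf.reset_default_graph()
    x_input = tf.placeholder(tf.float32, (None, 28, 28, 1))
    model = mnist_convnet.Model()
    logits = model(x_input)
    with tf.Session() as sess:
        ckpt = tf.train.latest_checkpoint(model_dir)
        tf.train.Saver().restore(sess, ckpt)
        preds = sess.run(tf.argmax(logits, axis=1), {x_input: x_test})
        return np.mean(preds == np.argmax(y_test, axis=1))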
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial
    :return:
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()

    set_log_level(logging.DEBUG)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Sanity-check the one-hot label shape
    assert Y_train.shape[1] == 10

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))

    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = FLAGS.model_path
    nb_samples = FLAGS.nb_samples

    from cnn_models import make_basic_cnn
    model = make_basic_cnn('fp_',
                           input_shape=(None, img_rows, img_cols, channels),
                           nb_filters=FLAGS.nb_filters)

    preds = model(x)
    print("Defined TensorFlow model graph with %d parameters" % model.n_params)

    rng = np.random.RandomState([2017, 8, 30])

    def evaluate(eval_params):
        # Evaluate the model on legitimate test examples
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        return acc

    model_load(sess, model_path)
    print('Restored model from %s' % model_path)
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = evaluate(eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Build dataset to perturb
    ###########################################################################
    if FLAGS.targeted:
        from utils import build_targeted_dataset
        adv_inputs, true_labels, adv_ys = build_targeted_dataset(
            X_test, Y_test, np.arange(nb_samples), nb_classes, img_rows,
            img_cols, channels)
        att_batch_size = np.clip(nb_samples * (nb_classes - 1),
                                 a_max=MAX_BATCH_SIZE,
                                 a_min=1)
        nb_adv_per_sample = nb_classes - 1
        yname = "y_target"
    else:
        adv_inputs = X_test[:nb_samples]
        true_labels = Y_test[:nb_samples]
        att_batch_size = np.minimum(nb_samples, MAX_BATCH_SIZE)
        nb_adv_per_sample = 1
        adv_ys = None
        yname = "y"

    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_adv_per_sample) +
          ' adversarial examples')
    print("This could take some time ...")

    if FLAGS.attack == 'pgd':
        from cleverhans.attacks import MadryEtAl
        attacker = MadryEtAl(model, sess=sess)
        attack_params = {
            'eps': FLAGS.eps / 255.,
            'eps_iter': EPS_ITER / 255.,
            'nb_iter': FLAGS.nb_iter,
            'ord': np.inf,
            'rand_init': True,
            'batch_size': att_batch_size
        }
    elif FLAGS.attack == 'cwl2':
        from cleverhans.attacks import CarliniWagnerL2
        attacker = CarliniWagnerL2(model, sess=sess)
        learning_rate = 0.1
        attack_params = {
            'binary_search_steps': 1,
            'max_iterations': FLAGS.nb_iter,
            'learning_rate': learning_rate,
            'initial_const': 10,
            'batch_size': att_batch_size
        }
    else:
        raise ValueError("Unknown attack: %s" % FLAGS.attack)

    attack_params.update({
        'clip_min': 0.,
        'clip_max': 1.,
    })
    if FLAGS.targeted:
        # Supply the target labels so the attack is actually targeted.
        attack_params.update({yname: adv_ys})

    X_test_adv = attacker.generate_np(adv_inputs, **attack_params)

    if FLAGS.targeted:
        assert X_test_adv.shape[0] == nb_samples * \
            (nb_classes - 1), X_test_adv.shape
        # Evaluate the accuracy of the CIFAR10 model on adversarial
        # examples
        print("Evaluating targeted results")
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  X_test_adv,
                                  true_labels,
                                  args=eval_params)
    else:
        # Evaluate the accuracy of the CIFAR10 model on adversarial
        # examples
        print("Evaluating un-targeted results")
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  X_test_adv,
                                  Y_test,
                                  args=eval_params)

    print('Test accuracy on adversarial examples %.4f' % adv_accuracy)

    # Compute the avg. distortion introduced by the attack
    diff = np.abs(X_test_adv - adv_inputs)

    l1_norm = np.mean(np.sum(diff, axis=(1, 2, 3)))
    print('Avg. L_1 norm of perturbations {0:.4f}'.format(l1_norm))

    norm = np.mean(np.sqrt(np.sum(np.square(diff), axis=(1, 2, 3))))
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(norm))

    sess.close()
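The script reads several module-level FLAGS and constants that fall outside this excerpt. A hedged sketch of definitions consistent with the usage above; every default value here is an illustrative guess, not the original configuration:

import tensorflow as tf

EPS_ITER = 2.         # per-step size on the 0-255 scale (assumed constant)
MAX_BATCH_SIZE = 128  # cap on the attack batch size (assumed constant)

flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('model_path', 'models/cifar10', 'Path to the saved model')
flags.DEFINE_integer('nb_samples', 1000, 'Number of test inputs to attack')
flags.DEFINE_integer('batch_size', 128, 'Evaluation batch size')
flags.DEFINE_integer('nb_filters', 64, 'Number of convolutional filters')
flags.DEFINE_integer('nb_iter', 20, 'Number of attack iterations')
flags.DEFINE_float('eps', 8., 'Attack strength, on the 0-255 scale')
flags.DEFINE_bool('targeted', False, 'Run a targeted attack')
flags.DEFINE_string('attack', 'pgd', "Attack to run: 'pgd' or 'cwl2'")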
Example #8
class TestMadryEtAl(CleverHansTest):
    def setUp(self):
        super(TestMadryEtAl, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = MadryEtAl(self.model, sess=self.sess)

    def test_attack_strength(self):
        """
        If clipping is not done at each iteration (i.e. without clip_min and
        clip_max), this attack fails, with
        np.mean(orig_labels == new_labels) ending up around .5.
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.05,
                                        clip_min=0.5, clip_max=0.7,
                                        nb_iter=5)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(orig_labs == new_labs) < 0.1)

    def test_clip_eta(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5)

        delta = np.max(np.abs(x_adv - x_val), axis=1)
        self.assertTrue(np.all(delta <= 1.))

    def test_generate_np_gives_clipped_adversarial_examples(self):
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        x_adv = self.attack.generate_np(x_val, eps=1.0, eps_iter=0.1,
                                        nb_iter=5,
                                        clip_min=-0.2, clip_max=0.3)

        self.assertTrue(-0.201 < np.min(x_adv))
        self.assertTrue(np.max(x_adv) < .301)

    def test_multiple_initial_random_step(self):
        """
        This test repeatedly generates adversarial examples until one is
        produced whose label differs from the original label. This is the
        procedure suggested in Madry et al. (2017).

        This test will fail if an initial random step is not taken
        (error > 0.5).
        """
        x_val = np.random.rand(100, 2)
        x_val = np.array(x_val, dtype=np.float32)

        orig_labs = np.argmax(self.sess.run(self.model(x_val)), axis=1)
        new_labs_multi = orig_labs.copy()

        # Generate multiple adversarial examples
        for i in range(10):
            x_adv = self.attack.generate_np(x_val, eps=.5, eps_iter=0.05,
                                            clip_min=0.5, clip_max=0.7,
                                            nb_iter=2)
            new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)

            # Examples for which we have not found adversarial examples
            I = (orig_labs == new_labs_multi)
            new_labs_multi[I] = new_labs[I]

        self.assertTrue(np.mean(orig_labs == new_labs_multi) < 0.1)
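Every example above drives the attack through MadryEtAl.generate_np, so it may help to see the procedure it implements: a random start inside the eps ball, then repeated sign-of-gradient ascent steps, each projected back onto the ball and the valid pixel range. A minimal NumPy/TensorFlow sketch of that loop, for illustration only (not the library implementation); `loss` is assumed to be a scalar cross-entropy tensor already wired to the true labels:

import numpy as np
import tensorflow as tf

def pgd_sketch(sess, x, loss, x_val, eps, eps_iter, nb_iter,
               clip_min=None, clip_max=None):
    grad = tf.gradients(loss, x)[0]
    # "Initial random step": start uniformly inside the eps ball.
    adv = x_val + np.random.uniform(-eps, eps,
                                    x_val.shape).astype(np.float32)
    for _ in range(nb_iter):
        g = sess.run(grad, {x: adv})
        adv = adv + eps_iter * np.sign(g)             # ascend the loss
        adv = np.clip(adv, x_val - eps, x_val + eps)  # project onto eps ball
        if clip_min is not None and clip_max is not None:
            adv = np.clip(adv, clip_min, clip_max)    # stay in valid range
    return adv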