Example #1
    def __init__(self,
                 model,
                 image_shape_hwc,
                 epsilon=(16. / 255),
                 num_steps=200,
                 batch_size=32,
                 is_debug=False):
        self.graph = tf.Graph()

        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)

            self.x_input = tf.placeholder(tf.float32,
                                          shape=(1, ) + image_shape_hwc)
            self.y_label = tf.placeholder(tf.int32, shape=(1, ))

            self.model = model
            attack = SPSA(CleverhansPyfuncModelWrapper(self.model),
                          sess=self.sess)
            self.x_adv = attack.generate(self.x_input,
                                         y=self.y_label,
                                         epsilon=epsilon,
                                         num_steps=num_steps,
                                         early_stop_loss_threshold=-1.,
                                         batch_size=batch_size,
                                         is_debug=is_debug)

        self.graph.finalize()
Example #2
    def test_attack_success(self):
        """Check SPSA creates misclassified images."""
        epsilon = 4. / 255
        input_dir = FLAGS.input_image_dir
        metadata_file_path = FLAGS.metadata_file_path
        num_images = 8
        batch_shape = (num_images, 299, 299, 3)
        images, labels = load_images(input_dir, metadata_file_path,
                                     batch_shape)
        num_classes = 1001

        tf.logging.set_verbosity(tf.logging.INFO)
        with tf.Graph().as_default():
            # Prepare graph
            x_input = tf.placeholder(tf.float32, shape=(1, ) + batch_shape[1:])
            y_label = tf.placeholder(tf.int32, shape=(1, ))
            model = InceptionModel(num_classes)

            attack = SPSA(model)
            x_adv = attack.generate(x_input,
                                    y=y_label,
                                    epsilon=epsilon,
                                    num_steps=30,
                                    early_stop_loss_threshold=-1.,
                                    spsa_samples=32,
                                    spsa_iters=16,
                                    is_debug=True)

            logits = model.get_logits(x_adv)
            acc = _top_1_accuracy(logits, y_label)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            session_creator = tf.train.ChiefSessionCreator(
                scaffold=tf.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)

            num_correct = 0.
            with tf.train.MonitoredSession(
                    session_creator=session_creator) as sess:
                for i in range(num_images):
                    acc_val = sess.run(acc,
                                       feed_dict={
                                           x_input:
                                           np.expand_dims(images[i], axis=0),
                                           y_label:
                                           np.expand_dims(labels[i], axis=0),
                                       })
                    tf.logging.info('Accuracy: %s', acc_val)
                    num_correct += acc_val
                assert (num_correct / num_images) < 0.1
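For reference, a minimal sketch of how the session/graph wrapper from Example #1 might be driven; the class name `SpsaAttack` and the image-loading helper are assumptions, not part of the original code:

# Hypothetical driver for the wrapper in Example #1 (SpsaAttack and
# load_single_image are assumed names).
attack = SpsaAttack(model, image_shape_hwc=(299, 299, 3), epsilon=16. / 255)
image, label = load_single_image()  # HWC float image and integer label
adv = attack.sess.run(attack.x_adv,
                      feed_dict={attack.x_input: image[None],
                                 attack.y_label: np.array([label])})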
Example #3
    def test_attack_bounds(self):
        """Check SPSA respects perturbation limits."""
        epsilon = 4. / 255
        input_dir = FLAGS.input_image_dir
        metadata_file_path = FLAGS.metadata_file_path
        num_images = 8
        batch_shape = (num_images, 299, 299, 3)
        images, labels = load_images(input_dir, metadata_file_path,
                                     batch_shape)
        nb_classes = 1001

        tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
        with tf.Graph().as_default():
            # Prepare graph
            x_input = tf.compat.v1.placeholder(tf.float32,
                                               shape=(1, ) + batch_shape[1:])
            y_label = tf.compat.v1.placeholder(tf.int32, shape=(1, ))
            model = InceptionModel(nb_classes)

            attack = SPSA(model)
            x_adv = attack.generate(x_input,
                                    y=y_label,
                                    epsilon=epsilon,
                                    num_steps=10,
                                    early_stop_loss_threshold=-1.,
                                    spsa_samples=32,
                                    spsa_iters=1,
                                    is_debug=True)

            # Run computation
            saver = tf.compat.v1.train.Saver(slim.get_model_variables())
            session_creator = tf.compat.v1.train.ChiefSessionCreator(
                scaffold=tf.compat.v1.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)

            with tf.compat.v1.train.MonitoredSession(
                    session_creator=session_creator) as sess:
                for i in range(num_images):
                    x_expanded = np.expand_dims(images[i], axis=0)
                    y_expanded = np.expand_dims(labels[i], axis=0)

                    adv_image = sess.run(x_adv,
                                         feed_dict={
                                             x_input: x_expanded,
                                             y_label: y_expanded
                                         })
                    diff = adv_image - images[i]
                    assert np.max(np.abs(diff)) < epsilon + 1e-4
                    assert np.max(adv_image) < 1. + 1e-4
                    assert np.min(adv_image) > -1e-4
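The three assertions above also appear verbatim in the later bound tests; a small helper capturing them (a sketch, assuming pixel values in [0, 1] as the test expects):

def check_linf_bounds(x_adv, x_orig, epsilon, tol=1e-4):
    # The adversarial image must stay inside the epsilon ball and the valid
    # pixel range.
    assert np.max(np.abs(x_adv - x_orig)) < epsilon + tol
    assert np.max(x_adv) < 1. + tol
    assert np.min(x_adv) > -tol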
Example #4
class TestSPSA(CleverHansTest):
    def setUp(self):
        super(TestSPSA, self).setUp()

        self.sess = tf.Session()
        self.model = SimpleModel()
        self.attack = SPSA(self.model, sess=self.sess)

    def test_attack_strength(self):
        # This uses the existing input structure for SPSA. Tom tried for ~40
        # minutes to get generate_np to work correctly but could not.

        n_samples = 10
        x_val = np.random.rand(n_samples, 2)
        x_val = np.array(x_val, dtype=np.float32)

        # The SPSA attack currently uses non-one-hot labels
        # TODO: change this to use standard cleverhans label conventions
        feed_labs = np.random.randint(0, 2, n_samples)

        x_input = tf.placeholder(tf.float32, shape=(1, 2))
        y_label = tf.placeholder(tf.int32, shape=(1, ))

        x_adv_op = self.attack.generate(
            x_input,
            y=y_label,
            epsilon=.5,
            num_steps=100,
            batch_size=64,
            spsa_iters=1,
        )

        all_x_adv = []
        for i in range(n_samples):
            x_adv_np = self.sess.run(x_adv_op,
                                     feed_dict={
                                         x_input:
                                         np.expand_dims(x_val[i], axis=0),
                                         y_label:
                                         np.expand_dims(feed_labs[i], axis=0),
                                     })
            all_x_adv.append(x_adv_np[0])

        x_adv = np.vstack(all_x_adv)
        new_labs = np.argmax(self.sess.run(self.model(x_adv)), axis=1)
        self.assertTrue(np.mean(feed_labs == new_labs) < 0.1)
Example #5
    def test_attack_success(self):
        """Check SPSA creates misclassified images."""
        epsilon = 4. / 255
        input_dir = FLAGS.input_image_dir
        metadata_file_path = FLAGS.metadata_file_path
        num_images = 8
        batch_shape = (num_images, 299, 299, 3)
        images, labels = load_images(
            input_dir, metadata_file_path, batch_shape)
        num_classes = 1001

        tf.logging.set_verbosity(tf.logging.INFO)
        with tf.Graph().as_default():
            # Prepare graph
            x_input = tf.placeholder(tf.float32, shape=(1,) + batch_shape[1:])
            y_label = tf.placeholder(tf.int32, shape=(1,))
            model = InceptionModel(num_classes)

            attack = SPSA(model)
            x_adv = attack.generate(
                x_input, y=y_label, epsilon=epsilon, num_steps=30,
                early_stop_loss_threshold=-1., batch_size=32, spsa_iters=16,
                is_debug=True)

            logits = model.get_logits(x_adv)
            acc = _top_1_accuracy(logits, y_label)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            session_creator = tf.train.ChiefSessionCreator(
                scaffold=tf.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)

            num_correct = 0.
            with tf.train.MonitoredSession(
                    session_creator=session_creator) as sess:
                for i in range(num_images):
                    acc_val = sess.run(acc, feed_dict={
                        x_input: np.expand_dims(images[i], axis=0),
                        y_label: np.expand_dims(labels[i], axis=0),
                    })
                    tf.logging.info('Accuracy: %s', acc_val)
                    num_correct += acc_val
                assert (num_correct / num_images) < 0.1
Example #6
    def test_attack_bounds(self):
        """Check SPSA respects perturbation limits."""
        epsilon = 4. / 255
        input_dir = FLAGS.input_image_dir
        metadata_file_path = FLAGS.metadata_file_path
        num_images = 8
        batch_shape = (num_images, 299, 299, 3)
        images, labels = load_images(
            input_dir, metadata_file_path, batch_shape)
        num_classes = 1001

        tf.logging.set_verbosity(tf.logging.INFO)
        with tf.Graph().as_default():
            # Prepare graph
            x_input = tf.placeholder(tf.float32, shape=(1,) + batch_shape[1:])
            y_label = tf.placeholder(tf.int32, shape=(1,))
            model = InceptionModel(num_classes)

            attack = SPSA(model)
            x_adv = attack.generate(
                x_input, y=y_label, epsilon=epsilon, num_steps=10,
                early_stop_loss_threshold=-1., batch_size=32, spsa_iters=1,
                is_debug=True)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            session_creator = tf.train.ChiefSessionCreator(
                scaffold=tf.train.Scaffold(saver=saver),
                checkpoint_filename_with_path=FLAGS.checkpoint_path,
                master=FLAGS.master)

            with tf.train.MonitoredSession(
                    session_creator=session_creator) as sess:
                for i in range(num_images):
                    adv_image = sess.run(x_adv, feed_dict={
                        x_input: np.expand_dims(images[i], axis=0),
                        y_label: np.expand_dims(labels[i], axis=0),
                    })
                    diff = adv_image - images[i]
                    assert np.max(np.abs(diff)) < epsilon + 1e-4
                    assert np.max(adv_image) < 1. + 1e-4
                    assert np.min(adv_image) > -1e-4
Example #7
def iterate_through_cwl2_attacks():
    tf.logging.set_verbosity(tf.logging.INFO)
    input_dir = FLAGS.input_image_dir
    metadata_file_path = FLAGS.metadata_file_path
    num_images = len(os.listdir(input_dir))
    batch_shape = (num_images, 299, 299, 3)
    num_classes = 1001
    batch_size = attack_name_to_params[ATTACKS.CARLINI_WAGNER]['batch_size']
    images, labels, target_classes = load_images(input_dir, metadata_file_path, batch_shape,
                                                 num_classes)

    list_param_dict = expand_param_dict(
        attack_name_to_params[ATTACKS.CARLINI_WAGNER],
        attack_name_to_configurable_params[ATTACKS.CARLINI_WAGNER]
    )

    save_dir = 'saves'
    os.makedirs(save_dir, exist_ok=True)

    for idx, params in enumerate(list_param_dict):
        tf.reset_default_graph()

        logger.info('Running attack with parameters: {}'.format(params))
        logger.info('Current index of parameters: {}/{}'.format(idx, len(list_param_dict)))

        # Get save path
        adv_imgs_save_path = get_attack_images_filename_prefix(
            attack_name=ATTACKS.CARLINI_WAGNER,
            params=params,
            model='inception',
            targeted_prefix='targeted'
        )
        adv_imgs_save_path = os.path.join(save_dir, adv_imgs_save_path)

        # Run inference
        graph = tf.Graph()
        with graph.as_default():
            sess = tf.Session(graph=graph)
            # Prepare graph
            x_input = tf.placeholder(tf.float32, shape=(batch_size,) + batch_shape[1:])
            y_label = tf.placeholder(tf.int32, shape=(batch_size, num_classes))
            y_target = tf.placeholder(tf.int32, shape=(batch_size, num_classes))
            model = InceptionModel(num_classes)

            cwl2 = True
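            # Hard-coded toggle: with cwl2 = True the SPSA branch below is
            # never exercised.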
            if cwl2:
                attack = CarliniWagnerL2(model=model, sess=sess)
                x_adv = attack.generate(x_input, y_target=y_target, **params)
            else:
                attack = SPSA(model=model)
                x_adv = attack.generate(x_input, y_target=y_label, epsilon=4. / 255, num_steps=30,
                                        early_stop_loss_threshold=-1., batch_size=32, spsa_iters=16,
                                        is_debug=True)

            logits = model.get_logits(x_input)
            acc = _top_k_accuracy(logits, tf.argmax(y_label, axis=1), k=1)
            success_rate = _top_k_accuracy(logits, tf.argmax(y_target, axis=1), k=1)

            # Run computation
            saver = tf.train.Saver(slim.get_model_variables())
            saver.restore(sess, save_path=FLAGS.checkpoint_path)

            list_adv_images = []

            if num_images % batch_size == 0:
                num_batches = int(num_images / batch_size)
            else:
                num_batches = int(num_images / batch_size + 1)
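            # (Equivalent ceiling division: -(-num_images // batch_size).)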

            for i in tqdm.tqdm(range(num_batches)):
                feed_dict_i = {x_input: images[i * batch_size:(i + 1) * batch_size],
                               y_target: target_classes[i * batch_size:(i + 1) * batch_size]}
                adv_img = sess.run(x_adv, feed_dict=feed_dict_i)
                list_adv_images.append(adv_img)

            adv_images = np.concatenate(list_adv_images)
            np.save(adv_imgs_save_path, adv_images)

            acc_store = []
            succ_store = []
            for i in tqdm.tqdm(range(num_batches)):
                feed_dict_i = {x_input: adv_images[i * batch_size:(i + 1) * batch_size],
                               y_target: target_classes[i * batch_size:(i + 1) * batch_size],
                               y_label: labels[i * batch_size:(i + 1) * batch_size]}
                succ_batch, acc_batch = sess.run([success_rate, acc],
                                                 feed_dict=feed_dict_i)
                acc_store.extend(acc_batch)
                succ_store.extend(succ_batch)

            logger.info('Accuracy is: {:.4f}'.format(np.mean(acc_store)))
            logger.info('Success Rate is: {:.4f}'.format(np.mean(succ_store)))
Example #8
def run(args, restrict=True):
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess
        id = int(multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(id - 1)

    # Load Parameters
    dataset = args[0]
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])

    fname = dataset + "/" + str(epsilon) + "_" + mode + "_" + str(K)

    # Configure Keras/Tensorflow
    Keras.clear_session()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix Random Seeds
    np.random.seed(1)
    # Having this before Keras.clear_session() causes it to hang for some reason
    tf.set_random_seed(1)

    # Load Model/Data and setup SPSA placeholders
    N = 50
    if dataset == "MNIST":
        # Base Model
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        # SPSA
        shape_spsa = (1, 28, 28, 1)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    elif dataset == "CIFAR":
        # Base Model
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # SPSA
        shape_spsa = (1, 32, 32, 3)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples from the training set
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))

    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    #print("Bounds ", np.max(np.abs(x_train)))
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model
    model_defended = DefendedModel(base_model, x_train, y_train, K)
    defended_logits = model_defended.get_logits(x)

    # Configure the attack
    attack = SPSA(model_defended, back="tf", sess=sess)
    with tf.name_scope("Attack") as scope:
        gen = attack.generate(x_spsa,
                              y=y_spsa,
                              epsilon=epsilon,
                              is_targeted=False,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-5.0)
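        # Note: batch_size here is the number of SPSA gradient-estimation
        # samples per step (renamed spsa_samples in later cleverhans releases).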

    # Run the attack
    f = open(fname + ".txt", "w")

    sample = np.random.choice(data.test_data.shape[0], N, replace=False)
    x_sample = data.test_data[sample]
    y_sample = np.argmax(data.test_labels[sample], axis=1)

    logits_nat = sess.run(defended_logits, {x: x_sample})
    f.write("Accuracy on Natural Images: " +
            str(np.mean(np.argmax(logits_nat, axis=1) == y_sample)) + "\n")

    pred_adv = -1.0 * np.ones((N))
    for i in range(N):
        x_real = x_sample[i].reshape(shape_spsa)
        x_adv = sess.run(gen, {x_spsa: x_real, y_spsa: y_sample[i]})
        pred_adv[i] = np.argmax(sess.run(defended_logits, {x: x_adv}))

    f.write("Accuracy on Adversarial Images: " +
            str(np.mean(pred_adv == y_sample)))
    f.close()
Example #9
def run(args, restrict=True):
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess
        id = int(multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(id - 1)

    # Load Parameters
    dataset = args[0]
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])
    bias = float(args[4])

    fname = dataset + "/" + str(epsilon) + "_" + mode + "_" + str(
        K) + "_" + str(bias)

    # Configure Keras/Tensorflow
    Keras.clear_session()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix Random Seeds
    np.random.seed(1)
    # Having this before Keras.clear_session() causes it to hang for some reason
    tf.set_random_seed(1)

    # Load Model/Data and setup SPSA placeholders
    N = 1000
    if dataset == "MNIST":
        # Base Model
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        # SPSA
        shape_spsa = (1, 28, 28, 1)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    elif dataset == "CIFAR":
        # Base Model
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # SPSA
        shape_spsa = (1, 32, 32, 3)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples from the training set
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))

    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    #print("Bounds ", np.max(np.abs(x_train)))
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model
    model_defended = DefendedModel(base_model, x_train, y_train, K, bias=bias)
    defended_logits = model_defended.get_logits(x)

    # Get the predictions on the original images
    labels = np.argmax(data.test_labels[:N], axis=1)
    logits_real = sess.run(defended_logits, {x: data.test_data[:N]})
    fp = (np.argmax(logits_real, axis=1) == 10)  # False positives of the defense
    pred_undefended = np.argmax(np.delete(logits_real, -1, axis=1),
                                axis=1)  # Original model prediction

    # Configure the attack
    attack = SPSA(model_defended, back="tf", sess=sess)
    with tf.name_scope("Attack") as scope:
        gen = attack.generate(x_spsa,
                              y_target=y_spsa,
                              epsilon=epsilon,
                              is_targeted=True,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-5.0)

    # Run the attack
    pred_adv = -1.0 * np.ones((N, 10))
    for i in range(N):
        if i % 10 == 0:
            print(fname, " ", i)
            out = {}
            out["FP"] = fp
            out["Labels"] = labels
            out["UndefendedPrediction"] = pred_undefended
            out["AdversarialPredictions"] = pred_adv
            file = open(fname, "wb")
            pickle.dump(out, file)
            file.close()

        x_real = data.test_data[i].reshape(shape_spsa)

        # Try a targeted attack for each class other than the original network prediction and the adversarial class
        for y in range(10):
            if y != pred_undefended[i]:
                x_adv = sess.run(gen, {x_spsa: x_real, y_spsa: y})
                pred_adv[i,
                         y] = np.argmax(sess.run(defended_logits, {x: x_adv}))

    out = {}
    out["FP"] = fp
    out["Labels"] = labels
    out["UndefendedPrediction"] = pred_undefended
    out["AdversarialPredictions"] = pred_adv
    file = open(fname, "wb")
    pickle.dump(out, file)
    file.close()

    analysis(fname)
Example #10
spsa_params = {
    'eps': float(sys.argv[1]),
    'learning_rate': 0.01,
    'delta': 0.01,
    'spsa_samples': 128,
    'spsa_iters': 1,
    'nb_iter': 100,
    'clip_min': 0.,
    'clip_max': 1.
}

spsa_attack = SPSA(wrap_source, sess=sess)
x = tf.placeholder(dtype=tf.float32, shape=(None, 32, 32, 3))
y = tf.placeholder(dtype=tf.float32, shape=(None, 10))
x_adv = spsa_attack.generate(x, y, **spsa_params)
X_adv_source = np.zeros((len(indices_test), 32, 32, 3))
for i in range(0, len(indices_test)):
    X_adv_source[i] = sess.run(x_adv,
                               feed_dict={
                                   x: X_test[indices_test[i:(i + 1)]],
                                   y: Y_test[indices_test[i:(i + 1)]]
                               })

print("metrics source model")
print(metrics(model_source, X_adv_source, X_test, pred_source, indices_test))
print("metrics base model")
print(metrics(model, X_adv_source, X_test, pred_base, indices_test))

pred_source_adv = np.argmax(model_source.predict(X_adv_source), axis=1)
pred_adv_basefromsource = np.argmax(model.predict(X_adv_source), axis=1)
Example #11
def craft_one_type(sess,
                   model,
                   X,
                   Y,
                   dataset,
                   attack,
                   batch_size,
                   log_path=None,
                   fp_path=None,
                   model_logits=None):
    """
    TODO
    :param sess:
    :param model:
    :param X:
    :param Y:
    :param dataset:
    :param attack:
    :param batch_size:
    :return:
    """
    print("entered")
    if log_path is not None:
        PATH_DATA = log_path

    if attack == 'fgsm':
        # FGSM attack
        print('Crafting fgsm adversarial samples...')
        X_adv = fast_gradient_sign_method(sess,
                                          model,
                                          X,
                                          Y,
                                          eps=ATTACK_PARAMS[dataset]['eps'],
                                          clip_min=CLIP_MIN,
                                          clip_max=CLIP_MAX,
                                          batch_size=batch_size)
    elif attack == 'adapt-fgsm':
        # Adaptive FGSM attack
        print('Crafting fgsm adversarial samples...')

        X_adv = adaptive_fast_gradient_sign_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size,
            log_dir=fp_path,
            model_logits=model_logits,
            dataset=dataset)
    elif attack == 'adapt-bim-b':
        # BIM attack
        print('Crafting %s adversarial samples...' % attack)
        X_adv = adaptive_basic_iterative_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            eps_iter=ATTACK_PARAMS[dataset]['eps_iter'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size,
            log_dir=fp_path,
            model_logits=model_logits,
            dataset=dataset)
    elif attack in ['bim-a', 'bim-b']:
        # BIM attack
        print('Crafting %s adversarial samples...' % attack)
        its, results = basic_iterative_method(
            sess,
            model,
            X,
            Y,
            eps=ATTACK_PARAMS[dataset]['eps'],
            eps_iter=ATTACK_PARAMS[dataset]['eps_iter'],
            clip_min=CLIP_MIN,
            clip_max=CLIP_MAX,
            batch_size=batch_size)
        if attack == 'bim-a':
            # BIM-A
            # For each sample, select the time step where that sample first
            # became misclassified
            X_adv = np.asarray([results[its[i], i] for i in range(len(Y))])
        else:
            # BIM-B
            # For each sample, select the very last time step
            X_adv = results[-1]
    elif attack == 'jsma':
        # JSMA attack
        print('Crafting jsma adversarial samples. This may take > 5 hours')
        X_adv = saliency_map_method(sess,
                                    model,
                                    X,
                                    Y,
                                    theta=1,
                                    gamma=0.1,
                                    clip_min=CLIP_MIN,
                                    clip_max=CLIP_MAX)
    elif attack == 'cw-l2':
        # C&W attack
        print(
            'Crafting %s examples. This takes > 5 hours due to internal grid search'
            % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']
        cw_attack = CarliniL2(sess,
                              model,
                              image_size,
                              num_channels,
                              num_labels,
                              batch_size=batch_size)
        X_adv = cw_attack.attack(X, Y)
    elif attack == 'cw-fp':
        # C&W attack to break LID detector
        print(
            'Crafting %s examples. This takes > 5 hours due to internal grid search'
            % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']
        cw_attack = CarliniFP_2vars(sess,
                                    model,
                                    image_size,
                                    num_channels,
                                    num_labels,
                                    batch_size=batch_size,
                                    fp_dir=fp_path)
        X_adv = cw_attack.attack(X, Y)

    elif attack == 'spsa':
        binary_steps = 1
        batch_shape = X.shape
        X_input = tf.placeholder(tf.float32, shape=(1, ) + batch_shape[1:])
        Y_label = tf.placeholder(tf.int32, shape=(1, ))
        alpha = tf.placeholder(tf.float32, shape=(1, ))

        num_samples = np.shape(X)[0]
        # X = (X - np.argmin(X))/(np.argmax(X)-np.argmin(X))
        _min = np.min(X)
        _max = np.max(X)
        print(_max, _min)
        print(tf.trainable_variables())
        filters = sess.run('conv1/kernel:0')
        biases = 0.0 * sess.run('conv1/bias:0')
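        # Build a copy of conv1 (same filters, zero bias) to measure how a
        # constant -0.5 input shift propagates through the first layer; those
        # per-channel offsets are folded into conv1's biases below so that
        # shifting X by +0.5 leaves the network's outputs unchanged.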
        shift_model = Sequential()
        if (dataset == 'mnist'):
            shift_model.add(
                Conv2D(32,
                       kernel_size=(3, 3),
                       activation=None,
                       input_shape=(1, 28, 28)))
        else:
            shift_model.add(
                Conv2D(32,
                       kernel_size=(3, 3),
                       activation=None,
                       input_shape=(3, 32, 32)))

        X_input_2 = tf.placeholder(tf.float32,
                                   shape=(None, ) + batch_shape[1:])

        correction_term = shift_model(X_input_2)
        # We will shift the image up by 0.5, so this is the correction
        if (dataset == 'mnist'):
            X_correction = -0.5 * np.ones((1, 1, 28, 28))
        else:
            X_correction = -0.5 * np.ones((1, 3, 32, 32))

        # for PGD

        shift_model.layers[0].set_weights([filters, biases])
        bias_correction_terms = (sess.run(correction_term,
                                          feed_dict={X_input_2: X_correction}))
        for i in range(32):
            biases[i] = bias_correction_terms[0, i, 0, 0]
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
        original_biases = model.layers[0].get_weights()[1]
        original_weights = model.layers[0].get_weights()[0]
        model.layers[0].set_weights(
            [original_weights, original_biases + biases])
        #Correct model for input shift

        X = X + 0.5  #shift input to make it >=0
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
        # check accuracy post correction of input and model
        print('Crafting %s examples. Using Cleverhans' % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']

        from cleverhans.utils_keras import KerasModelWrapper
        wrapped_model = KerasModelWrapper(model)

        if dataset == "mnist":
            wrapped_model.nb_classes = 10
        elif dataset == "cifar":
            wrapped_model.nb_classes = 10
        else:
            wrapped_model.nb_classes = 10

        real_batch_size = X.shape[0]
        X_adv = None

        spsa = SPSA(wrapped_model, back='tf', sess=sess)
        spsa_params = {
            "epsilon": ATTACK_PARAMS[dataset]['eps'],
            'num_steps': 100,
            'spsa_iters': 1,
            'early_stop_loss_threshold': None,
            'is_targeted': False,
            'is_debug': False
        }
        X_adv_spsa = spsa.generate(X_input,
                                   alpha=alpha,
                                   y=Y_label,
                                   fp_path=fp_path,
                                   **spsa_params)

        for i in range(num_samples):

            # rescale to format TF wants

            #X_i_norm = (X[i] - _min)/(_max-_min)

            X_i_norm = X[i]
            # Run attack
            best_res = None
            ALPHA = np.ones(1) * 0.1
            lb = 1.0e-2
            ub = 1.0e2
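            # Bisection over alpha, the weight of the adaptive loss term:
            # raise lb while the attack still fools the model, otherwise
            # lower ub, and retry with the midpoint.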
            for j in range(binary_steps):
                res = sess.run(X_adv_spsa,
                               feed_dict={
                                   X_input: np.expand_dims(X_i_norm, axis=0),
                                   Y_label: np.array([np.argmax(Y[i])]),
                                   alpha: ALPHA
                               })
                if (dataset == 'mnist'):
                    X_place = tf.placeholder(tf.float32, shape=[1, 1, 28, 28])
                else:
                    X_place = tf.placeholder(tf.float32, shape=[1, 3, 32, 32])
                pred = model(X_place)
                model_op = sess.run(pred, feed_dict={X_place: res})

                if (not np.argmax(model_op) == np.argmax(Y[i, :])):
                    lb = ALPHA[0]
                else:
                    ub = ALPHA[0]
                ALPHA[0] = 0.5 * (lb + ub)
                print(ALPHA)
                if (best_res is None):
                    best_res = res
                else:
                    if (not np.argmax(model_op) == np.argmax(Y[i, :])):
                        best_res = res
                        pass

            # Rescale result back to our scale

            if (i == 0):
                X_adv = best_res
            else:
                X_adv = np.concatenate((X_adv, best_res), axis=0)

        _, acc = model.evaluate(X_adv, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the adversarial test set: %0.2f%%" %
              (100.0 * acc))
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))

        #Revert model to original
        model.layers[0].set_weights([original_weights, original_biases])
        #Revert adv shift
        X_adv = X_adv - 0.5
        X = X - 0.5  #Not used but just for logging purposes
    elif attack == 'adapt-pgd':
        binary_steps = 1
        rand_starts = 2
        batch_shape = X.shape
        X_input = tf.placeholder(tf.float32, shape=(1, ) + batch_shape[1:])
        Y_label = tf.placeholder(tf.int32, shape=(1, ))
        alpha = tf.placeholder(tf.float32, shape=(1, ))

        num_samples = np.shape(X)[0]
        # X = (X - np.argmin(X))/(np.argmax(X)-np.argmin(X))
        _min = np.min(X)
        _max = np.max(X)
        print(_max, _min)
        print(tf.trainable_variables())
        filters = sess.run('conv1/kernel:0')
        biases = 0.0 * sess.run('conv1/bias:0')
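        # Same first-layer bias-correction trick as in the 'spsa' branch above.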
        shift_model = Sequential()
        if (dataset == 'mnist'):
            shift_model.add(
                Conv2D(32,
                       kernel_size=(3, 3),
                       activation=None,
                       input_shape=(1, 28, 28)))
        else:
            shift_model.add(
                Conv2D(32,
                       kernel_size=(3, 3),
                       activation=None,
                       input_shape=(3, 32, 32)))

        X_input_2 = tf.placeholder(tf.float32,
                                   shape=(None, ) + batch_shape[1:])

        correction_term = shift_model(X_input_2)
        # We will shift the image up by 0.5, so this is the correction
        if (dataset == 'mnist'):
            X_correction = -0.5 * np.ones((1, 1, 28, 28))
        else:
            X_correction = -0.5 * np.ones((1, 3, 32, 32))

        # for PGD

        shift_model.layers[0].set_weights([filters, biases])
        bias_correction_terms = (sess.run(correction_term,
                                          feed_dict={X_input_2: X_correction}))
        for i in range(32):
            biases[i] = bias_correction_terms[0, i, 0, 0]
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
        original_biases = model.layers[0].get_weights()[1]
        original_weights = model.layers[0].get_weights()[0]
        model.layers[0].set_weights(
            [original_weights, original_biases + biases])
        #Correct model for input shift

        X = X + 0.5  #shift input to make it >=0

        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
        # check accuracy post correction of input and model
        print('Crafting %s examples. Using Cleverhans' % attack)
        image_size = ATTACK_PARAMS[dataset]['image_size']
        num_channels = ATTACK_PARAMS[dataset]['num_channels']
        num_labels = ATTACK_PARAMS[dataset]['num_labels']

        from cleverhans.utils_keras import KerasModelWrapper
        wrapped_model = KerasModelWrapper(model)

        if dataset == "mnist":
            wrapped_model.nb_classes = 10
        elif dataset == "cifar":
            wrapped_model.nb_classes = 10
        else:
            wrapped_model.nb_classes = 10

        real_batch_size = X.shape[0]
        X_adv = None

        pgd = MadryEtAl(wrapped_model, back='tf', sess=sess)
        X_adv_pgd, adv_loss_fp = pgd.generate(X_input,
                                              eps=0.3,
                                              eps_iter=0.02,
                                              clip_min=0.0,
                                              clip_max=1.0,
                                              nb_iter=20,
                                              rand_init=True,
                                              fp_path=fp_path,
                                              alpha=alpha)

        for i in range(num_samples):
            # rescale to format TF wants

            #X_i_norm = (X[i] - _min)/(_max-_min)

            X_i_norm = X[i]
            # Run attack
            best_res = None
            best_res_loss = 1000000.0
            ALPHA = np.ones(1) * 0.1
            lb = 1.0e-2
            ub = 1.0e2
            for j in range(binary_steps):
                bin_flag = 0
                for jj in range(rand_starts):

                    [res, res_loss] = sess.run(
                        [X_adv_pgd, adv_loss_fp],
                        feed_dict={
                            X_input: np.expand_dims(X[i], axis=0),
                            Y_label: np.array([np.argmax(Y[i])]),
                            alpha: ALPHA
                        })

                    if (dataset == 'mnist'):
                        X_place = tf.placeholder(tf.float32,
                                                 shape=[1, 1, 28, 28])
                    else:
                        X_place = tf.placeholder(tf.float32,
                                                 shape=[1, 3, 32, 32])

                    pred = model(X_place)
                    model_op = sess.run(pred, feed_dict={X_place: res})

                    if (best_res is None):
                        best_res = res
                    else:
                        if ((not np.argmax(model_op) == np.argmax(Y[i, :]))
                                and res_loss < best_res_loss):
                            best_res = res
                            best_res_loss = res_loss
                            bin_flag = 1
                            pass
                if (bin_flag == 1):
                    lb = ALPHA[0]
                else:
                    ub = ALPHA[0]
                ALPHA[0] = 0.5 * (lb + ub)
                print(ALPHA)
            # Rescale result back to our scale

            if (i == 0):
                X_adv = best_res
            else:
                X_adv = np.concatenate((X_adv, best_res), axis=0)

        _, acc = model.evaluate(X_adv, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the adversarial test set: %0.2f%%" %
              (100.0 * acc))
        _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
        print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))

        #Revert model to original
        model.layers[0].set_weights([original_weights, original_biases])
        #Revert adv shift
        X_adv = X_adv - 0.5
        X = X - 0.5  #Not used but just for logging purposes

        pass

    if ("adapt" in attack or "fp" in attack or "spsa" in attack):
        [m, _, _, _] = (np.shape(X_adv))
        cropped_X_adv = []
        cropped_Y = []
        cropped_X = []
        if (dataset == 'mnist'):
            X_place = tf.placeholder(tf.float32, shape=[1, 1, 28, 28])
            pred = model(X_place)
        else:
            X_place = tf.placeholder(tf.float32, shape=[1, 3, 32, 32])
            pred = model(X_place)
        for i in range(m):
            logits_op = sess.run(pred,
                                 feed_dict={X_place: X_adv[i:i + 1, :, :, :]})
            if (not np.argmax(logits_op) == np.argmax(Y[i, :])):
                cropped_Y.append(Y[i, :])
                cropped_X_adv.append(X_adv[i, :, :, :])
                cropped_X.append(X[i, :, :, :])
        X_adv = np.array(cropped_X_adv)
        X = np.array(cropped_X)
        Y = np.array(cropped_Y)

        f = open(
            os.path.join(log_path, 'Random_Test_%s_%s.p' % (dataset, attack)),
            'wb')

        pickle.dump({"adv_input": X, "adv_labels": Y}, f)
        f.close()

    #np.save(os.path.join(PATH_DATA, 'Adv_%s_%s.npy' % (dataset, attack)), X_adv)
    f = open(os.path.join(log_path, 'Adv_%s_%s.p' % (dataset, attack)), 'wb')

    pickle.dump({"adv_input": X_adv, "adv_labels": Y}, f)
    f.close()
    _, acc = model.evaluate(X, Y, batch_size=batch_size, verbose=0)
    print("Model accuracy on the test set: %0.2f%%" % (100.0 * acc))
    l2_diff = np.linalg.norm(
        X_adv.reshape((len(X), -1)) - X.reshape((len(X), -1)),
        axis=1).mean()
    print("Average L-2 perturbation size of the %s attack: %0.2f" %
          (attack, l2_diff))
    if (("adapt" in attack) or ("cw-fp" in attack)):
        return (X, X_adv, Y)
    else:
        print(Y.shape)
        return (X_adv, Y)
Example #12
def run(args, restrict=True):
    if restrict:
        # Restrict the visible GPUs to the one for this subprocess
        id = int(multiprocessing.current_process().name.split("-")[1])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(id - 1)

    # Load Parameters
    dataset = args[0]
    epsilon = float(args[1])
    mode = args[2]
    K = int(args[3])

    fname = dataset + "/" + str(epsilon) + "_" + mode + "_" + str(K)

    # Configure Keras/Tensorflow
    Keras.clear_session()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    sess = Keras.get_session()
    Keras.set_learning_phase(False)

    # Fix Random Seeds
    np.random.seed(1)
    # Having this before Keras.clear_session() causes it to hang for some reason
    tf.set_random_seed(1)

    # Load Model/Data and setup SPSA placeholders
    N = 500
    if dataset == "MNIST":
        # Base Model
        base_model = MNISTModel("../1-Models/MNIST")
        data = MNIST()
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
        # SPSA
        shape_spsa = (1, 28, 28, 1)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    elif dataset == "CIFAR":
        # Base Model
        base_model = CIFARModel("../1-Models/CIFAR")
        data = CIFAR()
        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        # SPSA
        shape_spsa = (1, 32, 32, 3)
        x_spsa = tf.placeholder(tf.float32, shape=shape_spsa)
    y_spsa = tf.placeholder(tf.int32)

    # Load the hidden representations of the real and adversarial examples from the training set
    x_train_real = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_" + mode + ".npy"))
    x_train_adv = np.squeeze(
        np.load("../3-Representation/" + dataset + "/train_adv_" + mode +
                ".npy"))

    n_train = x_train_real.shape[0]
    n_train_adv = x_train_adv.shape[0]
    x_train = np.float32(np.vstack((x_train_real, x_train_adv)))
    y_train = np.float32(
        np.hstack((-1.0 * np.ones(n_train), np.ones(n_train_adv))))

    # Create the defended model
    defense = DefendedModel(base_model, x_train, y_train, K)
    get_votes = defense.get_votes(x)  # Should this be get_votes, introducing a separate method?
    get_logits = defense.get_logits(x)

    # Configure the attack
    attack = SPSA(defense, back="tf", sess=sess)
    with tf.name_scope("Attack") as scope:
        gen = attack.generate(x_spsa,
                              y=y_spsa,
                              epsilon=0.01,
                              is_targeted=False,
                              num_steps=100,
                              batch_size=2048,
                              early_stop_loss_threshold=-0.05)
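    # Note: `gen` is built but never run below; the projection step that
    # would use it is commented out inside the loop.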

    # Run the test
    sample = np.random.choice(data.test_data.shape[0], N, replace=False)
    x_sample = data.test_data[sample]
    y_sample = np.argmax(data.test_labels[sample], axis=1)

    votes = sess.run(get_votes, {x: x_sample})

    count = 0
    bound = 0
    correct = 0
    for i in range(N):
        if votes[i, 0] > 0:
            count += 1
            # Project via an adversarial attack on the votes
            #x_real = x_sample[i].reshape(shape_spsa)
            #x_adv = sess.run(gen, {x_spsa: x_real, y_spsa: 0}) #TODO: not adv, is projected
            x_proj = sess.run(get_logits, {x: x_sample[i:i + 1]})
            projection_labels = np.argmax(x_proj, axis=1)
            successful_projections = projection_labels[np.nonzero(
                projection_labels * (projection_labels != 10))]

            # Check if the projection was a success
            if successful_projections.shape[0] != 0:
                bound += 1

            # Check if the projection is predicted correctly
            if y_sample[i] == np.argmax(sess.run(get_logits, {x: x_proj}),
                                        axis=1)[0]:
                correct += 1

    print("FP Count: ", count)
    print("FP Recovery in Bounds: ", bound / count)
    print("FP Recovery Accuracy: ", correct / count)
def eval_robustness(ARGS, verbose=True):
    #############################################
    # Load pre-trained model
    #############################################

    if verbose:
        print('\n- Loading pre-trained model...')

    # Build evaluation graph
    eval_graph = tf.Graph()
    config = tf.ConfigProto(log_device_placement=False,
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    sess = tf.Session(graph=eval_graph, config=config)

    # Define input TF placeholder
    with eval_graph.as_default():
        with tf.device('/gpu:0'):
            # Define placeholders
            with tf.name_scope('Placeholders'):
                x = tf.placeholder(dtype=tf.float32,
                                   shape=input_shape,
                                   name='inputs')
                y = tf.placeholder(dtype=tf.float32,
                                   shape=(None, n_classes),
                                   name='labels')
                is_training = tf.placeholder_with_default(False,
                                                          shape=(),
                                                          name='is-training')

            # Define model
            with tf.name_scope('Model'):
                model = Model(nb_classes=n_classes,
                              input_shape=input_shape,
                              is_training=is_training)

            # Define forward-pass
            with tf.name_scope('Logits'):
                logits = model.get_logits(x)
            with tf.name_scope('Probs'):
                preds = tf.nn.softmax(logits)

            # Restore the pre-trained model
            with sess.as_default():
                saver = tf.train.Saver()
                saver.restore(sess, ARGS.restore_path + '/model.ckpt')

            # Define accuracy ops
            with tf.name_scope('Accuracy'):
                ground_truth = tf.argmax(y, axis=1)
                predicted_label = tf.argmax(preds, axis=1)
                correct_prediction = tf.equal(predicted_label, ground_truth)
                clean_acc = tf.reduce_mean(tf.to_float(correct_prediction),
                                           name='accuracy')

            # Define PGD adversary
            if ARGS.attack == 'PGD':
                if verbose:
                    print('\n- Building {:s} attack graph...'.format(
                        ARGS.attack))

                with tf.name_scope('PGD-Attacker'):
                    pgd_params = {
                        'ord': np.inf,
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'eps_iter': ARGS.eps_iter / 255,
                        'nb_iter': ARGS.nb_iter,
                        'rand_init': ARGS.rand_init,
                        'rand_minmax': ARGS.eps / 255,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'sanity_checks': True
                    }

                    pgd = ProjectedGradientDescent(model, sess=None)
                    adv_x = pgd.generate(x, **pgd_params)

            # Define SPSA adversary
            elif ARGS.attack == 'SPSA':
                if verbose:
                    print('\n- Building {:s} attack graph...'.format(
                        ARGS.attack))

                with tf.name_scope('SPSA-Attacker'):
                    spsa_params = {
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'nb_iter': ARGS.nb_iter,
                        'spsa_samples': ARGS.spsa_samples,
                        'spsa_iters': ARGS.spsa_iters,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'learning_rate': ARGS.spsa_lr,
                        'delta': ARGS.spsa_delta
                    }

                    spsa = SPSA(model, sess=sess)
                    adv_x = spsa.generate(x, **spsa_params)
            else:
                raise NotImplementedError

            with tf.name_scope('Logits'):
                adv_logits = model.get_logits(adv_x)
            with tf.name_scope('Probs'):
                adv_preds = tf.nn.softmax(adv_logits)

            adv_loss = tf.nn.softmax_cross_entropy_with_logits(
                logits=adv_logits, labels=y)
            adv_predicted_label = tf.argmax(adv_preds, axis=1)
            correct_prediction = tf.equal(adv_predicted_label, ground_truth)
            adv_accuracy = tf.reduce_mean(tf.to_float(correct_prediction),
                                          name='adv-accuracy')
            is_adv_example = tf.not_equal(ground_truth, adv_predicted_label)

    #############################################
    # Run evaluation
    #############################################

    if verbose:
        print('\n- Running robustness evaluation against {:s} attacker...\n'.
              format(ARGS.attack))

    if ARGS.attack == 'PGD':
        clean, adv_mean, adv_worstcase = run_pgd_eval(x,
                                                      y,
                                                      is_training,
                                                      sess,
                                                      adv_testloader,
                                                      clean_acc,
                                                      adv_accuracy,
                                                      adv_loss,
                                                      is_adv_example,
                                                      ARGS,
                                                      save_loss_dist=False,
                                                      verbose=verbose)

    elif ARGS.attack == 'SPSA':
        clean, adv_mean = run_spsa_eval(x,
                                        y,
                                        is_training,
                                        sess,
                                        adv_testloader,
                                        clean_acc,
                                        adv_accuracy,
                                        adv_loss,
                                        is_adv_example,
                                        ARGS,
                                        save_loss_dist=False,
                                        verbose=verbose)
        adv_worstcase = adv_mean
    else:
        raise NotImplementedError

    return clean, adv_mean, adv_worstcase
Example #14
def eval(sess,
         model_name,
         X_train,
         Y_train,
         X_test,
         Y_test,
         cnn=False,
         rbf=False):
    """ Load model saved in model_name.json and model_name_weights.h5 and 
    evaluate its accuracy on legitimate test samples and adversarial samples.
    Use cnn=True if the model is CNN based.
    """

    # load saved model
    print("Load model ... ")
    '''
    json = open('models/{}.json'.format(model_name), 'r')
    model = json.read()
    json.close()
    loaded_model = model_from_json(model)
    loaded_model.load_weights("models/{}_weights.h5".format(model_name))
    '''
    if rbf:
        loaded_model = load_model("rbfmodels/{}.h5".format(model_name),
                                  custom_objects={'RBFLayer': RBFLayer})
    else:
        loaded_model = load_model("models/{}.h5".format(model_name))

    # Set placeholders
    if cnn:
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    else:
        x = tf.placeholder(tf.float32, shape=(None, 784))

    y = tf.placeholder(tf.float32, shape=(None, 10))

    predictions = loaded_model(x)

    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test,
                          Y_test,
                          args={"batch_size": 128})
    print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    # Using functions from /cleverhans/attacks_tf.py
    # Will be deprecated next year
    # adv_x = fgsm(x, predictions, eps=0.3)
    # X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], batch_size=128)

    # Using functions from /cleverhans/attacks.py (as specified by creators)

    wrap = KerasModelWrapper(loaded_model)
    spsa = SPSA(wrap, sess=sess)

    images = 100

    correctImages = 0
    adv_pred = np.zeros((images, 10))

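    # Note: spsa.generate() is called inside the loop below, so a new attack
    # graph is built for every image; constructing it once with placeholders
    # (as in the earlier examples) would be considerably faster.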
    for i in range(images):
        tensorpls = X_test[i].reshape(1, 784)
        tensorpls2 = Y_test[i].reshape(1, 10)

        x_in = tf.convert_to_tensor(tensorpls, tf.float32)
        y_in = tf.convert_to_tensor(tensorpls2, tf.float32)

        adv_x = spsa.generate(x_in,
                              y_in,
                              eps=0.3,
                              nb_iter=100,
                              clip_min=0,
                              clip_max=1,
                              early_stop_loss_threshold=-1.,
                              spsa_samples=32,
                              spsa_iters=1)
        adv_x = tf.stop_gradient(adv_x)

        test2 = adv_x.eval(session=sess)
        test3 = test2.reshape(28, 28)
        plt.imshow(test3)
        plt.colorbar()
        plt.show()

        print(type(test2))
        print(test2.shape)

        preds_adv = loaded_model(adv_x)

        test = preds_adv.eval(session=sess)

        for j in range(10):
            adv_pred[i][j] = test[0][j]

        if np.argmax(adv_pred[i]) == np.argmax(Y_test[i]):
            correctImages = correctImages + 1

        accuracy = correctImages / (i + 1)
        print('Test accuracy (' + str(i + 1) + '): ' + str(accuracy))

    # Evaluate the accuracy of the MNIST model on adversarial examples
    #accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128 })

    accuracy = correctImages / images
    print('Test accuracy on adversarial test examples: ' + str(accuracy))
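Because spsa.generate() above is re-invoked on a fresh constant tensor for every image, a new attack graph is built on each loop iteration. Below is a minimal alternative sketch, under the same assumptions as eval() (the spsa attack object, sess, and MNIST-shaped X_test/Y_test); x_ph, y_ph, and adv_example are illustrative names, not from the original.

# Hedged sketch: build the SPSA op once, then feed one image at a time.
x_ph = tf.placeholder(tf.float32, shape=(1, 784))
y_ph = tf.placeholder(tf.float32, shape=(1, 10))
adv_x_op = spsa.generate(x_ph,
                         y=y_ph,
                         eps=0.3,
                         nb_iter=100,
                         clip_min=0,
                         clip_max=1,
                         early_stop_loss_threshold=-1.,
                         spsa_samples=32,
                         spsa_iters=1)
adv_example = sess.run(adv_x_op,
                       feed_dict={x_ph: X_test[0].reshape(1, 784),
                                  y_ph: Y_test[0].reshape(1, 10)})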
Example #15
spsa_op = SPSA(cleverhans_model, sess=sess)
spsa_params = {'eps': 2.5,
               'clip_min': -2.3,
               'clip_max': 2.8,
               'nb_iter': 40,
               'y': None}

# Eager-style path: generate_np() builds and runs the attack in one call.
correct = 0
count = 0
for xs, ys in val_subset_loader:
    count += 1
    ys = ys.numpy().astype(np.int32)
    # Attach the labels for this batch to the SPSA attack
    spsa_params['y'] = ys
    adv_x = spsa_op.generate_np(xs, **spsa_params)
    break

# Graph path: generate() returns a TF op that is then run via sess.run().
count = 0
for xs, ys in val_subset_loader:
    count += 1
    print(count)
    ys = ys.numpy().astype(np.int32)
    # Attach the labels for this batch to the SPSA attack
    spsa_params['y'] = ys
    adv_x_op = spsa_op.generate(x_op, **spsa_params)
    # adv_preds_op = tf_model_fn(adv_x_op)
    adv_x = sess.run(adv_x_op, feed_dict={x_op: xs})
    print(adv_x.shape)
    break
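Both loops above break after the first batch. Below is a minimal sketch of how the graph-mode op might be reused across the whole loader to measure adversarial accuracy, assuming the same cleverhans_model, sess, x_op, and val_subset_loader as above; y_op, tf_model_fn, and the accuracy bookkeeping are illustrative assumptions (note that some SPSA versions only accept batch size 1 in graph mode).

# Hedged sketch: build the attack op once, outside the loop, then reuse it.
y_op = tf.placeholder(tf.int32, shape=(None,))  # assumed label placeholder
spsa_params['y'] = y_op
adv_x_op = spsa_op.generate(x_op, **spsa_params)
adv_preds_op = tf_model_fn(adv_x_op)  # assumed TF-callable model, as hinted above

correct = 0
total = 0
for xs, ys in val_subset_loader:
    ys_np = ys.numpy().astype(np.int32)
    adv_preds = sess.run(adv_preds_op, feed_dict={x_op: xs, y_op: ys_np})
    correct += int((np.argmax(adv_preds, axis=1) == ys_np).sum())
    total += len(ys_np)
print('SPSA adversarial accuracy: %.4f' % (correct / float(total)))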
Example #16
def train(alpha, eps2_ratio, gen_ratio, fgsm_eps, LR, logfile):
    logfile.write("fgsm_eps \t %g, LR \t %g, alpha \t %d , eps2_ratio \t %d , gen_ratio \t %d \n"%(fgsm_eps, LR, alpha, eps2_ratio, gen_ratio))
    #############################
    ##Hyper-parameter Setting####
    #############################
    hk = 256; #number of hidden units at the last layer
    Delta2 = (14*14+2)*25; #global sensitivity for the first hidden layer
    Delta3_adv = 2*hk #10*(hk + 1/4 * hk**2) #10*(hk) #global sensitivity for the output layer
    Delta3_benign = 2*hk #10*(hk); #global sensitivity for the output layer
    D = 50000; #size of the dataset
    L = 2499; #batch size
    image_size = 28;
    padding = 4;
    #numHidUnits = 14*14*32 + 7*7*64 + M + 10; #number of hidden units
    #gen_ratio = 1
    epsilon1 = 0.0; #0.175; #epsilon for dpLRP
    epsilon2 = 0.1*(1 + gen_ratio); #epsilon for the first hidden layer
    epsilon3 = 0.1*(1); #epsilon for the last hidden layer
    total_eps = epsilon1 + epsilon2 + epsilon3
    print(total_eps)
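    # e.g., with gen_ratio = 1: total_eps = 0.0 + 0.1*(1 + 1) + 0.1 = 0.3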
    uncert = 0.1; #uncertainty modeling at the output layer
    infl = 1; #inflation rate in the privacy budget redistribution
    R_lowerbound = 1e-5; #lower bound of the LRP
    c = [0, 40, 50, 200] #norm bounds
    epochs = 200; #number of training epochs
    preT_epochs = 50; #number of pretraining epochs
    T = int(D/L*epochs + 1); #number of steps T
    pre_T = int(D/L*preT_epochs + 1);
    step_for_epoch = int(D/L); #number of steps for one epoch
    
    broken_ratio = 1
    #alpha = 9.0 # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    #eps2_ratio = 10; # [1/10, 1/8, 1/6, 1/4, 1/2, 1, 2, 4, 6, 8, 10]
    #eps_benign = 1/(1+eps2_ratio)*(2*epsilon2)
    #eps_adv = eps2_ratio/(1+eps2_ratio)*(2*epsilon2)
    
    #fgsm_eps = 0.1
    rand_alpha = 0.05
    
    ##Robustness##
    robustness_T = (fgsm_eps*18*18*L*epsilon2)/Delta2;
    ####
    
    LRPfile = os.getcwd() + '/Relevance_R_0_075.txt';
    #############################
    mnist = input_data.read_data_sets("MNIST_data/", one_hot = True);

    #############################
    ##Construct the Model########
    #############################
    #Step 4: Randomly initiate the noise, Compute 1/|L| * Delta3 for the output layer#

    #Compute the 1/|L| * Delta3 for the last hidden layer#
    """eps3_ratio = Delta3_adv/Delta3_benign;
    eps3_benign = 1/(1+eps3_ratio)*(epsilon3)
    eps3_adv = eps3_ratio/(1+eps3_ratio)*(epsilon3)"""
    loc, scale3_benign, scale3_adv = 0., Delta3_benign/(epsilon3*L), Delta3_adv/(epsilon3*L);
    ###
    #End Step 4#
    # Parameter Declaration
    W_conv1 = weight_variable('W_conv1', [5, 5, 1, 32], collect=[AECODER_VARIABLES]);
    b_conv1 = bias_variable('b_conv1', [32], collect=[AECODER_VARIABLES]);

    shape     = W_conv1.get_shape().as_list()
    w_t       = tf.reshape(W_conv1, [-1, shape[-1]])
    w         = tf.transpose(w_t)
    sing_vals = tf.svd(w, compute_uv=False)
    sensitivity = tf.reduce_max(sing_vals)
    gamma = 2*(14*14 + 2)*25/(L*sensitivity)
    
    dp_epsilon=1.0 #0.1
    delta_r = fgsm_eps*(image_size**2);
    #delta_h = 1.0 * delta_r; #sensitivity*(14**2) = sensitivity*(\beta**2) can also be used
    #dp_mult = (Delta2/(L*epsilon2))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2))/(delta_h / dp_epsilon)
    
    W_conv2 = weight_variable('W_conv2', [5, 5, 32, 64], collect=[CONV_VARIABLES]);
    b_conv2 = bias_variable('b_conv2', [64], collect=[CONV_VARIABLES]);

    W_fc1 = weight_variable('W_fc1', [4 * 4 * 64, hk], collect=[CONV_VARIABLES]);
    b_fc1 = bias_variable('b_fc1', [hk], collect=[CONV_VARIABLES]);

    W_fc2 = weight_variable('W_fc2', [hk, 10], collect=[CONV_VARIABLES]);
    b_fc2 = bias_variable('b_fc2', [10], collect=[CONV_VARIABLES]);

    """scale2 = tf.Variable(tf.ones([hk]))
    beta2 = tf.Variable(tf.zeros([hk]))
    tf.add_to_collections([CONV_VARIABLES], scale2)
    tf.add_to_collections([CONV_VARIABLES], beta2)"""

    params = [W_conv1, b_conv1, W_conv2, b_conv2, W_fc1, b_fc1, W_fc2, b_fc2]
    ###


    #Step 5: Create the model#
    noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]);
    adv_noise = tf.placeholder(tf.float32, [None, image_size, image_size, 1]);

    keep_prob = tf.placeholder(tf.float32);
    x = tf.placeholder(tf.float32, [None, image_size*image_size]);
    x_image = tf.reshape(x, [-1,image_size,image_size,1]);

    #perturbFMx = np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28)
    #perturbFMx = np.reshape(perturbFMx, [-1, 28, 28, 1]);

    # pretrain ###
    #Enc_Layer1 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu)
    #pretrain = Enc_Layer1.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, epsilon = 2*epsilon2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise)
    ###########

    adv_x = tf.placeholder(tf.float32, [None, image_size*image_size]);
    adv_image = tf.reshape(adv_x, [-1,image_size,image_size,1]);

    #perturbFMx_adv = np.random.laplace(0.0, Delta2/(2*epsilon2*L), 28*28)
    #perturbFMx_adv = np.reshape(perturbFMx_adv, [-1, 28, 28, 1]);

    # pretrain adv ###
    #perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2*L), 14*14*32)
    #perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]);
    FM_h = tf.placeholder(tf.float32, [None, 14, 14, 32]);
    Enc_Layer2 = EncLayer(inpt=adv_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu)
    pretrain_adv = Enc_Layer2.get_train_ops2(xShape = tf.shape(adv_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = adv_noise, perturbFM_h = FM_h)
    Enc_Layer3 = EncLayer(inpt=x_image, n_filter_in = 1, n_filter_out = 32, filter_size = 5, W=W_conv1, b=b_conv1, activation=tf.nn.relu)
    pretrain_benign = Enc_Layer3.get_train_ops2(xShape = tf.shape(x_image)[0], Delta = Delta2, batch_size = L, learning_rate= LR, W = W_conv1, b = b_conv1, perturbFMx = noise, perturbFM_h = FM_h)
    ###########
    
    x_image += noise;
    x_image = tf.clip_by_value(x_image, -10, 10) #Clip the values of each input feature.
    
    adv_image += adv_noise;
    adv_image = tf.clip_by_value(adv_image, -10, 10) #Clip the values of each input feature.

    #perturbFM = np.random.laplace(0.0, scale3_benign, hk)
    #perturbFM = np.reshape(perturbFM, [hk]);
    perturbFM = np.random.laplace(0.0, scale3_benign, hk * 10)
    perturbFM = np.reshape(perturbFM, [hk, 10]);
    
    y_conv = inference(x_image, perturbFM, hk, FM_h, params);
    softmax_y_conv = tf.nn.softmax(y_conv)
    #robust_mask = inference_robust_mask(y_conv, Delta2, L, epsilon2, robustness_T)

    #perturbFM = np.random.laplace(0.0, scale3_adv, hk)
    #perturbFM = np.reshape(perturbFM, [hk]);
    y_adv_conv = inference(adv_image, perturbFM, hk, FM_h, params);
    #adv_robust_mask = inference_robust_mask(y_adv_conv, Delta2, L, epsilon2, robustness_T)

    # test model
    perturbFM_test = np.random.laplace(0.0, 0, hk)
    perturbFM_test = np.reshape(perturbFM_test, [hk]);
    x_test = tf.reshape(x, [-1,image_size,image_size,1]);
    y_test = inference(x_test, perturbFM_test, hk, FM_h, params);
    #test_robust_mask = inference_robust_mask(y_test, Delta2, L, epsilon2, robustness_T)

    #Define a place holder for the output label#
    y_ = tf.placeholder(tf.float32, [None, 10]);
    adv_y_ = tf.placeholder(tf.float32, [None, 10]);
    #End Step 5#
    #############################

    #############################
    ##Define loss and Optimizer##
    #############################
    '''
        Computes differentially private sigmoid cross entropy given `logits`.
        
        Measures the probability error in discrete classification tasks in which each
        class is independent and not mutually exclusive.
        
        For brevity, let `x = logits`, `z = labels`.  The logistic loss is
        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
        = (1 - z) * x + log(1 + exp(-x))
        = x - x * z + log(1 + exp(-x))
        
        For x < 0, to avoid overflow in exp(-x), we reformulate the above
        
        x - x * z + log(1 + exp(-x))
        = log(exp(x)) - x * z + log(1 + exp(-x))
        = - x * z + log(1 + exp(x))
        
        Hence, to ensure stability and avoid overflow, the implementation uses this
        equivalent formulation
        
        max(x, 0) - x * z + log(1 + exp(-abs(x)))
        
        `logits` and `labels` must have the same type and shape. Let neg_abs_logits denote -abs(y_conv) = -abs(h_fc1 * W_fc2). Applying a second-order Taylor expansion to log(1 + exp(neg_abs_logits)), we have:
        
        Taylor = max(y_conv, 0) - y_conv * y_ + log(1 + exp(-abs(y_conv)));
        = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2)
        = max(h_fc1 * W_fc2, 0) - (y_ * h_fc1) * W_fc2 + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2)
        = F1 + F2
        where: F1 = max(h_fc1 * W_fc2, 0) + (math.log(2.0) + 0.5*(-abs(h_fc1 * W_fc2)) + 1.0/8.0*(-abs(h_fc1 * W_fc2))**2) and F2 = - (y_ * h_fc1) * W_fc2
        
        To ensure that Taylor is differentially private, we need to perturb all of its coefficients, including the term y_ * h_fc1 * W_fc2.
        Note that h_fc1 is differentially private, since its computation on top of the DP affine transformation does not access the original data.
        Therefore, F1 is differentially private. We still need to preserve DP in F2, which reads the ground-truth label y_, as follows:
        
        By applying the Functional Mechanism, we perturb (y_ * h_fc1) * W_fc2 as ((y_ * h_fc1) + perturbFM) * W_fc2 = (y_ * h_fc1)*W_fc2 + (perturbFM * W_fc2):
        
        perturbFM = np.random.laplace(0.0, scale3, hk * 10)
        perturbFM = np.reshape(perturbFM/L, [hk, 10]);
        
        where scale3 = Delta3/(epsilon3) = 2*hk/(epsilon3);
        
        To allow computing gradients at zero, we define custom versions of max and abs functions [Tensorflow].
        
        Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow
    '''
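    # [Added illustration, not from the original code] Quick numeric sanity check
    # of the second-order Taylor approximation used below,
    #     log(1 + exp(u)) ~ log(2) + u/2 + u**2/8   around u = 0,
    # evaluated at u = neg_abs_logits = -|x|. For u = -0.5:
    #     exact:  math.log(1 + math.exp(-0.5)) ~ 0.47408
    #     approx: math.log(2.0) + 0.5*(-0.5) + (1.0/8.0)*(-0.5)**2 ~ 0.47440
    # so the surrogate stays close to the true sigmoid cross entropy near 0.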
    ### Taylor for benign x
    zeros = array_ops.zeros_like(y_conv, dtype=y_conv.dtype)
    cond = (y_conv >= zeros)
    relu_logits = array_ops.where(cond, y_conv, zeros)
    neg_abs_logits = array_ops.where(cond, -y_conv, y_conv)
    #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits)))
    Taylor_benign = math_ops.add(relu_logits - y_conv * y_, math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) - tf.reduce_sum(perturbFM*W_fc2)
    #Taylor_benign = tf.abs(y_conv - y_)

    ### Taylor for adv_x
    zeros_adv = array_ops.zeros_like(y_adv_conv, dtype=y_conv.dtype)
    cond_adv = (y_adv_conv >= zeros_adv)
    relu_logits_adv = array_ops.where(cond_adv, y_adv_conv, zeros_adv)
    neg_abs_logits_adv = array_ops.where(cond_adv, -y_adv_conv, y_adv_conv)
    #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits)))
    Taylor_adv = math_ops.add(relu_logits_adv - y_adv_conv * adv_y_, math.log(2.0) + 0.5*neg_abs_logits_adv + 1.0/8.0*neg_abs_logits_adv**2) - tf.reduce_sum(perturbFM*W_fc2)
    #Taylor_adv = tf.abs(y_adv_conv - adv_y_)

    ### Adversarial training loss
    adv_loss = (1.0/(L + L*alpha))*(Taylor_benign + alpha * Taylor_adv) # 1.0 guards against integer division

    '''Sometimes, learning-rate decay can help stabilize training. Use it carefully, though, since it may affect the convergence speed.'''
    global_step = tf.Variable(0, trainable=False)
    pretrain_var_list = tf.get_collection(AECODER_VARIABLES)
    train_var_list = tf.get_collection(CONV_VARIABLES)
    #print(pretrain_var_list)
    #print(train_var_list)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        pretrain_step = tf.train.AdamOptimizer(LR).minimize(pretrain_adv+pretrain_benign, global_step=global_step, var_list=pretrain_var_list);
        train_step = tf.train.AdamOptimizer(LR).minimize(adv_loss, global_step=global_step, var_list=train_var_list);
    sess = tf.InteractiveSession();

    # Define the correct prediction and accuracy
    # This needs to be changed to "Robust Prediction"
    correct_prediction_x = tf.equal(tf.argmax(y_test,1), tf.argmax(y_,1));
    accuracy_x = tf.reduce_mean(tf.cast(correct_prediction_x, tf.float32));

    #############
    # use these to get predictions wrt to robust conditions
    """robust_correct_prediction_x = tf.multiply(test_robust_mask, tf.cast(correct_prediction_x, tf.float32))
    accuracy_x_robust = tf.reduce_sum(robust_correct_prediction_x) / tf.reduce_sum(test_robust_mask)
    #certified_utility = 2/(1/accuracy_x_robust + 1/(tf.reduce_sum(test_robust_mask)/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32))))
    certified_utility = (1.0*tf.reduce_sum(test_robust_mask))/(1.0*tf.cast(tf.size(test_robust_mask), tf.float32))"""
    #############

    # craft adversarial samples from x for training
    dynamic_eps = tf.placeholder(tf.float32);
    emsemble_L = int(L/3)
    softmax_y = tf.nn.softmax(y_test)
    #c_x_adv = fgsm(x, softmax_y, eps=fgsm_eps, clip_min=0.0, clip_max=1.0)
    c_x_adv = fgsm(x, softmax_y, eps=(dynamic_eps)/10, clip_min=-1.0, clip_max=1.0) # for I-FGSM
    x_adv = tf.reshape(c_x_adv, [emsemble_L,image_size*image_size]);

    #====================== attack =========================
    #attack_switch = {'randfgsm':True, 'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':True}
    #attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':True, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':True}
    attack_switch = {'fgsm':True, 'ifgsm':True, 'deepfool':False, 'mim':True, 'spsa':False, 'cwl2':False, 'madry':True, 'stm':False}
    #other possible attacks:
        # ElasticNetMethod
        # FastFeatureAdversaries
        # LBFGS
        # SaliencyMapMethod
        # VirtualAdversarialMethod

    # y_test = logits (before softmax)
    # softmax_y_test = preds (probs, after softmax)
    softmax_y_test = tf.nn.softmax(y_test)

    # create saver
    saver = tf.train.Saver(tf.global_variables())
    
    sess.run(W_conv1.initializer)
    _gamma = sess.run(gamma)
    _gamma_x = Delta2/L
    epsilon2_update = epsilon2/(1.0 + 1.0/_gamma + 1/_gamma_x)
    print(epsilon2_update/_gamma + epsilon2_update/_gamma_x)
    print(epsilon2_update)
    _sensitivityW = sess.run(sensitivity)
    delta_h = _sensitivityW*(14**2)
    dp_mult = (Delta2/(L*epsilon2_update))/(delta_r / dp_epsilon) + (2*Delta2/(L*epsilon2_update))/(delta_h / dp_epsilon)
    #############################
    
    iterativeStep = 100
    
    # load the most recent models
    _global_step = 0
    ckpt = tf.train.get_checkpoint_state(os.getcwd() + '/tmp/train')
    if ckpt and ckpt.model_checkpoint_path:
        print(ckpt.model_checkpoint_path);
        saver.restore(sess, ckpt.model_checkpoint_path)
        _global_step = int(ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1])
    else:
        print('No checkpoint file found')

    start_time = time.time();

    # adv pretrain model (Auto encoder layer)
    cost = tf.reduce_sum(Enc_Layer2.cost);
    logfile.write("pretrain: \n")
    
    # define cleverhans abstract models for using cleverhans attacks
    ch_model_logits = CustomCallableModelWrapper(callable_fn=inference_test_input, output_layer='logits', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise)
    ch_model_probs = CustomCallableModelWrapper(callable_fn=inference_test_input_probs, output_layer='probs', hk=hk, params=params, image_size=image_size, adv_noise = adv_noise)

    # rand+fgsm
    # if attack_switch['randfgsm']:
    #     randfgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess)
    #     x_randfgsm_t = (fgsm_eps - rand_alpha) * randfgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0)
    #     x_rand_t = rand_alpha * tf.sign(tf.random_normal(shape=tf.shape(x), mean=0.0, stddev=1.0))

    # define each attack method's tensor
    mu_alpha = tf.placeholder(tf.float32, [1]);
    attack_tensor_dict = {}
    # FastGradientMethod
    if attack_switch['fgsm']:
        print('creating attack tensor of FastGradientMethod')
        fgsm_obj = FastGradientMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=fgsm_eps, clip_min=-1.0, clip_max=1.0, ord=2) # testing now
        x_adv_test_fgsm = fgsm_obj.generate(x=x, eps=mu_alpha, clip_min=-1.0, clip_max=1.0) # testing now
        attack_tensor_dict['fgsm'] = x_adv_test_fgsm

    # Iterative FGSM (BasicIterativeMethod/ProjectedGradientMethod with no random init)
    # default: eps_iter=0.05, nb_iter=10
    if attack_switch['ifgsm']:
        print('creating attack tensor of BasicIterativeMethod')
        ifgsm_obj = BasicIterativeMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_ifgsm = ifgsm_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['ifgsm'] = x_adv_test_ifgsm

    # Deepfool
    if attack_switch['deepfool']:
        print('creating attack tensor of DeepFool')
        deepfool_obj = DeepFool(model=ch_model_logits, sess=sess)
        #x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_deepfool = deepfool_obj.generate(x=x, nb_candidate=10, overshoot=0.02, max_iter=50, nb_classes=10, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['deepfool'] = x_adv_test_deepfool

    # MomentumIterativeMethod
    # default: eps_iter=0.06, nb_iter=10
    if attack_switch['mim']:
        print('creating attack tensor of MomentumIterativeMethod')
        mim_obj = MomentumIterativeMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_mim = mim_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, decay_factor=1.0, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_mim = mim_obj.generate(x=x, eps=mu_alpha, eps_iter=mu_alpha/iterativeStep, nb_iter=iterativeStep, decay_factor=1.0, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['mim'] = x_adv_test_mim

    # SPSA
    # Note: here epsilon is an L-infinity bound, not a percentage of the perturbation.
    # Maybe exclude this method at first, since it seems to have constraints on the data value range.
    if attack_switch['spsa']:
        print('creating attack tensor of SPSA')
        spsa_obj = SPSA(model=ch_model_logits, sess=sess)
        #x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1, ord=2)
        x_adv_test_spsa = spsa_obj.generate(x=x, epsilon=fgsm_eps, num_steps=10, is_targeted=False, early_stop_loss_threshold=None, learning_rate=0.01, delta=0.01,spsa_samples=1000, spsa_iters=1)
        attack_tensor_dict['spsa'] = x_adv_test_spsa

    # CarliniWagnerL2
    # confidence=0 is from their paper
    # it is said to be slow, so maybe exclude it at first
    if attack_switch['cwl2']:
        print('creating attack tensor of CarliniWagnerL2')
        cwl2_obj = CarliniWagnerL2(model=ch_model_logits, sess=sess)
        #x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_cwl2 = cwl2_obj.generate(x=x, confidence=0, batch_size=1000, learning_rate=0.005, binary_search_steps=5, max_iterations=500, abort_early=True, initial_const=0.01, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['cwl2'] = x_adv_test_cwl2

    # MadryEtAl (Projected Gradient with random init, same as rand+fgsm)
    # default: eps_iter=0.01, nb_iter=40
    if attack_switch['madry']:
        print('creating attack tensor of MadryEtAl')
        madry_obj = MadryEtAl(model=ch_model_probs, sess=sess)
        #x_adv_test_madry = madry_obj.generate(x=x, eps=fgsm_eps, eps_iter=fgsm_eps/10, nb_iter=10, clip_min=-1.0, clip_max=1.0, ord=2)
        x_adv_test_madry = madry_obj.generate(x=x, eps=mu_alpha, eps_iter=fgsm_eps/iterativeStep, nb_iter=iterativeStep, clip_min=-1.0, clip_max=1.0)
        attack_tensor_dict['madry'] = x_adv_test_madry

    # SpatialTransformationMethod
    # the params are quite different from those in the paper,
    # so the defaults are used here
    # excluded since there's a bug
    if attack_switch['stm']:
        print('creating attack tensor of SpatialTransformationMethod')
        stm_obj = SpatialTransformationMethod(model=ch_model_probs, sess=sess)
        #x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6, ord=2)
        x_adv_test_stm = stm_obj.generate(x=x, batch_size=1000, n_samples=None, dx_min=-0.1, dx_max=0.1, n_dxs=2, dy_min=-0.1, dy_max=0.1, n_dys=2, angle_min=-30, angle_max=30, n_angles=6)
        attack_tensor_dict['stm'] = x_adv_test_stm
    #====================== attack =========================
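    # [Added illustration, not from the original code] Each entry of
    # attack_tensor_dict is later run the same way, e.g. for FGSM test images:
    #   adv_batch = sess.run(attack_tensor_dict['fgsm'],
    #                        feed_dict={x: mnist.test.images, y_: mnist.test.labels,
    #                                   adv_noise: AdvLnoise_test, mu_alpha: [fgsm_eps]})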
    
    sess.run(tf.global_variables_initializer());

    ##perturb h for training
    perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32)
    perturbFM_h = np.reshape(perturbFM_h, [-1, 14, 14, 32]);

    ##perturb h for testing
    perturbFM_h_test = np.random.laplace(0.0, 0, 14*14*32)
    perturbFM_h_test = np.reshape(perturbFM_h_test, [-1, 14, 14, 32]);

    '''for i in range(_global_step, _global_step + pre_T):
        d_eps = random.random();
        
        batch = mnist.train.next_batch(L); #Get a random batch.
        adv_images = sess.run(x_adv, feed_dict = {x:batch[0], y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})
        for iter in range(0, 9):
            adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})
        """batch = mnist.train.next_batch(emsemble_L)
        adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]})
        batch = mnist.train.next_batch(emsemble_L)
        adv_images_madry = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1]})
        train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0)"""

        batch_2 = mnist.train.next_batch(L);
        pretrain_step.run(feed_dict={adv_x: np.append(adv_images, batch_2[0], axis = 0), adv_noise: AdvLnoise, FM_h: perturbFM_h});
        if i % int(5*step_for_epoch) == 0:
            cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32)
            logfile.write("step \t %d \t %g \n"%(i, cost_value))
            print(cost_value)

    pre_train_finish_time = time.time()
    print('pre_train finished in: ' + parse_time(pre_train_finish_time - start_time))'''

    # train and test model with adv samples
    max_benign_acc = -1;
    max_robust_benign_acc = -1
    #max_adv_acc = -1;

    test_size = len(mnist.test.images)
    AdvLnoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
    AdvLnoise_test = generateIdLMNoise(image_size, 0, epsilon2_update, test_size);

    Lnoise_empty = generateIdLMNoise(image_size, 0, epsilon2_update, L);
    BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
    last_eval_time = -1
    accum_time = 0
    accum_epoch = 0
    max_adv_acc_dict = {}
    max_robust_adv_acc_dict = {}
    #max_robust_adv_utility_dict = {}
    for atk in attack_switch.keys():
        if atk not in max_adv_acc_dict:
            max_adv_acc_dict[atk] = -1
            max_robust_adv_acc_dict[atk] = -1

    for i in range(_global_step, _global_step + T):
        # this batch is for generating adv samples
        batch = mnist.train.next_batch(emsemble_L); #Get a random batch.
        y_adv_batch = batch[1]
        #Evaluate and print out the result every 10 epochs (skipping the first window).
        if i % int(10*step_for_epoch) == 0 and i > int(10*step_for_epoch):
            cost_value = sess.run(cost, feed_dict={adv_x:mnist.test.images, adv_noise: AdvLnoise_test, FM_h: perturbFM_h_test})/(test_size*32)
            print(cost_value)
            
            if last_eval_time < 0:
                last_eval_time = time.time()
            #===================benign samples=====================
            predictions_form_argmax = np.zeros([test_size, 10])
            #test_bach = mnist.test.next_batch(test_size)
            softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: BenignLNoise, FM_h: perturbFM_h})
            argmax_predictions = np.argmax(softmax_predictions, axis=1)
            for n_draws in range(0, 1):
                _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
                _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32)
                _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]);
                for j in range(test_size):
                    pred = argmax_predictions[j]
                    predictions_form_argmax[j, pred] += 1;
                softmax_predictions = softmax_y_conv.eval(feed_dict={x: mnist.test.images, noise: (BenignLNoise + _BenignLNoise/2), FM_h: (perturbFM_h + _perturbFM_h/2)})
                argmax_predictions = np.argmax(softmax_predictions, axis=1)
            final_predictions = predictions_form_argmax;
            is_correct = []
            is_robust = []
            for j in range(test_size):
                is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j]))
                robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult)
                is_robust.append(robustness_from_argmax >= fgsm_eps)
            acc = np.sum(is_correct)*1.0/test_size
            robust_acc = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust)
            robust_utility = np.sum(is_robust)*1.0/test_size
            max_benign_acc = max(max_benign_acc, acc)
            max_robust_benign_acc = max(max_robust_benign_acc, robust_acc*robust_utility)
            log_str = "step: {:.1f}\t epsilon: {:.1f}\t benign: {:.4f} \t {:.4f} \t {:.4f} \t {:.4f} \t".format(i, total_eps, acc, robust_acc, robust_utility, robust_acc*robust_utility)
            #===================adv samples=====================
            #log_str = "step: {:.1f}\t epsilon: {:.1f}\t".format(i, total_eps)
            """adv_images_dict = {}
            for atk in attack_switch.keys():
                if attack_switch[atk]:
                    adv_images_dict[atk] = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_:mnist.test.labels})
            print("Done with the generating of Adversarial samples")"""
            #===================adv samples=====================
            adv_acc_dict = {}
            robust_adv_acc_dict = {}
            robust_adv_utility_dict = {}
            for atk in attack_switch.keys():
                if atk not in adv_acc_dict:
                    adv_acc_dict[atk] = -1
                    robust_adv_acc_dict[atk] = -1
                    robust_adv_utility_dict[atk] = -1
                if attack_switch[atk]:
                    adv_images_dict = sess.run(attack_tensor_dict[atk], feed_dict = {x:mnist.test.images, y_: mnist.test.labels, adv_noise: AdvLnoise_test, mu_alpha:[fgsm_eps]})
                    ### PixelDP Robustness ###
                    predictions_form_argmax = np.zeros([test_size, 10])
                    softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: perturbFM_h})
                    argmax_predictions = np.argmax(softmax_predictions, axis=1)
                    for n_draws in range(0, 2000):
                        if n_draws % 1000 == 0:
                            print(n_draws)
                        _BenignLNoise = generateIdLMNoise(image_size, Delta2, epsilon2_update, L);
                        _perturbFM_h = np.random.laplace(0.0, 2*Delta2/(epsilon2_update*L), 14*14*32)
                        _perturbFM_h = np.reshape(_perturbFM_h, [-1, 14, 14, 32]);
                        for j in range(test_size):
                            pred = argmax_predictions[j]
                            predictions_form_argmax[j, pred] += 1;
                        softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (perturbFM_h + _perturbFM_h/2)}) * softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (BenignLNoise + _BenignLNoise/2), FM_h: perturbFM_h})
                        #softmax_predictions = softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: BenignLNoise, FM_h: (_perturbFM_h)}) * softmax_y_conv.eval(feed_dict={x: adv_images_dict, noise: (_BenignLNoise), FM_h: perturbFM_h})
                        argmax_predictions = np.argmax(softmax_predictions, axis=1)
                    final_predictions = predictions_form_argmax;
                    is_correct = []
                    is_robust = []
                    for j in range(test_size):
                        is_correct.append(np.argmax(mnist.test.labels[j]) == np.argmax(final_predictions[j]))
                        robustness_from_argmax = robustness.robustness_size_argmax(counts=predictions_form_argmax[j],eta=0.05,dp_attack_size=fgsm_eps, dp_epsilon=1.0, dp_delta=0.05, dp_mechanism='laplace') / (dp_mult)
                        is_robust.append(robustness_from_argmax >= fgsm_eps)
                    adv_acc_dict[atk] = np.sum(is_correct)*1.0/test_size
                    robust_adv_acc_dict[atk] = np.sum([a and b for a,b in zip(is_robust, is_correct)])*1.0/np.sum(is_robust)
                    robust_adv_utility_dict[atk] = np.sum(is_robust)*1.0/test_size
                    ##############################
            for atk in attack_switch.keys():
                if attack_switch[atk]:
                    # added robust prediction
                    log_str += " {}: {:.4f} {:.4f} {:.4f} {:.4f}".format(atk, adv_acc_dict[atk], robust_adv_acc_dict[atk], robust_adv_utility_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk])
                    max_adv_acc_dict[atk] = max(max_adv_acc_dict[atk], adv_acc_dict[atk])
                    max_robust_adv_acc_dict[atk] = max(max_robust_adv_acc_dict[atk], robust_adv_acc_dict[atk]*robust_adv_utility_dict[atk])
            print(log_str)
            logfile.write(log_str + '\n')

            # logfile.write("step \t %d \t %g \t %g \n"%(i, benign_acc, adv_acc))
            # print("step \t %d \t %g \t %g"%(i, benign_acc, adv_acc));

            # estimate end time
            """if i > 0 and i % int(10*step_for_epoch) == 0:
                current_time_interval = time.time() - last_eval_time
                last_eval_time = time.time()
                print('during last eval interval, {} epoch takes {}'.format(10, parse_time(current_time_interval)))
                accum_time += current_time_interval
                accum_epoch += 10
                estimate_time = ((_global_step + T - i) / step_for_epoch) * (accum_time / accum_epoch)
                print('estimate finish in: {}'.format(parse_time(estimate_time)))"""

            #print("step \t %d \t adversarial test accuracy \t %g"%(i, accuracy_x.eval(feed_dict={x: adv_images, y_: mnist.test.labels, noise: Lnoise_empty})));
            """checkpoint_path = os.path.join(os.getcwd() + '/tmp/train', 'model.ckpt')
            saver.save(sess, checkpoint_path, global_step=i);"""

        d_eps = random.random();
        y_adv = batch[1]
        adv_images = sess.run(attack_tensor_dict['ifgsm'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]})
        """for iter in range(0, 9):
            adv_images = sess.run(x_adv, feed_dict = {x:adv_images, y_:batch[1], FM_h: perturbFM_h_test, dynamic_eps: d_eps})"""
        batch = mnist.train.next_batch(emsemble_L)
        adv_images_mim = sess.run(attack_tensor_dict['mim'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]})
        y_adv = np.append(y_adv, batch[1], axis = 0)
        batch = mnist.train.next_batch(emsemble_L)
        adv_images_madry = sess.run(attack_tensor_dict['madry'], feed_dict = {x:batch[0], y_: batch[1], adv_noise: AdvLnoise, mu_alpha:[d_eps]})
        y_adv = np.append(y_adv, batch[1], axis = 0)
        train_images = np.append(np.append(adv_images, adv_images_mim, axis = 0),adv_images_madry, axis = 0)
        
        batch = mnist.train.next_batch(L); #Get a random batch.
        # train with benign and adv samples
        pretrain_step.run(feed_dict={adv_x: train_images, x: batch[0], adv_noise: AdvLnoise_test, noise: BenignLNoise, FM_h: perturbFM_h});
        train_step.run(feed_dict={x: batch[0], adv_x: train_images, y_: batch[1], adv_y_: y_adv, noise: BenignLNoise, adv_noise: AdvLnoise_test, FM_h: perturbFM_h});
    duration = time.time() - start_time;
    # print(parse_time(duration)); #print running time duration#

    max_acc_string = "max acc: benign: \t{:.4f} {:.4f}".format(max_benign_acc, max_robust_benign_acc)
    for atk in attack_switch.keys():
        if attack_switch[atk]:
            max_acc_string += " {}: \t{:.4f} {:.4f}".format(atk, max_adv_acc_dict[atk], max_robust_adv_acc_dict[atk])
    logfile.write(max_acc_string + '\n')
    logfile.write(str(duration) + '\n')
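A minimal usage sketch for train(); every hyperparameter value and the log-file name below are illustrative assumptions, not taken from the original:

# Hedged usage sketch; values chosen only for illustration.
if __name__ == '__main__':
    with open('dp_adv_train.log', 'w') as logfile:
        train(alpha=1, eps2_ratio=1, gen_ratio=1,
              fgsm_eps=0.1, LR=1e-4, logfile=logfile)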