Example #1
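These snippets (this one and the examples that follow) are TensorFlow 1.x-style excerpts that assume module-level imports plus project-local helpers (generator, discriminator, target_model, and batching utilities) defined elsewhere. A minimal preamble they might sit under, using the TF2 compatibility shim, could look like:

# Assumed preamble for these excerpts (not part of the original snippets);
# generator, discriminator, target_model, and utils are project-local modules.
import sys
import math

import numpy as np
import matplotlib.pyplot as plt
import tensorflow.compat.v1 as tf

tf.disable_v2_behavior()  # the snippets rely on TF1 placeholders and sessions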
def attack(X, y, batch_size=128, thresh=0.3, target=-1):
	x_pl = tf.placeholder(tf.float32, [None, X.shape[1], X.shape[2], X.shape[3]]) # image placeholder
	t = tf.placeholder(tf.float32, [None, 10]) # target placeholder
	is_training = tf.placeholder(tf.bool, [])

	is_targeted = False
	if target in range(0, y.shape[-1]):
		is_targeted = True

	perturb = tf.clip_by_value(generator(x_pl, is_training), -thresh, thresh)
	x_perturbed = perturb + x_pl
	x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

	f = target_model()
	f_real_logits, f_real_probs = f.ModelC(x_pl)
	f_fake_logits, f_fake_probs = f.ModelC(x_perturbed)

	t_vars = tf.trainable_variables()
	f_vars = [var for var in t_vars if 'ModelC' in var.name]
	g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='g_weights')

	sess = tf.Session()

	f_saver = tf.train.Saver(f_vars)
	g_saver = tf.train.Saver(g_vars)
	f_saver.restore(sess, "./weights/target_model/model.ckpt")
	g_saver.restore(sess, tf.train.latest_checkpoint("./weights/generator/"))

	rawpert, pert, fake_l, real_l = sess.run(
		[perturb, x_perturbed, f_fake_probs, f_real_probs],
		feed_dict={x_pl: X[:32], is_training: False})
	# LA = true labels, OG = target-model predictions on the originals,
	# PB = target-model predictions on the perturbed inputs
	print('LA: ' + str(np.argmax(y[:32], axis=1)))
	print('OG: ' + str(np.argmax(real_l, axis=1)))
	print('PB: ' + str(np.argmax(fake_l, axis=1)))

	# accuracy of the target model on perturbed inputs, measured against the
	# labels fed to t (true labels if untargeted, the target class if targeted)
	correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1), tf.argmax(t, 1))
	accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
	accs = []
	total_batches_test = int(X.shape[0] / batch_size)
	for i in range(total_batches_test):
		batch_x, batch_y = next_batch(X, y, i, batch_size)

		if is_targeted:
			targets = np.full((batch_y.shape[0],), target)
			batch_y = np.eye(y.shape[-1])[targets]

		acc, fake_l, x_pert = sess.run([accuracy, f_fake_probs, x_perturbed], feed_dict={x_pl: batch_x, t: batch_y, is_training: False})
		accs.append(acc)

	print('accuracy of test set: {}'.format(sum(accs) / len(accs)))

	# plot some originals and their perturbed counterparts
	fig, axarr = plt.subplots(2, 2)  # use fig, not f (which holds the target model)
	axarr[0, 0].imshow(np.squeeze(X[3]), cmap='Greys_r')
	axarr[0, 1].imshow(np.squeeze(pert[3]), cmap='Greys_r')
	axarr[1, 0].imshow(np.squeeze(X[4]), cmap='Greys_r')
	axarr[1, 1].imshow(np.squeeze(pert[4]), cmap='Greys_r')
	plt.show()
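A hypothetical invocation of this attack, assuming MNIST-shaped inputs scaled to [0, 1] and previously trained generator/target-model checkpoints under ./weights/:

# Hypothetical usage sketch; the checkpoint files must already exist on disk.
from tensorflow.keras.datasets import mnist

(_, _), (x_test, y_test) = mnist.load_data()
x_test = x_test.reshape(-1, 28, 28, 1).astype(np.float32) / 255.0
y_test_onehot = np.eye(10)[y_test]  # one-hot labels, as attack() expects

attack(x_test, y_test_onehot, batch_size=128, thresh=0.3, target=-1)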
Example #2
def attack(X, y):
    x_pl = tf.placeholder(tf.float32, [None, 28, 28, 1])  # image placeholder

    perturb = generator(x_pl)

    x_perturbed = x_pl + perturb

    d_perturb_logits, d_perturb_probs = discriminator(x_perturbed)

    f = target_model()
    f_real_logits, f_real_probs = f.ModelC(x_pl)
    f_fake_logits, f_fake_probs = f.ModelC(x_perturbed)

    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if 'ModelC' in var.name]
    d_vars = [var for var in t_vars if 'd_' in var.name]
    g_vars = [var for var in t_vars if 'g_' in var.name]

    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)

    f_saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    d_saver = tf.train.Saver(d_vars)
    # f_saver.restore(sess, "./weights/target_model/model.ckpt")
    g_saver.restore(sess, "./weights/generator/gen.ckpt")
    # d_saver.restore(sess, "weights/discriminator/disc.ckpt")

    # p, xp, real_l, fake_l = sess.run([perturb, x_perturbed, f_real_probs, f_fake_probs],
    #                                  feed_dict={x_pl: X})
    x_pert = sess.run(x_perturbed, feed_dict={x_pl: X})
    # print(np.argmax(y, axis=1))
    print(x_pert.shape)  # the perturbed images, not the real-label probabilities
Example #3
def AdvGAN(x_train,
           y_train,
           x_test,
           y_test,
           t_mu,
           t_cov,
           target=-1,
           epochs=50,
           batch_size=32):
    # placeholder definitions
    x_pl = tf.placeholder(tf.float32, [None, x_train.shape[-1]])
    y_pl = tf.placeholder(tf.float32, [None, y_train.shape[-1]])
    is_training = tf.placeholder(tf.bool, [])
    target_is_training = tf.placeholder(tf.bool, [])

    #-----------------------------------------------------------------------------------
    # MODEL DEFINITIONS
    if target != -1:
        is_targeted = True
    else:
        is_targeted = False

    # gather target model
    f = target_model(n_input=x_train.shape[-1], n_classes=y_train.shape[-1])

    # generate perturbation, add to original input image(s)
    perturb, logit_perturb = generator.generator(x_pl, is_training)
    x_perturbed = perturb + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    # pass real and perturbed image to discriminator and the target model
    d_real_logits, d_real_probs = discriminator.discriminator(
        x_pl, is_training)
    d_fake_logits, d_fake_probs = discriminator.discriminator(
        x_perturbed, is_training)

    # pass real and perturbed images to the model we are trying to fool
    f_real_logits, f_real_probs = f.Model(x_pl, target_is_training)
    f_fake_logits, f_fake_probs = f.Model(x_perturbed, target_is_training)

    # generate labels for discriminator (optionally smooth labels for stability)
    smooth = 0.0
    d_labels_real = tf.ones_like(d_real_probs) * (1 - smooth)
    d_labels_fake = tf.zeros_like(d_fake_probs)

    #-----------------------------------------------------------------------------------
    # LOSS DEFINITIONS
    # discriminator loss
    d_loss_real = tf.losses.mean_squared_error(predictions=d_real_probs,
                                               labels=d_labels_real)
    d_loss_fake = tf.losses.mean_squared_error(predictions=d_fake_probs,
                                               labels=d_labels_fake)
    d_loss = d_loss_real + d_loss_fake

    # generator loss
    g_loss_fake = tf.losses.mean_squared_error(
        predictions=d_fake_probs, labels=tf.ones_like(d_fake_probs))

    # perturbation loss (minimize overall perturbation)
    l_perturb = perturb_loss(perturb, 1.0)

    # adversarial loss (encourage misclassification)
    l_adv = adv_loss(f_fake_probs, y_pl, is_targeted)

    # loss minimizing L1 distance between target class average and perturbed vector
    # this is used to encourage realism of sample
    target_normal = tf.placeholder(tf.float32, [None, x_train.shape[-1]])
    l_tar_dist = tf.reduce_mean(
        tf.norm(target_normal - x_perturbed, axis=1, ord=1))

    # weights for generator loss function
    alpha = 1.0
    beta = 1.0
    g_loss = l_adv + alpha * g_loss_fake + l_tar_dist + beta * l_perturb

    # ----------------------------------------------------------------------------------
    # gather variables for training/restoring
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if "Model_A" in var.name]
    d_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="discriminator")
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope="generator")

    # define optimizers for discriminator and generator
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        d_opt = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(
            d_loss, var_list=d_vars)
        g_opt = tf.train.AdamOptimizer(learning_rate=0.0002).minimize(
            g_loss, var_list=g_vars)

    # create saver objects for the target model, generator, and discriminator
    saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    d_saver = tf.train.Saver(d_vars)

    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)

    # load the pretrained target model
    try:
        saver.restore(
            sess,
            tf.train.latest_checkpoint("./weights/target_model/Model_A/"))
    except Exception:
        print("make sure to train the target model first...")
        sys.exit(1)

    n_batches = int(len(y_train) / batch_size)

    for epoch in range(epochs):
        # shuffle training data
        x_train, y_train = utils.shuffle(x_train, y_train)

        loss_D = 0.0
        loss_G_fake = 0.0
        loss_perturb = 0.0
        loss_adv = 0.0
        loss_target_norm = 0.0

        target_normal_np = np.random.multivariate_normal(
            t_mu, t_cov, (batch_size))
        target_normal_np = np.clip(target_normal_np, 0, 1)

        for i in range(n_batches):
            # extract batch
            batch_x, batch_y = utils.next_batch(x_train, y_train, batch_size,
                                                i)

            # if targeted, create one hot vectors of the target
            if is_targeted:
                targets = np.full((batch_y.shape[0], ), target)
                batch_y = np.eye(y_train.shape[-1])[targets]

            # train the discriminator first n times
            for _ in range(1):
                _, loss_D_batch = sess.run(
                    [d_opt, d_loss],
                    feed_dict={
                        x_pl: batch_x,
                        target_normal: target_normal_np,
                        is_training: True
                    })

            # train the generator n times
            for _ in range(1):
                _, loss_G_fake_batch, loss_adv_batch, loss_perturb_batch, loss_target_batch = \
                    sess.run([g_opt, g_loss_fake, l_adv, l_perturb, l_tar_dist],
                             feed_dict={
                                 x_pl: batch_x,
                                 y_pl: batch_y,
                                 target_normal: target_normal_np,
                                 is_training: True,
                                 target_is_training: False
                             })

            loss_D += loss_D_batch
            loss_G_fake += loss_G_fake_batch
            loss_perturb += loss_perturb_batch
            loss_adv += loss_adv_batch
            loss_target_norm += loss_target_batch

        loss_D /= n_batches
        loss_G_fake /= n_batches
        loss_perturb /= n_batches
        loss_adv /= n_batches
        loss_target_norm /= n_batches

        print("epoch %d:" % (epoch + 1))
        print("  loss_D: %.3f, loss_G_fake: %.3f" % (loss_D, loss_G_fake))
        print("  loss_perturb: %.3f, loss_adv: %.3f" %
              (loss_perturb, loss_adv))
        print("  loss_target_norm: %.3f" % (loss_target_norm))
        print()

        if epoch % 10 == 0:
            g_saver.save(sess, "weights/generator/gen.ckpt")
            d_saver.save(sess, "weights/discriminator/disc.ckpt")

    # quick sample to see some outputs
    rawpert, pert, fake_l, real_l = sess.run(
        [perturb, x_perturbed, f_fake_probs, f_real_probs],
        feed_dict={
            x_pl: x_test[:32],
            is_training: False,
            target_is_training: False
        })

    print("Original Labels:")
    print(np.argmax(y_test[:32], axis=1))
    print("Original Predictions:")
    print(np.argmax(real_l, axis=1))
    print("Perturbed Predictions:")
    print(np.argmax(fake_l, axis=1))

    # evaluate the test set
    correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1),
                                  tf.argmax(y_pl, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    scores = []
    total_batches_test = int(len(y_test) / batch_size)

    for i in range(total_batches_test):
        batch_x, batch_y = utils.next_batch(x_test, y_test, batch_size, i)
        score, x_pert = sess.run(
            [accuracy, x_perturbed],
            feed_dict={
                x_pl: batch_x,
                y_pl: batch_y,
                is_training: False,
                target_is_training: False
            })
        scores.append(score)

    print("test accuracy: %0.3f" % (sum(scores) / len(scores)))

    print("finished training, saving weights")
    g_saver.save(sess, "weights/generator/gen.ckpt")
    d_saver.save(sess, "weights/discriminator/disc.ckpt")
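The helpers perturb_loss and adv_loss are referenced above but not shown in these excerpts. A hedged sketch following the usual AdvGAN formulation (a hinge penalty on the perturbation norm, and a Carlini-Wagner-style margin on the target model's outputs) might look like:

# Hedged sketches only; the original definitions are not part of this excerpt.
def perturb_loss(perturb, thresh):
    # hinge loss on the L2 norm of each perturbation: penalize anything
    # exceeding the budget `thresh`
    flat = tf.reshape(perturb, [tf.shape(perturb)[0], -1])
    return tf.reduce_mean(tf.maximum(0.0, tf.norm(flat, axis=1) - thresh))

def adv_loss(probs, labels, is_targeted):
    # margin between the probability of the labeled class and the best other
    # class; pushed one way for targeted attacks, the other for untargeted
    real = tf.reduce_sum(labels * probs, axis=1)
    other = tf.reduce_max((1.0 - labels) * probs - labels * 10000.0, axis=1)
    if is_targeted:
        return tf.reduce_sum(tf.maximum(0.0, other - real))
    return tf.reduce_sum(tf.maximum(0.0, real - other))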
Example #4
def AdvGAN(X, y, X_test, y_test, epochs=50, batch_size=128, target=-1):
    # placeholder definitions
    x_pl = tf.placeholder(
        tf.float32,
        [None, X.shape[1], X.shape[2], X.shape[3]])  # image placeholder
    t = tf.placeholder(tf.float32, [None, y.shape[-1]])  # target placeholder
    is_training = tf.placeholder(tf.bool, [])

    #-----------------------------------------------------------------------------------
    # MODEL DEFINITIONS
    is_targeted = False
    if target in range(0, y.shape[-1]):
        is_targeted = True

    # gather target model
    f = target_model()

    thresh = 0.3

    # generate perturbation, add to original input image(s)
    perturb = tf.clip_by_value(generator(x_pl, is_training), -thresh, thresh)
    x_perturbed = perturb + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    # pass real and perturbed image to discriminator and the target model
    d_real_logits, d_real_probs = discriminator(x_pl, is_training)
    d_fake_logits, d_fake_probs = discriminator(x_perturbed, is_training)

    # pass real and perturbed images to the model we are trying to fool
    f_real_logits, f_real_probs = f.ModelC(x_pl)
    f_fake_logits, f_fake_probs = f.ModelC(x_perturbed)

    # generate labels for discriminator (optionally smooth labels for stability)
    smooth = 0.0
    d_labels_real = tf.ones_like(d_real_probs) * (1 - smooth)
    d_labels_fake = tf.zeros_like(d_fake_probs)

    #-----------------------------------------------------------------------------------
    # LOSS DEFINITIONS
    # discriminator loss
    d_loss_real = tf.losses.mean_squared_error(predictions=d_real_probs,
                                               labels=d_labels_real)
    d_loss_fake = tf.losses.mean_squared_error(predictions=d_fake_probs,
                                               labels=d_labels_fake)
    d_loss = d_loss_real + d_loss_fake

    # generator loss
    g_loss_fake = tf.losses.mean_squared_error(
        predictions=d_fake_probs, labels=tf.ones_like(d_fake_probs))

    # perturbation loss (minimize overall perturbation)
    l_perturb = perturb_loss(perturb, thresh)

    # adversarial loss (encourage misclassification)
    l_adv = adv_loss(f_fake_probs, t, is_targeted)

    # weights for generator loss function
    alpha = 1.0
    beta = 5.0
    g_loss = l_adv + alpha * g_loss_fake + beta * l_perturb

    # ----------------------------------------------------------------------------------
    # gather variables for training/restoring
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if 'ModelC' in var.name]
    d_vars = [var for var in t_vars if 'd_' in var.name]
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope='g_weights')

    # define optimizers for discriminator and generator
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        d_opt = tf.train.AdamOptimizer().minimize(d_loss, var_list=d_vars)
        g_opt = tf.train.AdamOptimizer(learning_rate=0.001).minimize(
            g_loss, var_list=g_vars)

    # create saver objects for the target model, generator, and discriminator
    saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    d_saver = tf.train.Saver(d_vars)

    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)

    # load the pretrained target model
    try:
        saver.restore(sess, "./weights/target_model/model.ckpt")
    except Exception:
        print("make sure to train the target model first...")
        sys.exit(1)

    total_batches = int(X.shape[0] / batch_size)

    for epoch in range(0, epochs):

        X, y = shuffle(X, y)
        loss_D_sum = 0.0
        loss_G_fake_sum = 0.0
        loss_perturb_sum = 0.0
        loss_adv_sum = 0.0

        for i in range(total_batches):

            batch_x, batch_y = next_batch(X, y, i, batch_size)

            # if targeted, create one hot vectors of the target
            if is_targeted:
                targets = np.full((batch_y.shape[0], ), target)
                batch_y = np.eye(y.shape[-1])[targets]

            # train the discriminator first n times
            for _ in range(1):
                _, loss_D_batch = sess.run(
                    [d_opt, d_loss],
                    feed_dict={x_pl: batch_x, is_training: True})

            # train the generator n times
            for _ in range(1):
                _, loss_G_fake_batch, loss_adv_batch, loss_perturb_batch = \
                    sess.run([g_opt, g_loss_fake, l_adv, l_perturb],
                             feed_dict={x_pl: batch_x, t: batch_y, is_training: True})
            loss_D_sum += loss_D_batch
            loss_G_fake_sum += loss_G_fake_batch
            loss_perturb_sum += loss_perturb_batch
            loss_adv_sum += loss_adv_batch

        print("epoch %d:\nloss_D: %.3f, loss_G_fake: %.3f, \
				\nloss_perturb: %.3f, loss_adv: %.3f, \n" %
              (epoch + 1, loss_D_sum / total_batches,
               loss_G_fake_sum / total_batches,
               loss_perturb_sum / total_batches, loss_adv_sum / total_batches))

        if epoch % 10 == 0:
            g_saver.save(sess, "weights/generator/gen.ckpt")
            d_saver.save(sess, "weights/discriminator/disc.ckpt")

    # evaluate the test set
    correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1), tf.argmax(t, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    accs = []
    total_batches_test = int(X_test.shape[0] / batch_size)
    for i in range(total_batches_test):
        batch_x, batch_y = next_batch(X_test, y_test, i, batch_size)
        acc, x_pert = sess.run([accuracy, x_perturbed],
                               feed_dict={
                                   x_pl: batch_x,
                                   t: batch_y,
                                   is_training: False
                               })
        accs.append(acc)

    print('accuracy of test set: {}'.format(sum(accs) / len(accs)))

    # plot some images and their perturbed counterparts
    fig, axarr = plt.subplots(2, 2)  # use fig, not f (which holds the target model)
    axarr[0, 0].imshow(np.squeeze(batch_x[2]), cmap='Greys_r')
    axarr[0, 1].imshow(np.squeeze(x_pert[2]), cmap='Greys_r')
    axarr[1, 0].imshow(np.squeeze(batch_x[5]), cmap='Greys_r')
    axarr[1, 1].imshow(np.squeeze(x_pert[5]), cmap='Greys_r')
    plt.show()

    print('finished training, saving weights')
    g_saver.save(sess, "weights/generator/gen.ckpt")
    d_saver.save(sess, "weights/discriminator/disc.ckpt")
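Examples #1 and #4 also lean on unqualified shuffle and next_batch helpers that are not shown. Minimal sketches consistent with how they are called here:

# Hedged sketches of the data utilities assumed by Examples #1 and #4.
def shuffle(X, y):
    # shuffle both arrays with the same random permutation
    idx = np.random.permutation(len(X))
    return X[idx], y[idx]

def next_batch(X, y, i, batch_size):
    # return the i-th contiguous slice of the (already shuffled) arrays
    start = i * batch_size
    return X[start:start + batch_size], y[start:start + batch_size]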
Example #5
def AdvGAN(X, y, batch_size=128):
    x_real_pl = tf.placeholder(tf.float32,
                               [None, 28, 28, 1])  # image placeholder
    x_fake_pl = tf.placeholder(tf.float32,
                               [None, 28, 28, 1])  # image placeholder
    d_labels_pl = tf.placeholder(tf.float32, [None, 1])
    y_hinge_pl = tf.placeholder(tf.float32, [None, 28, 28, 1])
    t = tf.placeholder(tf.float32, [None, 10])  # target placeholder

    #-----------------------------------------------------------------------------------
    # MODEL DEFINITIONS

    # gather target model
    f = target_model()

    # generate perturbation, add to original input image(s)
    perturb = generator(x_fake_pl)
    x_perturbed = x_fake_pl + perturb

    disc_batch_x = tf.concat([x_real_pl, x_perturbed], axis=0)

    # pass perturbed image to discriminator and the target model
    d_out_logits, d_out_probs = discriminator(disc_batch_x)
    d_perturb_logits, d_perturb_probs = discriminator(x_perturbed)

    f_out_logits, f_out_probs = f.ModelC(x_perturbed)

    # generate labels for discriminator
    # smooth = 0.0
    # d_labels_real = tf.ones_like(d_real_logits) * (1 - smooth)
    # d_labels_fake = tf.zeros_like(d_perturb_logits)

    #-----------------------------------------------------------------------------------
    # LOSS DEFINITIONS
    d_loss = mse_loss(d_out_probs, d_labels_pl)

    l_adv = adv_loss(f_out_probs, t)

    l_hinge = hinge_loss(perturb, y_hinge_pl, 0.3)

    alpha = 1
    beta = 1
    g_loss = mse_loss(d_perturb_probs,
                      d_labels_pl) + alpha * l_adv + beta * l_hinge

    # ----------------------------------------------------------------------------------
    # gather variables for training/restoring
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if 'ModelC' in var.name]
    d_vars = [var for var in t_vars if 'd_' in var.name]
    g_vars = [var for var in t_vars if 'g_' in var.name]

    d_opt = tf.train.AdamOptimizer().minimize(d_loss, var_list=d_vars)
    g_opt = tf.train.AdamOptimizer().minimize(g_loss, var_list=g_vars)

    saver = tf.train.Saver(f_vars)

    g_saver = tf.train.Saver(g_vars)

    d_saver = tf.train.Saver(d_vars)

    init = tf.global_variables_initializer()

    sess = tf.Session()
    sess.run(init)

    saver.restore(sess, "./weights/target_model/model.ckpt")

    for i in range(50):
        # ------------------------------------------------------------------------------
        # train the discriminator first on real and generated images
        real_image_inp = X[np.random.randint(0, X.shape[0], size=batch_size // 2)]
        fake_image_inp = X[np.random.randint(0, X.shape[0], size=batch_size // 2)]

        disc_batch_y = np.zeros([batch_size, 1])
        disc_batch_y[0:int(batch_size / 2)] = 1

        _, dl = sess.run([d_opt, d_loss],
                         feed_dict={x_real_pl: real_image_inp,
                                    x_fake_pl: fake_image_inp,
                                    d_labels_pl: disc_batch_y})

        if i % 10 == 0:
            print('discriminator loss: ' + str(dl))

        # train the generator 5x (test)
        for _ in range(5):
            # ------------------------------------------------------------------------------
            # train the generator for perturbed images using loss for discriminator, adversarial, and hinge
            random_samples = np.random.randint(0,
                                               X.shape[0],
                                               size=int(batch_size))
            fake_image_inp = X[random_samples, ...]
            y_discrim = np.ones([batch_size, 1])
            target_class = y[random_samples]

            _, gl = sess.run([g_opt, g_loss],
                             feed_dict={x_fake_pl: fake_image_inp,
                                        d_labels_pl: y_discrim,
                                        y_hinge_pl: np.zeros((batch_size, 28, 28, 1)),
                                        t: target_class})
        if i % 10 == 0:
            print('generator loss: ' + str(gl))

    g_saver.save(sess, "weights/generator/gen.ckpt")
    d_saver.save(sess, "weights/discriminator/disc.ckpt")
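This variant relies on mse_loss and hinge_loss helpers that are likewise not shown. Plausible sketches matching the call sites (least-squares GAN losses, and a norm bound of 0.3 against a zero reference):

# Hedged sketches only; the actual helper definitions are not in this excerpt.
def mse_loss(preds, labels):
    # least-squares GAN objective on the discriminator outputs
    return tf.reduce_mean(tf.square(preds - labels))

def hinge_loss(perturb, reference, thresh):
    # penalize perturbation norms above `thresh`; with a zero reference this
    # simply bounds the raw perturbation magnitude
    flat = tf.reshape(perturb - reference, [tf.shape(perturb)[0], -1])
    return tf.reduce_mean(tf.maximum(0.0, tf.norm(flat, axis=1) - thresh))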
Example #6
def perturb_advgan(x, y, target=-1, batch_size=32, output_dir='.'):
    x_pl = tf.placeholder(tf.float32, [None, x.shape[-1]])
    y_pl = tf.placeholder(tf.float32, [None, y.shape[-1]])
    is_training = tf.placeholder(tf.bool, [])
    is_training_target = tf.placeholder(tf.bool, [])

    if target != -1:
        is_targeted = True
    else:
        is_targeted = False

    # generate perturbation, add to original, clip to valid expression level
    p, logit_perturb = generator.generator(x_pl, is_training)
    x_perturbed = p + x_pl
    x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

    # instantiate target model, create graphs for original and perturbed data
    f = target_model(n_input=x.shape[-1], n_classes=y.shape[-1])
    f_real_logits, f_real_probs = f.Model(x_pl, is_training_target)
    f_fake_logits, f_fake_probs = f.Model(x_perturbed, is_training_target)

    # get variables
    t_vars = tf.trainable_variables()
    f_vars = [var for var in t_vars if 'Model_A' in var.name]
    g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                               scope='generator')

    sess = tf.Session()

    # load checkpoints
    f_saver = tf.train.Saver(f_vars)
    g_saver = tf.train.Saver(g_vars)
    f_saver.restore(
        sess, tf.train.latest_checkpoint('%s/target_model/' % (output_dir)))
    g_saver.restore(sess,
                    tf.train.latest_checkpoint('%s/generator/' % (output_dir)))

    # calculate accuracy of target model on perturbed data
    correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1),
                                  tf.argmax(y_pl, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, 'float'))

    # generate perturbed samples from original samples
    n_batches = math.ceil(len(x) / batch_size)
    scores = []
    perturbations = []

    for i in range(n_batches):
        batch_x, batch_y = utils.next_batch(x, y, batch_size, i)

        # default to the true labels; overwrite with one-hot targets if targeted
        batch_y_pert = batch_y
        if is_targeted:
            targets = np.full((batch_y.shape[0], ), target)
            batch_y_pert = np.eye(y.shape[-1])[targets]

        score, _, batch_x_pert, batch_p = sess.run(
            [accuracy, f_fake_probs, x_perturbed, p],
            feed_dict={
                x_pl: batch_x,
                y_pl: batch_y_pert,
                is_training: False,
                is_training_target: False
            })
        scores.append(score)
        perturbations.append(batch_p)

    print('perturbation accuracy: %0.3f' % (sum(scores) / len(scores)))

    # return matrix of perturbations (features x samples)
    return np.vstack(perturbations).T
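A hypothetical call, assuming feature vectors scaled to [0, 1] and checkpoints under the given output_dir; note the transposed return value:

# Hypothetical usage sketch for perturb_advgan.
P = perturb_advgan(x, y, target=3, batch_size=32, output_dir="./weights")
print(P.shape)  # (n_features, n_samples): the stacked perturbations come back transposed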
Example #7
def attack(x_train, y_train, target=-1, batch_size=64):
	x_pl = tf.placeholder(tf.float32, [None, x_train.shape[-1]])
	y_pl = tf.placeholder(tf.float32, [None, y_train.shape[-1]])
	is_training = tf.placeholder(tf.bool, [])
	is_training_target = tf.placeholder(tf.bool, [])

	if target != -1:
		is_targeted = True
	else:
		is_targeted = False

	# generate perturbation, add to original, clip to valid expression level
	perturb, logit_perturb = generator.generator(x_pl, is_training)
	x_perturbed = perturb + x_pl
	x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

	# instantiate target model, create graphs for original and perturbed data
	f = target_model(n_input=x_train.shape[-1], n_classes=y_train.shape[-1])
	f_real_logits, f_real_probs = f.Model(x_pl, is_training_target)
	f_fake_logits, f_fake_probs = f.Model(x_perturbed, is_training_target)

	# get variables
	t_vars = tf.trainable_variables()
	f_vars = [var for var in t_vars if "Model_A" in var.name]
	g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="generator")

	sess = tf.Session()

	# load checkpoints
	f_saver = tf.train.Saver(f_vars)
	g_saver = tf.train.Saver(g_vars)
	f_saver.restore(sess, tf.train.latest_checkpoint("./weights/target_model/Model_A/"))
	g_saver.restore(sess, tf.train.latest_checkpoint("./weights/generator/"))

	# calculate accuracy of target model on perturbed data
	correct_prediction = tf.equal(tf.argmax(f_fake_probs, 1), tf.argmax(y_pl, 1))
	accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
	scores = []
	x_pert = []
	n_batches = int(len(x_train) / batch_size)

	for i in range(n_batches):
		batch_x, batch_y_og = utils.next_batch(x_train, y_train, batch_size, i)

		# default to the true labels; overwrite with one-hot targets if targeted
		batch_y = batch_y_og
		if is_targeted:
			targets = np.full((batch_y_og.shape[0],), target)
			batch_y = np.eye(y_train.shape[-1])[targets]

		score, fake_l, x_p, p = sess.run([accuracy, f_fake_probs, x_perturbed, perturb], feed_dict={
			x_pl: batch_x,
			y_pl: batch_y,
			is_training: False,
			is_training_target: False
		})
		scores.append(score)
		x_pert.append(x_p)

	# print a sample original, perturbation, and original + perturbation
	np.set_printoptions(precision=4, suppress=True)

	print("original class is: %s" % (classes[np.argmax(batch_y_og, axis=1)[0]]))
	print(batch_x[0])
	print(p[0])
	print(x_p[0])

	np.save("perturbed_%s.npy" % (target), np.vstack(x_pert))

	print("test accuracy: %0.3f" % (sum(scores) / len(scores)))
Example #8
def attack_source_target(x, y, classes, source, target, target_mu):
	source_indices = np.where(np.argmax(y, axis=1) == source)
	x_source = x[source_indices]
	y_source = y[source_indices]

	x_pl = tf.placeholder(tf.float32, [None, x_source.shape[-1]])
	y_pl = tf.placeholder(tf.float32, [None, y_source.shape[-1]])
	is_training = tf.placeholder(tf.bool, [])
	is_training_target = tf.placeholder(tf.bool, [])

	if target != -1:
		is_targeted = True
	else:
		is_targeted = False

	# generate perturbation, add to original, clip to valid expression level
	perturb, logit_perturb = generator.generator(x_pl, is_training)
	x_perturbed = perturb + x_pl
	x_perturbed = tf.clip_by_value(x_perturbed, 0, 1)

	# instantiate target model, create graphs for original and perturbed data
	f = target_model(n_input=x.shape[-1], n_classes=y.shape[-1])
	f_real_logits, f_real_probs = f.Model(x_pl, is_training_target)
	f_fake_logits, f_fake_probs = f.Model(x_perturbed, is_training_target)

	# get variables
	t_vars = tf.trainable_variables()
	f_vars = [var for var in t_vars if "Model_A" in var.name]
	g_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="generator")

	sess = tf.Session()

	# load checkpoints
	f_saver = tf.train.Saver(f_vars)
	g_saver = tf.train.Saver(g_vars)
	f_saver.restore(sess, tf.train.latest_checkpoint("./weights/target_model/Model_A/"))
	g_saver.restore(sess, tf.train.latest_checkpoint("./weights/generator/"))

	# default to the true source labels; overwrite with one-hot targets if targeted
	batch_y = y_source
	if is_targeted:
		targets = np.full((y_source.shape[0],), target)
		batch_y = np.eye(y_source.shape[-1])[targets]

	x_pert, p = sess.run([x_perturbed, perturb], feed_dict={
		x_pl: x_source,
		y_pl: batch_y,
		is_training: False,
		is_training_target: False
	})

	print("source class is: %s" % (classes[source]))
	print("X:")
	print(x_source[0])
	print("P:")
	print(p[0])
	print("X_adv:")
	print(x_pert[0])
	print("target_mu:")
	print(target_mu)

	# save the results in X, P, X_adv, target_mu order
	results = np.vstack([x_source[0], p[0], x_pert[0], target_mu])

	source_class = cleanse_label(classes[source])
	target_class = cleanse_label(classes[target])

	np.save("%s_to_%s.npy" % (source_class, target_class), results)
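cleanse_label is not shown in these excerpts; presumably it just makes a class name filesystem-safe before it is used in the output filename. A stand-in under that assumption:

# Hedged sketch of the label-sanitizing helper (assumption, not the original).
def cleanse_label(label):
    # strip whitespace and replace characters that are awkward in filenames
    return str(label).strip().replace(" ", "_").replace("/", "-")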