Example #1
import numpy as np
import tensorflow as tf
from six.moves import xrange

import utils_tf  # cleverhans-style helpers; provides model_loss


def wrm(x, preds, y=None, eps=0.3, model=None, steps=15):
    """
        TensorFlow implementation of the Wasserstein distributionally
        adversarial training method. 
        :param x: the input placeholder
        :param preds: the model's output tensor
        :param y: (optional) A placeholder for the model labels. Only provide
        this parameter if you'd like to use true labels when crafting
        adversarial samples. Otherwise, model predictions are used as
        labels to avoid the "label leaking" effect (explained in this
        paper: https://arxiv.org/abs/1611.01236). Default is None.
        Labels should be one-hot-encoded.
        :param eps: .5 / gamma (Lagrange dual parameter) 
        in the ICLR paper (see link above)
        Possible values: 2.
        :param model: TF graph model
        :param steps: hwo many gradient ascent steps to take
        when finding adversarial example 
        :return: a tensor for the adversarial example
        """
    if y is None:
        # Use model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
    y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Take one initial ascent step on the un-penalized objective eps * loss;
    # the penalty 0.5 * ||x_adv - x||^2 has zero gradient at x_adv = x
    loss = utils_tf.model_loss(y, preds, mean=False)
    grad, = tf.gradients(eps * loss, x)
    x_adv = tf.stop_gradient(x + grad)
    x = tf.stop_gradient(x)

    for t in xrange(steps):
        # Ascend the penalized objective eps * loss - 0.5 * ||x_adv - x||^2,
        # i.e. (up to the eps scaling) loss - gamma * ||x_adv - x||^2
        loss = utils_tf.model_loss(y, model(x_adv), mean=False)
        grad, = tf.gradients(eps * loss, x_adv)
        grad2, = tf.gradients(tf.nn.l2_loss(x_adv - x), x_adv)
        grad = grad - grad2
        # Decaying step size; t + 2 because the pre-loop step counts as step one
        x_adv = tf.stop_gradient(x_adv + 1. / np.sqrt(t + 2) * grad)
    return x_adv
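
A minimal usage sketch for wrm above (the two-layer Keras model, the 2-D input
shape, and all parameter values are illustrative assumptions, not from the
source):

import keras

# Illustrative model: any callable mapping inputs to class probabilities works
demo_model = keras.models.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(2,)),
    keras.layers.Dense(2, activation='softmax'),
])

demo_x = tf.placeholder(tf.float32, shape=(None, 2))
demo_y = tf.placeholder(tf.float32, shape=(None, 2))
demo_preds = demo_model(demo_x)

# eps = 0.5 / gamma, so eps = 0.25 corresponds to gamma = 2
demo_x_adv = wrm(demo_x, demo_preds, y=demo_y, eps=0.25,
                 model=demo_model, steps=15)
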
def fgm(x, preds, y=None, eps=0.3, ord=np.inf, clip_min=None, clip_max=None):
    """
    TensorFlow implementation of the Fast Gradient Method.
    :param x: the input placeholder
    :param preds: the model's output tensor
    :param y: (optional) A placeholder for the model labels. Only provide
              this parameter if you'd like to use true labels when crafting
              adversarial samples. Otherwise, model predictions are used as
              labels to avoid the "label leaking" effect (explained in this
              paper: https://arxiv.org/abs/1611.01236). Default is None.
              Labels should be one-hot-encoded.
    :param eps: the epsilon (input variation parameter)
    :param ord: (optional) Order of the norm (mimics Numpy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: Minimum float value for adversarial example components
    :param clip_max: Maximum float value for adversarial example components
    :return: a tensor for the adversarial example
    """

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
    y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Compute loss
    loss = utils_tf.model_loss(y, preds, mean=False)

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    if ord == np.inf:
        # Take sign of gradient
        signed_grad = tf.sign(grad)
    elif ord == 1:
        reduc_ind = list(xrange(1, len(x.get_shape())))
        signed_grad = grad / tf.reduce_sum(
            tf.abs(grad), reduction_indices=reduc_ind, keep_dims=True)
    elif ord == 2:
        reduc_ind = list(xrange(1, len(x.get_shape())))
        signed_grad = grad / tf.sqrt(
            tf.reduce_sum(
                tf.square(grad), reduction_indices=reduc_ind, keep_dims=True))
    else:
        raise NotImplementedError("Only L-inf, L1 and L2 norms are "
                                  "currently implemented.")

    # Multiply by constant epsilon
    scaled_signed_grad = eps * signed_grad

    # Add perturbation to original example to obtain adversarial example
    adv_x = tf.stop_gradient(x + scaled_signed_grad)

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) and (clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    return adv_x
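
The classic FGSM is fgm with ord=np.inf; a short sketch reusing the demo
placeholders from the wrm sketch above (the eps values are illustrative):

# L-infinity FGSM: move each component by eps in the direction of the
# gradient's sign, then clip back to the valid input range
demo_adv_inf = fgm(demo_x, demo_preds, eps=0.3, ord=np.inf,
                   clip_min=0., clip_max=1.)

# L2 variant: step eps along the L2-normalized gradient direction
demo_adv_l2 = fgm(demo_x, demo_preds, eps=1.5, ord=2)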
Example #3
def wrm(x_nat, x_init, preds, y=None, eps=0.3, ord=2, model=None, steps=15):
    """
        TensorFlow implementation of the Wasserstein distributionally
        adversarial training method.
        :param x: the input placeholder
        :param preds: the model's output tensor
        :param y: (optional) A placeholder for the model labels. Only provide
        this parameter if you'd like to use true labels when crafting
        adversarial samples. Otherwise, model predictions are used as
        labels to avoid the "label leaking" effect (explained in this
        paper: https://arxiv.org/abs/1611.01236). Default is None.
        Labels should be one-hot-encoded.
        :param eps: .5 / gamma (Lagrange dual parameter)
        in the ICLR paper (see link above)
        :param ord: (optional) Order of the norm (mimics Numpy).
        Possible values: 2.
        :param model: TF graph model
        :param steps: hwo many gradient ascent steps to take
        when finding adversarial example
        :return: a tensor for the adversarial example
        """

    if y is None:
        # Using model predictions as ground truth to avoid label leaking
        preds_max = tf.reduce_max(preds, 1, keep_dims=True)
        y = tf.to_float(tf.equal(preds, preds_max))
    y = y / tf.reduce_sum(y, 1, keep_dims=True)

    # Unlike the wrm in Example #1, this version takes no initial ascent step
    # on the un-penalized objective before the loop; it simply starts the
    # iteration from the caller-supplied x_init
    x_adv = tf.stop_gradient(x_init)
    x_nat = tf.stop_gradient(x_nat)

    for t in xrange(steps):
        # Ascend the penalized objective eps * loss - 0.5 * ||x_adv - x_nat||^2
        loss = utils_tf.model_loss(y, model(x_adv), mean=False)
        grad, = tf.gradients(eps * loss, x_adv)
        grad2, = tf.gradients(tf.nn.l2_loss(x_adv - x_nat), x_adv)
        grad = grad - grad2
        # Decaying step size 1 / sqrt(t + 1); with no pre-loop step, the
        # schedule starts at t + 1 rather than t + 2
        x_adv = tf.stop_gradient(x_adv + 1. / np.sqrt(t + 1) * grad)
    return x_adv
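
This variant starts the ascent from a caller-supplied x_init instead of taking
a pre-loop step. A sketch of one plausible warm start, the natural input plus
small Gaussian noise (the noise scale is an illustrative assumption; demo names
reuse those from the Example #1 sketch):

demo_x_init = demo_x + tf.random_normal(tf.shape(demo_x), stddev=0.01)
demo_x_adv2 = wrm(demo_x, demo_x_init, demo_preds, y=demo_y, eps=0.25,
                  ord=2, model=demo_model, steps=15)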
Example #4
def fgsm(x, predictions, eps, clip_min=None, clip_max=None):
    """
    TensorFlow implementation of the Fast Gradient
    Sign method.
    :param x: the input placeholder
    :param predictions: the model's output tensor
    :param eps: the epsilon (input variation parameter)
    :param clip_min: optional parameter that can be used to set a minimum
                    value for components of the example returned
    :param clip_max: optional parameter that can be used to set a maximum
                    value for components of the example returned
    :return: a tensor for the adversarial example
    """

    # Use the model's own predictions as labels, so the attack simply moves
    # away from the current prediction without leaking true labels
    y = tf.to_float(
        tf.equal(predictions, tf.reduce_max(predictions, 1, keep_dims=True)))
    y = y / tf.reduce_sum(y, 1, keep_dims=True)
    loss = utils_tf.model_loss(y, predictions, mean=False)

    # Define gradient of loss wrt input
    grad, = tf.gradients(loss, x)

    # Take sign of gradient
    signed_grad = tf.sign(grad)

    # Multiply by constant epsilon
    scaled_signed_grad = eps * signed_grad

    # Add perturbation to original example to obtain adversarial example
    adv_x = tf.stop_gradient(x + scaled_signed_grad)

    # If clipping is needed, reset all values outside of [clip_min, clip_max]
    if (clip_min is not None) and (clip_max is not None):
        adv_x = tf.clip_by_value(adv_x, clip_min, clip_max)

    return adv_x
    # Excerpt: a single-step method from an iterative (PGD-style) attack class;
    # self.model, self.eps, self.eps_iter, self.ord, self.clip_min,
    # self.clip_max and self.targeted are attributes of that class
    def attack_single_step(self, x, eta, y):
        """
        Given the original image and the perturbation computed so far, computes
        a new perturbation.

        :param x: A tensor with the original input.
        :param eta: A tensor the same shape as x that holds the perturbation.
        :param y: A tensor with the target labels or ground-truth labels.
        """
        import tensorflow as tf
        from utils_tf import model_loss, clip_eta

        adv_x = x + eta
        preds = self.model.get_probs(adv_x)
        loss = model_loss(y, preds)
        if self.targeted:
            loss = -loss
        grad, = tf.gradients(loss, adv_x)
        scaled_signed_grad = self.eps_iter * tf.sign(grad)
        adv_x = adv_x + scaled_signed_grad
        if (self.clip_min is not None) and (self.clip_max is not None):
            adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)
        eta = adv_x - x
        eta = clip_eta(eta, self.ord, self.eps)
        return x, eta
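
attack_single_step performs one projected gradient step; a hypothetical outer
loop that iterates it (the attack method name, the nb_iter attribute, and the
zero initialization are assumptions, not from the source):

    def attack(self, x, y):
        import tensorflow as tf
        # Start from a zero perturbation and refine it nb_iter times
        eta = tf.zeros_like(x)
        for _ in range(self.nb_iter):
            x, eta = self.attack_single_step(x, eta, y)
        # Final adversarial example: original input plus the perturbation
        return x + eta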
def main(argv=None):

    # Force Keras into the training learning phase and defer variable
    # initialization to this script
    keras.layers.core.K.set_learning_phase(1)
    manual_variable_initialization(True)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get toy samples
    X_train, Y_train, X_test, Y_test = toysamples()

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 2))
    y = tf.placeholder(tf.float32, shape=(None, 2))

    # Define TF model graph
    # model = shallow_model(activation='elu')
    # predictions = model(x)
    # wrm = WassersteinRobustMethod(model, sess=sess)
    # wrm_params = {'eps': 0.25, 'ord': 2, 'y': y, 'steps': 15}
    # predictions_adv_wrm = model(wrm.generate(x, **wrm_params))
    #
    # def evaluate():
    #     # Evaluate the accuracy of the MNIST model on legitimate test examples
    #     accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params)
    #     print('Test accuracy on legitimate test examples: %0.4f' % accuracy)
    #
    #     # Accuracy of the model on Wasserstein adversarial examples
    #     accuracy_adv_wass = model_eval(sess, x, y, predictions_adv_wrm, X_test, \
    #                                    Y_test, args=eval_params)
    #     print('Test accuracy on Wasserstein examples: %0.4f\n' % accuracy_adv_wass)
    #
    # # Train the model
    # model_train(sess, x, y, predictions, X_train, Y_train, evaluate=evaluate, \
    #             args=train_params, save=False)
    # model.model.save(FLAGS.train_dir + '/' + FLAGS.filename_erm)
    # model.save_weights(FLAGS.train_dir + '/' + FLAGS.filename_erm)

    # print('')
    # print("Repeating the process, using Wasserstein adversarial training")
    # # Redefine TF model graph
    # model_adv = shallow_model(activation='elu')
    # predictions_adv = model_adv(x)
    # wrm2 = WassersteinRobustMethod(model_adv, sess=sess)
    # predictions_adv_adv_wrm = model_adv(wrm2.generate(x, **wrm_params))
    #
    # def evaluate_adv():
    #     # Accuracy of adversarially trained model on legitimate test inputs
    #     accuracy = model_eval(sess, x, y, predictions_adv, X_test, Y_test, args=eval_params)
    #     print('Test accuracy on legitimate test examples: %0.4f' % accuracy)
    #
    #     # Accuracy of the adversarially trained model on Wasserstein adversarial examples
    #     # accuracy_adv_wass = model_eval(sess, x, y, predictions_adv_adv_wrm, \
    #     #                                X_test, Y_test, args=eval_params)
    #     # print('Test accuracy on Wasserstein examples: %0.4f\n' % accuracy_adv_wass)
    #
    # model_train(sess, x, y, predictions_adv_adv_wrm, X_train, Y_train, \
    #             predictions_adv=predictions_adv_adv_wrm, evaluate=evaluate_adv, \
    #             args=train_params, save=False)
    # model_adv.save_weights(FLAGS.train_dir + '/' + FLAGS.filename_wrm)

    print('loading ' + FLAGS.train_dir + '/' + FLAGS.filename_wrm)
    model2 = shallow_model(activation='elu')
    model2.load_weights(FLAGS.train_dir + '/' + FLAGS.filename_wrm)

    # Validate the robustness certificate: sweep gamma and solve the inner
    # maximization with eps = 0.5 / gamma for each value
    g = tf.placeholder(tf.float32)
    wrm = WassersteinRobustMethod(model2, sess=sess)
    wrm_params = {'eps': 0.5 / g, 'ord': 2, 'y': y, 'steps': 15}
    x_adv = wrm.generate(x, **wrm_params)
    # predictions = model2(x)
    # accuracy = model_eval(sess, x, y, predictions, X_train, Y_train, args=eval_params)
    # print(accuracy)
    robust_surrogate = model_loss(y, model2(x_adv), mean=True)
    rho = 2 * tf.nn.l2_loss(x_adv - x)
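    # Note: robust_surrogate is the mean loss at the adversarial points, and
    # rho sums ||x_adv - x||^2 over the batch (tf.nn.l2_loss halves the squared
    # norm, hence the factor 2). Subtracting gamma * rho / n from the
    # surrogate, as done below for gamma = 2, recovers the certificate
    # E[phi_gamma].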

    with sess.as_default():
        train_rho = []
        train_loss = []
        test_rho = []
        test_loss = []

        for gamma in np.arange(1.32, 5, 0.1):
            adv = sess.run(x_adv, feed_dict={x: X_train, y: Y_train, g: gamma})

            # Visualize the perturbed training points for this gamma
            posx, posy = adv.T
            plt.plot(posx, posy, 'x')
            plt.axis('equal')
            plt.show()

            certificate, rho_train = sess.run([robust_surrogate, rho],
                                              feed_dict={
                                                  x: X_train,
                                                  y: Y_train,
                                                  g: gamma
                                              })
            test_worst_loss, rho_test = sess.run([robust_surrogate, rho],
                                                 feed_dict={
                                                     x: X_test,
                                                     y: Y_test,
                                                     g: gamma
                                                 })

            # (train surrogate loss, mean train rho,
            #  worst-case test loss, mean test rho)
            print(certificate, rho_train / X_train.shape[0], test_worst_loss,
                  rho_test / X_test.shape[0])

            train_rho.append(rho_train / X_train.shape[0])
            train_loss.append(certificate)
            test_rho.append(rho_test / X_test.shape[0])
            test_loss.append(test_worst_loss)

        # Recompute the surrogate at gamma = 2 and subtract gamma * E[c] to
        # obtain the certificate E[phi_gamma]
        certificate, rho_train = sess.run([robust_surrogate, rho],
                                          feed_dict={
                                              x: X_train,
                                              y: Y_train,
                                              g: 2.0
                                          })
        certificate = certificate - 2 * rho_train / X_train.shape[0]
        print(rho_train / X_train.shape[0])

        # Red line: the certificate bound certificate + gamma * rho over a grid
        # of budgets rho; blue curve: the attained (rho, loss) pairs on the
        # test set from the gamma sweep above
        rho_grid = np.arange(0, 0.8, 0.05)
        plt.plot(rho_grid, certificate + rho_grid * 2, '-r', linewidth=1.0)
        plt.plot(test_rho, test_loss, '-b', linewidth=1.0)
        plt.show()