Example #1
def print_accuracies(filepath, train_start=TRAIN_START, train_end=TRAIN_END,
                     test_start=TEST_START, test_end=TEST_END,
                     batch_size=BATCH_SIZE, which_set=WHICH_SET,
                     base_eps_iter=BASE_EPS_ITER,
                     nb_iter=NB_ITER):
  """
  Load a saved model and print out its accuracy on different data distributions

  This function works by running a single attack on each example.
  This provides a reasonable estimate of the true failure rate quickly, so
  long as the model does not suffer from gradient masking.
  However, this estimate is mostly intended for development work and not
  for publication. A more accurate estimate may be obtained by running
  an attack bundler instead.

  :param filepath: path to model to evaluate
  :param train_start: index of first training set example to use
  :param train_end: index of last training set example to use
  :param test_start: index of first test set example to use
  :param test_end: index of last test set example to use
  :param batch_size: size of evaluation batches
  :param which_set: 'train' or 'test'
  :param base_eps_iter: step size if the data were in [0,1]
    (Step size will be rescaled proportional to the actual data range)
  :param nb_iter: Number of iterations of PGD to run per class
  """

  # Set TF random seed to improve reproducibility
  tf.set_random_seed(20181014)
  set_log_level(logging.INFO)
  sess = tf.Session()

  with sess.as_default():
    model = load(filepath)
  assert len(model.get_params()) > 0
  factory = model.dataset_factory
  factory.kwargs['train_start'] = train_start
  factory.kwargs['train_end'] = train_end
  factory.kwargs['test_start'] = test_start
  factory.kwargs['test_end'] = test_end
  dataset = factory()


  x_data, y_data = dataset.get_set(which_set)

  impl(sess, model, dataset, factory, x_data, y_data, base_eps_iter, nb_iter)
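
The default arguments above reference module-level constants that are not shown in this snippet (presumably defined elsewhere in the script, e.g. via command-line flags). A minimal sketch of how they might be defined and how the function could be invoked; all values here are hypothetical stand-ins:

# Hypothetical defaults standing in for the script's command-line flags.
TRAIN_START = 0
TRAIN_END = 60000
TEST_START = 0
TEST_END = 10000
BATCH_SIZE = 128
WHICH_SET = 'test'
BASE_EPS_ITER = None  # None lets the attack code pick a dataset-specific step size, as in the later examples
NB_ITER = 40

# print_accuracies('model.joblib')  # evaluate a previously saved model
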
Example #2
def main(argv):

  model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

  if model_file is None:
    print('No model found')
    sys.exit()

  set_log_level(logging.DEBUG)

  sess = tf.Session()
  with sess.as_default():

    model = make_wresnet()
    saver = tf.train.Saver()
    # Restore the checkpoint
    saver.restore(sess, model_file)
    SCOPE = "cifar10_challenge"
    model2 = make_wresnet(scope=SCOPE)
    assert len(model.get_vars()) == len(model2.get_vars())
    found = [False] * len(model2.get_vars())
    for var1 in model.get_vars():
      var1_found = False
      var2_name = SCOPE + "/" + var1.name
      for idx, var2 in enumerate(model2.get_vars()):
        if var2.name == var2_name:
          var1_found = True
          found[idx] = True
          sess.run(tf.assign(var2, var1))
          break
      assert var1_found, var1.name
    assert all(found)

    model2.dataset_factory = Factory(CIFAR10, {"max_val": 255})

    serial.save("model.joblib", model2)
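
The loop above copies every checkpoint variable into the copy of the network built under the "cifar10_challenge" scope by matching variable names. A minimal standalone sketch of the same name-matching copy in plain TensorFlow 1.x, using toy variables rather than the wide ResNet:

import tensorflow as tf

with tf.variable_scope("src"):
  a = tf.get_variable("w", shape=[2], initializer=tf.ones_initializer())
with tf.variable_scope("dst"):
  b = tf.get_variable("w", shape=[2], initializer=tf.zeros_initializer())

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  dst_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="dst")
  for src_var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="src"):
    # Prefix the bare variable name with the destination scope, as above.
    target_name = "dst/" + src_var.name.split("/", 1)[1]
    for dst_var in dst_vars:
      if dst_var.name == target_name:
        sess.run(tf.assign(dst_var, src_var))
  print(sess.run(b))  # [1. 1.]
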
Example #3
def SNNL_example(train_start=0,
                 train_end=60000,
                 test_start=0,
                 test_end=10000,
                 nb_epochs=NB_EPOCHS,
                 batch_size=BATCH_SIZE,
                 learning_rate=LEARNING_RATE,
                 nb_filters=NB_FILTERS,
                 SNNL_factor=SNNL_FACTOR,
                 output_dir=OUTPUT_DIR):
    """
  A simple model trained to minimize Cross Entropy and Maximize Soft Nearest
  Neighbor Loss at each internal layer. This outputs a TSNE of the sign of
  the adversarial gradients of a trained model. A model with a negative
  SNNL_factor will show little or no class clusters, while a model with a
  0 SNNL_factor will have class clusters in the adversarial gradient direction.
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param SNNL_factor: multiplier for Soft Nearest Neighbor Loss
  :return: an AccuracyReport object
  """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        print('Test accuracy on legitimate examples: %0.4f' % (acc))

    model = ModelBasicCNN('model', nb_classes, nb_filters)
    preds = model.get_logits(x)
    cross_entropy_loss = CrossEntropy(model)
    if not SNNL_factor:
        loss = cross_entropy_loss
    else:
        loss = SNNLCrossEntropy(model,
                                factor=SNNL_factor,
                                optimize_temperature=False)

    def evaluate():
        do_eval(preds, x_test, y_test, 'clean_train_clean_eval')

    train(sess,
          loss,
          x_train,
          y_train,
          evaluate=evaluate,
          args=train_params,
          rng=rng,
          var_list=model.get_params())

    do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

    def imscatter(points, images, ax=None, zoom=1, cmap="hot"):
        if ax is None:
            ax = plt.gca()
        artists = []
        i = 0
        if not isinstance(cmap, list):
            cmap = [cmap] * len(points)
        for x0, y0 in points:
            transformed = (images[i] - np.min(images[i])) / \
                (np.max(images[i]) - np.min(images[i]))
            im = OffsetImage(transformed[:, :, 0], zoom=zoom, cmap=cmap[i])
            ab = AnnotationBbox(im, (x0, y0), xycoords='data', frameon=False)
            artists.append(ax.add_artist(ab))
            i += 1
        ax.update_datalim(np.column_stack(np.transpose(points)))
        ax.autoscale()
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        return artists

    adv_grads = tf.sign(tf.gradients(cross_entropy_loss.fprop(x, y), x))
    feed_dict = {x: x_test[:batch_size], y: y_test[:batch_size]}
    adv_grads_val = sess.run(adv_grads, feed_dict=feed_dict)
    adv_grads_val = np.reshape(adv_grads_val,
                               (batch_size, img_rows * img_cols))

    X_embedded = TSNE(n_components=2, verbose=0).fit_transform(adv_grads_val)
    plt.figure(num=None,
               figsize=(50, 50),
               dpi=40,
               facecolor='w',
               edgecolor='k')
    plt.title(
        "TSNE of Sign of Adv Gradients, SNNLCrossEntropy Model, factor:" +
        str(FLAGS.SNNL_factor),
        fontsize=42)
    imscatter(X_embedded, x_test[:batch_size], zoom=2, cmap="Purples")
    plt.savefig(output_dir + 'adversarial_gradients_SNNL_factor_' +
                str(SNNL_factor) + '.png')
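
The helper above embeds the per-example gradient signs with t-SNE and draws each point as a thumbnail of its input image. A self-contained sketch of the same visualization on random stand-in data, assuming scikit-learn's TSNE and matplotlib's OffsetImage/AnnotationBbox (the classes the snippet itself relies on):

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnnotationBbox, OffsetImage
from sklearn.manifold import TSNE

rng = np.random.RandomState(0)
fake_grad_signs = rng.choice([-1.0, 1.0], size=(64, 28 * 28))  # stand-in for adv_grads_val
fake_images = rng.rand(64, 28, 28, 1)                          # stand-in for x_test[:batch_size]

embedded = TSNE(n_components=2, verbose=0).fit_transform(fake_grad_signs)

ax = plt.gca()
for (x0, y0), img in zip(embedded, fake_images):
    box = AnnotationBbox(OffsetImage(img[:, :, 0], zoom=1, cmap="Purples"),
                         (x0, y0), xycoords='data', frameon=False)
    ax.add_artist(box)
ax.update_datalim(embedded)
ax.autoscale()
plt.savefig('tsne_sketch.png')
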
Example #4
def cifar10_tutorial(train_start=0,
                     train_end=60000,
                     test_start=0,
                     test_end=10000,
                     nb_epochs=NB_EPOCHS,
                     batch_size=BATCH_SIZE,
                     learning_rate=LEARNING_RATE,
                     clean_train=CLEAN_TRAIN,
                     testing=False,
                     backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                     nb_filters=NB_FILTERS,
                     num_threads=None,
                     label_smoothing=0.1):
    """
  CIFAR10 cleverhans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param clean_train: perform normal training on clean examples only
                      before performing adversarial training.
  :param testing: if true, complete an AccuracyReport for unit tests
                  to verify that performance is adequate
  :param backprop_through_attack: If True, backprop through adversarial
                                  example construction process during
                                  adversarial training.
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get CIFAR10 data
    data = CIFAR10(train_start=train_start,
                   train_end=train_end,
                   test_start=test_start,
                   test_end=test_end)
    dataset_size = data.x_train.shape[0]
    dataset_train = data.to_tensorflow()[0]
    dataset_train = dataset_train.map(
        lambda x, y: (random_shift(random_horizontal_flip(x)), y), 4)
    dataset_train = dataset_train.batch(batch_size)
    dataset_train = dataset_train.prefetch(16)
    x_train, y_train = data.get_set('train')
    x_test, y_test = data.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_test.shape[1:4]
    nb_classes = y_test.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = ModelAllConvolutional('model1',
                                      nb_classes,
                                      nb_filters,
                                      input_shape=[32, 32, 3])
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess,
              loss,
              None,
              None,
              dataset_train=dataset_train,
              dataset_size=dataset_size,
              evaluate=evaluate,
              args=train_params,
              rng=rng,
              var_list=model.get_params())

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the CIFAR10 model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = ModelAllConvolutional('model2',
                                   nb_classes,
                                   nb_filters,
                                   input_shape=[32, 32, 3])
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate2():
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess,
          loss2,
          None,
          None,
          dataset_train=dataset_train,
          dataset_size=dataset_size,
          evaluate=evaluate2,
          args=train_params,
          rng=rng,
          var_list=model2.get_params())

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
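
FastGradientMethod above is cleverhans' implementation of the fast gradient sign method: each input is perturbed by eps in the direction of the sign of the loss gradient, then clipped back into the valid range. As a point of reference, a minimal sketch of that update rule in plain TensorFlow 1.x (not the library code):

import tensorflow as tf

def fgsm_sketch(model_fn, x, y, eps=0.3, clip_min=0., clip_max=1.):
    """Return x + eps * sign(d loss / d x), clipped to [clip_min, clip_max]."""
    logits = model_fn(x)
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits)
    grad, = tf.gradients(loss, x)
    adv_x = tf.clip_by_value(x + eps * tf.sign(grad), clip_min, clip_max)
    # As in the tutorial, stop gradients so adversarial training does not
    # backprop through the example-construction process by default.
    return tf.stop_gradient(adv_x)
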
Example #5
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS, batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE):
  """
  MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param viz_enabled: (boolean) activate plots of adversarial examples
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param nb_classes: number of output classes
  :param source_samples: number of test inputs to attack
  :param learning_rate: learning rate for training
  :return: an AccuracyReport object
  """
  # Object used to keep track of (and return) key accuracies
  report = AccuracyReport()

  # Set TF random seed to improve reproducibility
  tf.set_random_seed(1234)

  # Create TF session
  sess = tf.Session()
  print("Created TensorFlow session.")

  set_log_level(logging.DEBUG)

  # Get MNIST test data
  mnist = MNIST(train_start=train_start, train_end=train_end,
                test_start=test_start, test_end=test_end)
  x_train, y_train = mnist.get_set('train')
  x_test, y_test = mnist.get_set('test')

  # Obtain Image Parameters
  img_rows, img_cols, nchannels = x_train.shape[1:4]
  nb_classes = y_train.shape[1]

  # Define input TF placeholder
  x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                        nchannels))
  y = tf.placeholder(tf.float32, shape=(None, nb_classes))

  nb_filters = 64
  # Define TF model graph
  model = ModelBasicCNN('model1', nb_classes, nb_filters)
  preds = model.get_logits(x)
  loss = CrossEntropy(model, smoothing=0.1)
  print("Defined TensorFlow model graph.")

  ###########################################################################
  # Training the model using TensorFlow
  ###########################################################################

  # Train an MNIST model
  train_params = {
      'nb_epochs': nb_epochs,
      'batch_size': batch_size,
      'learning_rate': learning_rate
  }
  sess.run(tf.global_variables_initializer())
  rng = np.random.RandomState([2017, 8, 30])
  train(sess, loss, x_train, y_train, args=train_params, rng=rng)

  # Evaluate the accuracy of the MNIST model on legitimate test examples
  eval_params = {'batch_size': batch_size}
  accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
  assert x_test.shape[0] == test_end - test_start, x_test.shape
  print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
  report.clean_train_clean_eval = accuracy

  ###########################################################################
  # Craft adversarial examples using the Jacobian-based saliency map approach
  ###########################################################################
  print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
        ' adversarial examples')

  # Keep track of success (adversarial example classified in target)
  results = np.zeros((nb_classes, source_samples), dtype='i')

  # Rate of perturbed features for each test set example and target class
  perturbations = np.zeros((nb_classes, source_samples), dtype='f')

  # Initialize our array for grid visualization
  grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
  grid_viz_data = np.zeros(grid_shape, dtype='f')

  # Instantiate a SaliencyMapMethod attack object
  jsma = SaliencyMapMethod(model, sess=sess)
  jsma_params = {'theta': 1., 'gamma': 0.1,
                 'clip_min': 0., 'clip_max': 1.,
                 'y_target': None}

  figure = None
  # Loop over the samples we want to perturb into adversarial examples
  for sample_ind in xrange(0, source_samples):
    print('--------------------------------------')
    print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
    sample = x_test[sample_ind:(sample_ind + 1)]

    # We want to find an adversarial example for each possible target class
    # (i.e. all classes that differ from the label given in the dataset)
    current_class = int(np.argmax(y_test[sample_ind]))
    target_classes = other_classes(nb_classes, current_class)

    # For the grid visualization, keep original images along the diagonal
    grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
        sample, (img_rows, img_cols, nchannels))

    # Loop over all target classes
    for target in target_classes:
      print('Generating adv. example for target class %i' % target)

      # This call runs the Jacobian-based saliency map approach
      one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
      one_hot_target[0, target] = 1
      jsma_params['y_target'] = one_hot_target
      adv_x = jsma.generate_np(sample, **jsma_params)

      # Check if success was achieved
      res = int(model_argmax(sess, x, preds, adv_x) == target)

      # Compute the number of modified features
      adv_x_reshape = adv_x.reshape(-1)
      test_in_reshape = x_test[sample_ind].reshape(-1)
      nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
      percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

      # Display the original and adversarial images side-by-side
      if viz_enabled:
        figure = pair_visual(
            np.reshape(sample, (img_rows, img_cols, nchannels)),
            np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

      # Add our adversarial example to our grid data
      grid_viz_data[target, current_class, :, :, :] = np.reshape(
          adv_x, (img_rows, img_cols, nchannels))

      # Update the arrays for later analysis
      results[target, sample_ind] = res
      perturbations[target, sample_ind] = percent_perturb

  print('--------------------------------------')

  # Compute the number of adversarial examples that were successfully found
  nb_targets_tried = ((nb_classes - 1) * source_samples)
  succ_rate = float(np.sum(results)) / nb_targets_tried
  print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
  report.clean_train_adv_eval = 1. - succ_rate

  # Compute the average distortion introduced by the algorithm
  percent_perturbed = np.mean(perturbations)
  print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

  # Compute the average distortion introduced for successful samples only
  percent_perturb_succ = np.mean(perturbations * (results == 1))
  print('Avg. rate of perturbed features for successful '
        'adversarial examples {0:.4f}'.format(percent_perturb_succ))

  # Close TF session
  sess.close()

  # Finally, block & display a grid of all the adversarial examples
  if viz_enabled:
    import matplotlib.pyplot as plt
    plt.close(figure)
    _ = grid_visual(grid_viz_data)

  return report
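
The per-target bookkeeping above boils down to counting how many input features JSMA modified. A tiny NumPy check of that computation on toy arrays (a hypothetical five-pixel perturbation, not real attack output):

import numpy as np

original = np.zeros((28, 28, 1), dtype=np.float32)
adversarial = original.copy()
adversarial[0, :5, 0] = 1.  # pretend the attack flipped five pixels

adv_flat = adversarial.reshape(-1)
orig_flat = original.reshape(-1)
nb_changed = np.where(adv_flat != orig_flat)[0].shape[0]
percent_perturb = float(nb_changed) / adv_flat.shape[0]
print(percent_perturb)  # 5 / 784, roughly 0.0064

Example #6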
def evaluate_model(filepath,
                   train_start=0, train_end=60000, test_start=0,
                   test_end=10000, batch_size=128,
                   testing=False, num_threads=None):
  """
  Run evaluation on a saved model
  :param filepath: path to model to evaluate
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param batch_size: size of evaluation batches
  """

  # Set TF random seed to improve reproducibility
  tf.set_random_seed(1234)

  # Set logging level to see debug information
  set_log_level(logging.INFO)

  # Create TF session
  if num_threads:
    config_args = dict(intra_op_parallelism_threads=1)
  else:
    config_args = {}
  sess = tf.Session(config=tf.ConfigProto(**config_args))

  # Get MNIST test data
  mnist = MNIST(train_start=train_start, train_end=train_end,
                test_start=test_start, test_end=test_end)
  x_train, y_train = mnist.get_set('train')
  x_test, y_test = mnist.get_set('test')

  # Use Image Parameters
  img_rows, img_cols, nchannels = x_train.shape[1:4]
  nb_classes = y_train.shape[1]

  # Define input TF placeholder
  x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                        nchannels))
  y = tf.placeholder(tf.float32, shape=(None, nb_classes))

  eval_params = {'batch_size': batch_size}
  fgsm_params = {
      'eps': 0.3,
      'clip_min': 0.,
      'clip_max': 1.
  }

  def do_eval(preds, x_set, y_set, report_key, is_adv=None):
    acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
    if is_adv is None:
      report_text = None
    elif is_adv:
      report_text = 'adversarial'
    else:
      report_text = 'legitimate'
    if report_text:
      print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

  with sess.as_default():
    model = load(filepath)
  assert len(model.get_params()) > 0

  # Initialize the Fast Gradient Sign Method (FGSM) attack object and
  # graph
  fgsm = FastGradientMethod(model, sess=sess)
  adv_x = fgsm.generate(x, **fgsm_params)
  preds_adv = model.get_logits(adv_x)
  preds = model.get_logits(x)

  # Evaluate the accuracy of the MNIST model on legitimate and adversarial examples
  do_eval(preds, x_test, y_test, 'train_clean_train_clean_eval', False)
  do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)
Example #7
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=NB_FILTERS,
                   num_threads=None,
                   attack_string=None):
    """
  MNIST cleverhans tutorial
  :param train_start: index of first training set example.
  :param train_end: index of last training set example.
  :param test_start: index of first test set example.
  :param test_end: index of last test set example.
  :param nb_epochs: number of epochs to train model.
  :param batch_size: size of training batches.
  :param learning_rate: learning rate for training.
  :param clean_train: perform normal training on clean examples only
                      before performing adversarial training.
  :param testing: if true, complete an AccuracyReport for unit tests
                  to verify that performance is adequate.
  :param backprop_through_attack: If True, backprop through adversarial
                                  example construction process during
                                  adversarial training.
  :param nb_filters: number of filters in the CNN used for training.
  :param num_threads: number of threads used for running the process.
  :param attack_string: attack name for crafting adversarial attacks and
                          adversarial training, in string format.
  :return: an AccuracyReport object
  """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    X_train, Y_train = mnist.get_set('train')
    X_test, Y_test = mnist.get_set('test')

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }

    # Initialize the attack object
    attack_class = attack_selection(attack_string)
    attack_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}

    rng = np.random.RandomState([2018, 6, 18])
    if clean_train:
        model = ModelBasicCNNTFE(nb_filters=nb_filters)

        def evaluate_clean():
            """Evaluate the accuracy of the MNIST model on legitimate test
      examples
      """
            eval_params = {'batch_size': batch_size}
            acc = model_eval(model, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        train(model,
              X_train,
              Y_train,
              evaluate=evaluate_clean,
              args=train_params,
              rng=rng,
              var_list=model.get_params())

        if testing:
            # Calculate training error
            eval_params = {'batch_size': batch_size}
            acc = model_eval(model, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        attack = attack_class(model)
        acc = model_eval(model,
                         X_test,
                         Y_test,
                         args=eval_par,
                         attack=attack,
                         attack_args=attack_params)
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(model,
                             X_train,
                             Y_train,
                             args=eval_par,
                             attack=attack,
                             attack_args=attack_params)
            print('Train accuracy on adversarial examples: %0.4f\n' % acc)
            report.train_clean_train_adv_eval = acc

        attack = None
        print("Repeating the process, using adversarial training")

    model_adv_train = ModelBasicCNNTFE(nb_filters=nb_filters)
    attack = attack_class(model_adv_train)

    def evaluate_adv():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(model_adv_train,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy
        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(model_adv_train,
                              X_test,
                              Y_test,
                              args=eval_params,
                              attack=attack,
                              attack_args=attack_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    train(model_adv_train,
          X_train,
          Y_train,
          evaluate=evaluate_adv,
          args=train_params,
          rng=rng,
          var_list=model_adv_train.get_params(),
          attack=attack,
          attack_args=attack_params)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(model_adv_train,
                              X_train,
                              Y_train,
                              args=eval_params,
                              attack=None,
                              attack_args=None)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(model_adv_train,
                              X_train,
                              Y_train,
                              args=eval_params,
                              attack=attack,
                              attack_args=attack_params)
        report.train_adv_train_adv_eval = accuracy
    return report
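
The label-smoothing step near the top of this example (Y_train.clip(label_smooth / 9., 1. - label_smooth)) turns each one-hot target into a softened distribution; a quick NumPy check of what it produces:

import numpy as np

label_smooth = .1
y = np.zeros((1, 10), dtype=np.float32)
y[0, 3] = 1.
print(y.clip(label_smooth / 9., 1. - label_smooth))
# The correct class becomes 0.9; each of the nine other classes becomes ~0.0111.
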
Example #8
def make_confidence_report(filepath,
                           train_start=TRAIN_START,
                           train_end=TRAIN_END,
                           test_start=TEST_START,
                           test_end=TEST_END,
                           batch_size=BATCH_SIZE,
                           which_set=WHICH_SET,
                           mc_batch_size=MC_BATCH_SIZE,
                           report_path=REPORT_PATH,
                           base_eps_iter=BASE_EPS_ITER,
                           nb_iter=NB_ITER,
                           save_advx=SAVE_ADVX):
    """
  Load a saved model, gather its predictions, and save a confidence report.


  This function works by running a single MaxConfidence attack on each example.
  This provides a reasonable estimate of the true failure rate quickly, so
  long as the model does not suffer from gradient masking.
  However, this estimate is mostly intended for development work and not
  for publication. A more accurate estimate may be obtained by running
  make_confidence_report_bundled.py instead.

  :param filepath: path to model to evaluate
  :param train_start: index of first training set example to use
  :param train_end: index of last training set example to use
  :param test_start: index of first test set example to use
  :param test_end: index of last test set example to use
  :param batch_size: size of evaluation batches
  :param which_set: 'train' or 'test'
  :param mc_batch_size: batch size for MaxConfidence attack
  :param base_eps_iter: step size if the data were in [0,1]
    (Step size will be rescaled proportional to the actual data range)
  :param nb_iter: Number of iterations of PGD to run per class
  :param save_advx: bool. If True, saves the adversarial examples to disk.
    On by default, but can be turned off to save memory, etc.
  """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.INFO)

    # Create TF session
    sess = tf.Session()

    if report_path is None:
        assert filepath.endswith('.joblib')
        report_path = filepath[:-len('.joblib')] + "_report.joblib"

    with sess.as_default():
        model = load(filepath)
    assert len(model.get_params()) > 0
    factory = model.dataset_factory
    factory.kwargs['train_start'] = train_start
    factory.kwargs['train_end'] = train_end
    factory.kwargs['test_start'] = test_start
    factory.kwargs['test_end'] = test_end
    dataset = factory()

    center = dataset.kwargs['center']
    max_val = dataset.kwargs['max_val']
    value_range = max_val * (1. + center)
    min_value = 0. - center * max_val

    if 'CIFAR' in str(factory.cls):
        base_eps = 8. / 255.
        if base_eps_iter is None:
            base_eps_iter = 2. / 255.
    elif 'MNIST' in str(factory.cls):
        base_eps = .3
        if base_eps_iter is None:
            base_eps_iter = .1
    else:
        raise NotImplementedError(str(factory.cls))

    mc_params = {
        'eps': base_eps * value_range,
        'eps_iter': base_eps_iter * value_range,
        'nb_iter': nb_iter,
        'clip_min': min_value,
        'clip_max': max_val
    }

    x_data, y_data = dataset.get_set(which_set)

    report = ConfidenceReport()

    semantic = Semantic(model, center, max_val, sess)
    mc = MaxConfidence(model, sess=sess)

    jobs = [('clean', None, None, None, False),
            ('Semantic', semantic, None, None, False),
            ('mc', mc, mc_params, mc_batch_size, True)]

    for job in jobs:
        name, attack, attack_params, job_batch_size, save_this_job = job
        if job_batch_size is None:
            job_batch_size = batch_size
        t1 = time.time()
        if save_advx and save_this_job:
            # If we want to save the adversarial examples to the filesystem, we need
            # to fetch all of them. Otherwise they're just computed one batch at a
            # time and discarded

            # The path to save to
            assert report_path.endswith('.joblib')
            advx_path = report_path[:-len('.joblib')] + '_advx_' + name + '.npy'

            # Fetch the adversarial examples
            x_data = run_attack(sess,
                                model,
                                x_data,
                                y_data,
                                attack,
                                attack_params,
                                batch_size=job_batch_size,
                                devices=devices)

            # Turn off the attack so `correctness_and_confidence` won't run it a
            # second time.
            attack = None
            attack_params = None

            # Save the adversarial examples
            np.save(advx_path, x_data)

        # Run correctness and confidence evaluation on adversarial examples
        packed = correctness_and_confidence(sess,
                                            model,
                                            x_data,
                                            y_data,
                                            batch_size=job_batch_size,
                                            devices=devices,
                                            attack=attack,
                                            attack_params=attack_params)
        t2 = time.time()
        print("Evaluation took", t2 - t1, "seconds")
        correctness, confidence = packed

        report[name] = ConfidenceReportEntry(correctness=correctness,
                                             confidence=confidence)

        print_stats(correctness, confidence, name)

    save(report_path, report)
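
The attack parameters above are expressed on a [0, 1] scale and rescaled by the dataset's value range before being handed to MaxConfidence. A quick sanity check of that arithmetic with illustrative center/max_val values (the actual values come from the dataset's kwargs):

# CIFAR-10 stored on a [0, 255] scale, not centered (illustrative values):
center, max_val = 0., 255.
value_range = max_val * (1. + center)  # 255
print((8. / 255.) * value_range)       # eps of 8 pixel levels
print((2. / 255.) * value_range)       # eps_iter of 2 pixel levels

# MNIST stored on a [0, 1] scale, not centered (illustrative values):
center, max_val = 0., 1.
value_range = max_val * (1. + center)  # 1
print(.3 * value_range)                # eps of 0.3
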
Example #9
def make_confidence_report_bundled(filepath,
                                   train_start=TRAIN_START,
                                   train_end=TRAIN_END,
                                   test_start=TEST_START,
                                   test_end=TEST_END,
                                   which_set=WHICH_SET,
                                   recipe=RECIPE,
                                   report_path=REPORT_PATH,
                                   nb_iter=NB_ITER,
                                   base_eps=None,
                                   base_eps_iter=None,
                                   base_eps_iter_small=None,
                                   batch_size=BATCH_SIZE):
    """
  Load a saved model, gather its predictions, and save a confidence report.
  :param filepath: path to model to evaluate
  :param train_start: index of first training set example to use
  :param train_end: index of last training set example to use
  :param test_start: index of first test set example to use
  :param test_end: index of last test set example to use
  :param which_set: 'train' or 'test'
  :param nb_iter: int, number of iterations of attack algorithm
    (note that different recipes will use this differently,
     for example many will run two attacks, one with nb_iter
     iterations and one with 25X more)
  :param base_eps: float, epsilon parameter for threat model, on a scale of [0, 1].
    Inferred from the dataset if not specified.
  :param base_eps_iter: float, a step size used in different ways by different recipes.
    Typically the step size for a PGD attack.
    Inferred from the dataset if not specified.
  :param base_eps_iter_small: float, a second step size for a more fine-grained attack.
    Inferred from the dataset if not specified.
  :param batch_size: int, batch size
  """
    # Avoid circular import
    from src.FGSM.cleverhans.cleverhans import attack_bundling
    if callable(recipe):
        run_recipe = recipe
    else:
        run_recipe = getattr(attack_bundling, recipe)

    # Set logging level to see debug information
    set_log_level(logging.INFO)

    # Create TF session
    sess = tf.Session()

    assert filepath.endswith('.joblib')
    if report_path is None:
        report_path = filepath[:-len('.joblib')] + "_bundled_report.joblib"

    with sess.as_default():
        model = load(filepath)
    assert len(model.get_params()) > 0
    factory = model.dataset_factory
    factory.kwargs['train_start'] = train_start
    factory.kwargs['train_end'] = train_end
    factory.kwargs['test_start'] = test_start
    factory.kwargs['test_end'] = test_end
    dataset = factory()

    center = dataset.kwargs['center']
    if 'max_val' in factory.kwargs:
        max_value = factory.kwargs['max_val']
    elif hasattr(dataset, 'max_val'):
        max_value = dataset.max_val
    else:
        raise AttributeError("Can't find max_value specification")
    min_value = 0. - center * max_value
    value_range = max_value - min_value

    if 'CIFAR' in str(factory.cls):
        if base_eps is None:
            base_eps = 8. / 255.
        if base_eps_iter is None:
            base_eps_iter = 2. / 255.
        if base_eps_iter_small is None:
            base_eps_iter_small = 1. / 255.
    elif 'MNIST' in str(factory.cls):
        if base_eps is None:
            base_eps = .3
        if base_eps_iter is None:
            base_eps_iter = .1
        base_eps_iter_small = None
    else:
        # Note that it is not required to specify base_eps_iter_small
        if base_eps is None or base_eps_iter is None:
            raise NotImplementedError("Not able to infer threat model from " +
                                      str(factory.cls))

    eps = base_eps * value_range
    eps_iter = base_eps_iter * value_range
    if base_eps_iter_small is None:
        eps_iter_small = None
    else:
        eps_iter_small = base_eps_iter_small * value_range
    clip_min = min_value
    clip_max = max_value

    x_data, y_data = dataset.get_set(which_set)
    assert x_data.max() <= max_value
    assert x_data.min() >= min_value

    assert eps_iter <= eps
    assert eps_iter_small is None or eps_iter_small <= eps

    # Different recipes take different arguments.
    # For now I don't have an idea for a beautiful unifying framework, so
    # we get an if statement.
    if recipe == 'random_search_max_confidence_recipe':
        # pylint always checks against the default recipe here
        # pylint: disable=no-value-for-parameter
        run_recipe(sess=sess,
                   model=model,
                   x=x_data,
                   y=y_data,
                   eps=eps,
                   clip_min=clip_min,
                   clip_max=clip_max,
                   report_path=report_path)
    else:
        run_recipe(sess=sess,
                   model=model,
                   x=x_data,
                   y=y_data,
                   nb_classes=dataset.NB_CLASSES,
                   eps=eps,
                   clip_min=clip_min,
                   clip_max=clip_max,
                   eps_iter=eps_iter,
                   nb_iter=nb_iter,
                   report_path=report_path,
                   eps_iter_small=eps_iter_small,
                   batch_size=batch_size)
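
The recipe argument is resolved either as a callable or, when given as a string, by looking the name up on the attack_bundling module with getattr. A minimal standalone sketch of that dispatch pattern with a toy stand-in module (the names here are illustrative, not the cleverhans API):

class ToyBundling(object):
    """Stand-in for the attack_bundling module."""

    @staticmethod
    def toy_recipe(**kwargs):
        print("running toy_recipe with", sorted(kwargs))

def resolve_recipe(recipe, module=ToyBundling):
    # Accept either a callable or the name of an attribute on the module.
    return recipe if callable(recipe) else getattr(module, recipe)

run_recipe = resolve_recipe('toy_recipe')
run_recipe(eps=0.3, nb_iter=40)
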
Example #10
def make_confidence_report_spsa(filepath, train_start=TRAIN_START,
                                train_end=TRAIN_END,
                                test_start=TEST_START, test_end=TEST_END,
                                batch_size=BATCH_SIZE, which_set=WHICH_SET,
                                report_path=REPORT_PATH,
                                nb_iter=NB_ITER_SPSA,
                                spsa_samples=SPSA_SAMPLES,
                                spsa_iters=SPSA.DEFAULT_SPSA_ITERS):
  """
  Load a saved model, gather its predictions, and save a confidence report.


  This function works by running a single MaxConfidence attack on each example,
  using SPSA as the underlying optimizer.
  This is not intended to be a strong generic attack.
  It is intended to be a test to uncover gradient masking.

  :param filepath: path to model to evaluate
  :param train_start: index of first training set example to use
  :param train_end: index of last training set example to use
  :param test_start: index of first test set example to use
  :param test_end: index of last test set example to use
  :param batch_size: size of evaluation batches
  :param which_set: 'train' or 'test'
  :param nb_iter: Number of iterations of PGD to run per class
  :param spsa_samples: Number of samples for SPSA
  """

  # Set TF random seed to improve reproducibility
  tf.set_random_seed(1234)

  # Set logging level to see debug information
  set_log_level(logging.INFO)

  # Create TF session
  sess = tf.Session()

  if report_path is None:
    assert filepath.endswith('.joblib')
    report_path = filepath[:-len('.joblib')] + "_spsa_report.joblib"

  with sess.as_default():
    model = load(filepath)
  assert len(model.get_params()) > 0
  factory = model.dataset_factory
  factory.kwargs['train_start'] = train_start
  factory.kwargs['train_end'] = train_end
  factory.kwargs['test_start'] = test_start
  factory.kwargs['test_end'] = test_end
  dataset = factory()

  center = dataset.kwargs['center']
  center = np.float32(center)
  max_val = dataset.kwargs['max_val']
  max_val = np.float32(max_val)
  value_range = max_val * (1. + center)
  min_value = np.float32(0. - center * max_val)

  if 'CIFAR' in str(factory.cls):
    base_eps = 8. / 255.
  elif 'MNIST' in str(factory.cls):
    base_eps = .3
  else:
    raise NotImplementedError(str(factory.cls))

  eps = np.float32(base_eps * value_range)
  clip_min = min_value
  clip_max = max_val

  x_data, y_data = dataset.get_set(which_set)

  nb_classes = dataset.NB_CLASSES

  spsa_max_confidence_recipe(sess, model, x_data, y_data, nb_classes, eps,
                             clip_min, clip_max, nb_iter, report_path,
                             spsa_samples=spsa_samples,
                             spsa_iters=spsa_iters,
                             eval_batch_size=batch_size)
Example #11
def mnist_tutorial_cw(train_start=0,
                      train_end=60000,
                      test_start=0,
                      test_end=10000,
                      viz_enabled=VIZ_ENABLED,
                      nb_epochs=NB_EPOCHS,
                      batch_size=BATCH_SIZE,
                      source_samples=SOURCE_SAMPLES,
                      learning_rate=LEARNING_RATE,
                      attack_iterations=ATTACK_ITERATIONS,
                      model_path=MODEL_PATH,
                      targeted=TARGETED):
    """
  MNIST tutorial for Carlini and Wagner's attack
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param viz_enabled: (boolean) activate plots of adversarial examples
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param nb_classes: number of output classes
  :param source_samples: number of test inputs to attack
  :param learning_rate: learning rate for training
  :param model_path: path to the model file
  :param targeted: should we run a targeted attack? or untargeted?
  :return: an AccuracyReport object
  """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))
    nb_filters = 64

    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }

    rng = np.random.RandomState([2017, 8, 30])
    # check if we've trained before, and if we have, use that pre-trained model
    if os.path.exists(model_path + ".meta"):
        tf_model_load(sess, model_path)
    else:
        train(sess, loss, x_train, y_train, args=train_params, rng=rng)
        saver = tf.train.Saver()
        saver.save(sess, model_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using Carlini and Wagner's approach
    ###########################################################################
    nb_adv_per_sample = str(nb_classes - 1) if targeted else '1'
    print('Crafting ' + str(source_samples) + ' * ' + nb_adv_per_sample +
          ' adversarial examples')
    print("This could take some time ...")

    # Instantiate a CW attack object
    cw = CarliniWagnerL2(model, sess=sess)

    if viz_enabled:
        assert source_samples == nb_classes
        idxs = [
            np.where(np.argmax(y_test, axis=1) == i)[0][0]
            for i in range(nb_classes)
        ]
    if targeted:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols,
                          nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[idxs]],
                                  dtype=np.float32)
        else:
            adv_inputs = np.array([[instance] * nb_classes
                                   for instance in x_test[:source_samples]],
                                  dtype=np.float32)

        one_hot = np.zeros((nb_classes, nb_classes))
        one_hot[np.arange(nb_classes), np.arange(nb_classes)] = 1

        adv_inputs = adv_inputs.reshape(
            (source_samples * nb_classes, img_rows, img_cols, nchannels))
        adv_ys = np.array([one_hot] * source_samples,
                          dtype=np.float32).reshape(
                              (source_samples * nb_classes, nb_classes))
        yname = "y_target"
    else:
        if viz_enabled:
            # Initialize our array for grid visualization
            grid_shape = (nb_classes, 2, img_rows, img_cols, nchannels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            adv_inputs = x_test[idxs]
        else:
            adv_inputs = x_test[:source_samples]

        adv_ys = None
        yname = "y"

    if targeted:
        cw_params_batch_size = source_samples * nb_classes
    else:
        cw_params_batch_size = source_samples
    cw_params = {
        'binary_search_steps': 1,
        yname: adv_ys,
        'max_iterations': attack_iterations,
        'learning_rate': CW_LEARNING_RATE,
        'batch_size': cw_params_batch_size,
        'initial_const': 10
    }

    adv = cw.generate_np(adv_inputs, **cw_params)

    eval_params = {'batch_size': np.minimum(nb_classes, source_samples)}
    if targeted:
        adv_accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds,
                                  adv,
                                  adv_ys,
                                  args=eval_params)
    else:
        if viz_enabled:
            err = model_eval(sess,
                             x,
                             y,
                             preds,
                             adv,
                             y_test[idxs],
                             args=eval_params)
            adv_accuracy = 1 - err
        else:
            err = model_eval(sess,
                             x,
                             y,
                             preds,
                             adv,
                             y_test[:source_samples],
                             args=eval_params)
            adv_accuracy = 1 - err

    if viz_enabled:
        for j in range(nb_classes):
            if targeted:
                for i in range(nb_classes):
                    grid_viz_data[i, j] = adv[i * nb_classes + j]
            else:
                grid_viz_data[j, 0] = adv_inputs[j]
                grid_viz_data[j, 1] = adv[j]

        print(grid_viz_data.shape)

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    print('Avg. rate of successful adv. examples {0:.4f}'.format(adv_accuracy))
    report.clean_train_adv_eval = 1. - adv_accuracy

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(
        np.sum((adv - adv_inputs)**2, axis=(1, 2, 3))**.5)
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(percent_perturbed))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        _ = grid_visual(grid_viz_data)

    return report
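
In the targeted branch above, every source image is replicated once per class and paired with a tiled identity matrix of one-hot targets before being passed to generate_np. A small NumPy check of those shapes with toy 3-class data:

import numpy as np

nb_classes, source_samples = 3, 2
images = np.arange(source_samples * 4.).reshape(source_samples, 2, 2, 1)

adv_inputs = np.array([[img] * nb_classes for img in images], dtype=np.float32)
adv_inputs = adv_inputs.reshape(source_samples * nb_classes, 2, 2, 1)

one_hot = np.eye(nb_classes, dtype=np.float32)
adv_ys = np.array([one_hot] * source_samples,
                  dtype=np.float32).reshape(source_samples * nb_classes, nb_classes)

print(adv_inputs.shape, adv_ys.shape)  # (6, 2, 2, 1) (6, 3)
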
Example #12
def mnist_blackbox(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS,
                   holdout=HOLDOUT,
                   data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S,
                   lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
  MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :return: a dictionary with:
           * black-box model accuracy on test set
           * substitute model accuracy on test set
           * black-box model accuracy on adversarial examples transferred
             from the substitute model
  """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    # Get MNIST data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]
    y_sub = np.argmax(y_test[:holdout], axis=1)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    # You could replace this by a remote labeling API for instance
    print("Preparing the black-box model.")
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate, rng,
                              nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    train_sub_out = train_sub(sess, x, y, bbox_preds, x_sub, y_sub, nb_classes,
                              nb_epochs_s, batch_size, learning_rate, data_aug,
                              lmbda, aug_batch_size, rng, img_rows, img_cols,
                              nchannels)
    model_sub, preds_sub = train_sub_out

    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_test, y_test, args=eval_params)
    accuracies['sub'] = acc

    # Initialize the Fast Gradient Sign Method (FGSM) attack object
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    # (eval_params defined above is reused for the evaluation below)
    x_adv_sub = fgsm.generate(x, **fgsm_par)

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess,
                          x,
                          y,
                          model.get_logits(x_adv_sub),
                          x_test,
                          y_test,
                          args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    return accuracies
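
# A minimal usage sketch (added; not part of the original tutorial): assuming
# the module-level defaults (NB_EPOCHS, BATCH_SIZE, etc.) are defined as in the
# surrounding examples, the black-box experiment can be run and its returned
# accuracy dictionary inspected as follows. The argument overrides are purely
# illustrative.
def run_blackbox_demo():
    accuracies = mnist_blackbox(nb_epochs=6, nb_epochs_s=6, data_aug=3)
    print('Black-box accuracy on clean test data: %0.4f' % accuracies['bbox'])
    print('Substitute accuracy on clean test data: %0.4f' % accuracies['sub'])
    print('Black-box accuracy on adversarial examples transferred from the '
          'substitute: %0.4f' % accuracies['bbox_on_sub_adv_ex'])
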
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=NB_EPOCHS,
                   batch_size=BATCH_SIZE,
                   learning_rate=LEARNING_RATE,
                   clean_train=CLEAN_TRAIN,
                   testing=False,
                   backprop_through_attack=BACKPROP_THROUGH_ATTACK,
                   nb_filters=NB_FILTERS,
                   num_threads=None,
                   label_smoothing=0.1):
    """
  MNIST cleverhans tutorial
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param learning_rate: learning rate for training
  :param clean_train: perform normal training on clean examples only
                      before performing adversarial training.
  :param testing: if true, complete an AccuracyReport for unit tests
                  to verify that performance is adequate
  :param backprop_through_attack: If True, backprop through adversarial
                                  example construction process during
                                  adversarial training.
  :param nb_filters: number of convolutional filters to use in the model
  :param num_threads: number of intra-op threads to allow the TF session
  :param label_smoothing: float, amount of label smoothing for cross entropy
  :return: an AccuracyReport object
  """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=num_threads)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Use Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    eval_params = {'batch_size': batch_size}
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    def do_eval(preds, x_set, y_set, report_key, is_adv=None):
        """
    Run the evaluation and print the results.
    """
        acc = model_eval(sess, x, y, preds, x_set, y_set, args=eval_params)
        setattr(report, report_key, acc)
        if is_adv is None:
            report_text = None
        elif is_adv:
            report_text = 'adversarial'
        else:
            report_text = 'legitimate'
        if report_text:
            print('Test accuracy on %s examples: %0.4f' % (report_text, acc))

    if clean_train:
        model = make_basic_picklable_cnn()
        # Tag the model so that when it is saved to disk, future scripts will
        # be able to tell what data it was trained on
        model.dataset_factory = mnist.get_factory()
        preds = model.get_logits(x)
        assert len(model.get_params()) > 0
        loss = CrossEntropy(model, smoothing=label_smoothing)

        def evaluate():
            """
      Run evaluation for the naively trained model on clean examples.
      """
            do_eval(preds, x_test, y_test, 'clean_train_clean_eval', False)

        train(sess,
              loss,
              x_train,
              y_train,
              evaluate=evaluate,
              args=train_params,
              rng=rng,
              var_list=model.get_params())

        with sess.as_default():
            save("clean_model.joblib", model)

            print("Now that the model has been saved, you can evaluate it in a"
                  " separate process using `evaluate_pickled_model.py`. "
                  "You should get exactly the same result for both clean and "
                  "adversarial accuracy as you get within this program.")

        # Calculate training error
        if testing:
            do_eval(preds, x_train, y_train, 'train_clean_train_clean_eval')

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        preds_adv = model.get_logits(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        do_eval(preds_adv, x_test, y_test, 'clean_train_adv_eval', True)

        # Calculate training error
        if testing:
            do_eval(preds_adv, x_train, y_train, 'train_clean_train_adv_eval')

        print('Repeating the process, using adversarial training')

    # Create a new model and train it to be robust to FastGradientMethod
    model2 = make_basic_picklable_cnn()
    # Tag the model so that when it is saved to disk, future scripts will
    # be able to tell what data it was trained on
    model2.dataset_factory = mnist.get_factory()
    fgsm2 = FastGradientMethod(model2, sess=sess)

    def attack(x):
        """Return an adversarial example near clean example `x`"""
        return fgsm2.generate(x, **fgsm_params)

    loss2 = CrossEntropy(model2, smoothing=label_smoothing, attack=attack)
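    # Note (added): passing attack= to CrossEntropy above is what turns this
    # into adversarial training: during training the loss is computed on a mix
    # of clean inputs and adversarial examples generated on the fly by
    # `attack` (the two terms are weighted roughly equally by default in
    # cleverhans).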
    preds2 = model2.get_logits(x)
    adv_x2 = attack(x)

    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x2 = tf.stop_gradient(adv_x2)
    preds2_adv = model2.get_logits(adv_x2)

    def evaluate_adv():
        """
    Evaluate the adversarially trained model.
    """
        # Accuracy of adversarially trained model on legitimate test inputs
        do_eval(preds2, x_test, y_test, 'adv_train_clean_eval', False)
        # Accuracy of the adversarially trained model on adversarial examples
        do_eval(preds2_adv, x_test, y_test, 'adv_train_adv_eval', True)

    # Perform and evaluate adversarial training
    train(sess,
          loss2,
          x_train,
          y_train,
          evaluate=evaluate_adv,
          args=train_params,
          rng=rng,
          var_list=model2.get_params())

    with sess.as_default():
        save("adv_model.joblib", model2)
        print(
            "Now that the model has been saved, you can evaluate it in a "
            "separate process using "
            "`python evaluate_pickled_model.py adv_model.joblib`. "
            "You should get exactly the same result for both clean and "
            "adversarial accuracy as you get within this program. "
            "You can also move beyond the tutorials directory and run the "
            "real `compute_accuracy.py` script (make sure cleverhans/scripts "
            "is in your PATH) to see that this FGSM-trained model is actually "
            "not very robust---it's just a model that trains quickly, so the "
            "tutorial does not take a long time.")

    # Calculate training errors
    if testing:
        do_eval(preds2, x_train, y_train, 'train_adv_train_clean_eval')
        do_eval(preds2_adv, x_train, y_train, 'train_adv_train_adv_eval')

    return report
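
# A minimal usage sketch (added; not part of the original tutorial): the
# returned AccuracyReport stores each accuracy under the report_key names
# passed to do_eval above, so the headline numbers can be read back like this.
def run_adv_training_demo():
    report = mnist_tutorial(nb_epochs=6, clean_train=True)
    print('Clean model, clean test accuracy: %0.4f'
          % report.clean_train_clean_eval)
    print('Clean model, adversarial test accuracy: %0.4f'
          % report.clean_train_adv_eval)
    print('Adv-trained model, clean test accuracy: %0.4f'
          % report.adv_train_clean_eval)
    print('Adv-trained model, adversarial test accuracy: %0.4f'
          % report.adv_train_adv_eval)
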
Exemplo n.º 14
0
def main(argv):

    model_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)

    if model_file is None:
        print('No model found')
        sys.exit()

    cifar = cifar10_input.CIFAR10Data(FLAGS.dataset_dir)

    nb_classes = 10
    X_test = cifar.eval_data.xs
    Y_test = to_categorical(cifar.eval_data.ys, nb_classes)
    assert Y_test.shape[1] == nb_classes

    set_log_level(logging.DEBUG)

    with tf.Session() as sess:

        x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
        y = tf.placeholder(tf.float32, shape=(None, 10))

        from src.FGSM.cleverhans.cleverhans.model_zoo.madry_lab_challenges.cifar10_model import make_wresnet
        model = make_wresnet()

        saver = tf.train.Saver()

        # Restore the checkpoint
        saver.restore(sess, model_file)

        nb_samples = FLAGS.nb_samples

        attack_params = {
            'batch_size': FLAGS.batch_size,
            'clip_min': 0.,
            'clip_max': 255.
        }

        if FLAGS.attack_type == 'cwl2':
            from src.FGSM.cleverhans.cleverhans.attacks import CarliniWagnerL2
            attacker = CarliniWagnerL2(model, sess=sess)
            attack_params.update({
                'binary_search_steps': 1,
                'max_iterations': 100,
                'learning_rate': 0.1,
                'initial_const': 10,
                'batch_size': 10
            })

        else:  # eps and eps_iter in range 0-255
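            # Note (added): with pixel values in [0, 255], eps=8 matches the
            # commonly used 8/255 (~0.031) L-infinity budget for CIFAR-10.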
            attack_params.update({'eps': 8, 'ord': np.inf})
            if FLAGS.attack_type == 'fgsm':
                from src.FGSM.cleverhans.cleverhans.attacks import FastGradientMethod
                attacker = FastGradientMethod(model, sess=sess)

            elif FLAGS.attack_type == 'pgd':
                attack_params.update({'eps_iter': 2, 'nb_iter': 20})
                from src.FGSM.cleverhans.cleverhans.attacks import MadryEtAl
                attacker = MadryEtAl(model, sess=sess)
            else:
                raise ValueError('Unknown attack_type: %s' % FLAGS.attack_type)

        eval_par = {'batch_size': FLAGS.batch_size}

        if FLAGS.sweep:
            max_eps = 16
            epsilons = np.linspace(1, max_eps, max_eps)
            # Time the full sweep so the final report covers every epsilon
            t1 = time.time()
            for e in epsilons:
                attack_params.update({'eps': e})
                x_adv = attacker.generate(x, **attack_params)
                preds_adv = model.get_probs(x_adv)
                acc = model_eval(sess,
                                 x,
                                 y,
                                 preds_adv,
                                 X_test[:nb_samples],
                                 Y_test[:nb_samples],
                                 args=eval_par)
                print('Epsilon %.2f, accuracy on adversarial' % e,
                      'examples %0.4f\n' % acc)
            t2 = time.time()
        else:
            t1 = time.time()
            x_adv = attacker.generate(x, **attack_params)
            preds_adv = model.get_probs(x_adv)
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_test[:nb_samples],
                             Y_test[:nb_samples],
                             args=eval_par)
            t2 = time.time()
            print('Test accuracy on adversarial examples %0.4f\n' % acc)
        print("Took", t2 - t1, "seconds")