Example #2
def get_at_loss(sess, x, y, model, eps, eps_iter, iterations):
    # Set up PGD attack graph using Cleverhans library

    pgd_params = {
        'ord': np.inf,
        'y': y,
        'eps': eps / 255,
        'eps_iter': eps_iter / 255,
        'nb_iter': iterations,
        'rand_init': True,
        'rand_minmax': eps / 255,
        'clip_min': 0.,
        'clip_max': 1.,
        'sanity_checks': True
    }

    pgd = ProjectedGradientDescent(model, sess=sess)
    adv_x = pgd.generate(x, **pgd_params)
    adv_logits = model.get_logits(adv_x)

    # Add summary for adversarial training images
    with tf.device('/gpu:0'):
        with tf.name_scope('Adversarial-Image-Summaries'):
            tf.summary.image('adv-input',
                             adv_x,
                             max_outputs=2,
                             family='Adversarial-Training',
                             collections=['training'])

    adv_loss = tf.nn.softmax_cross_entropy_with_logits(logits=adv_logits,
                                                       labels=y)
    adv_loss = tf.reduce_mean(adv_loss)

    return adv_loss, adv_logits
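
For adversarial training, the returned adv_loss is typically mixed with the loss on clean inputs. A minimal sketch, assuming a clean-logits tensor `logits` already exists and that the equal weighting and the eps values (given on the 0-255 scale, since get_at_loss divides by 255) are illustrative assumptions:

clean_loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
adv_loss, adv_logits = get_at_loss(sess, x, y, model,
                                   eps=8, eps_iter=2, iterations=10)
total_loss = 0.5 * clean_loss + 0.5 * adv_loss  # equal weighting is an assumption
train_op = tf.train.AdamOptimizer(1e-4).minimize(total_loss)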
Example #3
    def __init__(self, dataset, model):
        super(PGDAdaptor, self).__init__(dataset, model)

        self.config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.5))
        self.config.gpu_options.allow_growth = True
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph, config=self.config)

        input_shape = get_input_shape(dataset)

        with self.sess.graph.as_default():
            with self.sess.as_default():
                self.tf_model = convert_pytorch_model_to_tf(self.model)
                self.ch_model = CallableModelWrapper(self.tf_model,
                                                     output_layer='logits')

                self.x_op = tf.placeholder(tf.float32,
                                           shape=(
                                               None,
                                               input_shape[0],
                                               input_shape[1],
                                               input_shape[2],
                                           ))
                self.attk = ProjectedGradientDescent(self.ch_model,
                                                     sess=self.sess)

        self.adv_preds_ops = dict()
Example #4
    def __init__( self, sess, in_tensor_name, out_tensor_name, mean=None, std=None ):
        callable_model = CleverHansWrapperWrapper( sess, in_tensor_name, out_tensor_name, mean, std )
        cleverhans_model = CallableModelWrapper( callable_model, 'logits' )
        self.sess = sess
        self.attack = ProjectedGradientDescent( cleverhans_model, sess=sess )
        self.output_ten = tf.get_default_graph().get_tensor_by_name( out_tensor_name )
        self.input_ten = tf.get_default_graph().get_tensor_by_name( in_tensor_name )
        self.output_shape = [ dim.value for dim in self.output_ten.shape ]
        self.mean = mean
        self.std = std
Example #5
def attack_images(model, tfrecords_dirpath, attack_type='PGD', attack_kwargs=default_attack_kwargs):
    '''
    Attack images (batch = 1 for now)
    '''

    # Get the true label
    true_label = attack_kwargs['y']
    attack_label = attack_kwargs['y_target']
    del attack_kwargs['y']

    # Define tfrecords input iterator
    tfrecord_filepaths = glob(os.path.join(tfrecords_dirpath, '*'))
    tf_dataset = tfutils.make_dataset(
                    tfrecord_filepaths,
                    batch_size=1,
                    filter_label=true_label,
                    preprocessing_fn=preprocess_input
                )
    iterator = tf_dataset.make_one_shot_iterator()
    x, y = iterator.get_next()

    # Run the Session
    attacked_imgs = []
    with tf.Session() as sess:

        # Set attack settings
        # PGD
        if attack_type == "PGD":
            attack = ProjectedGradientDescent(model, sess=sess)
        # FGM
        elif attack_type == "FGM":
            attack = FastGradientMethod(model, sess=sess)
        target_one_hot_encoded = get_one_hot_encoded_targets(attack_label)
        attack_kwargs['y_target'] = target_one_hot_encoded

        # Run the session to generate attacked images
        x_adv = attack.generate(x, **attack_kwargs)
        pbar = tqdm(unit='imgs')
        try:
            while True:
                attacked_img = sess.run(x_adv)
                predicted_class = get_predictions(model, attacked_img)
                print(predicted_class, attack_label)
                if predicted_class == attack_label:
                    attacked_imgs.append(attacked_img)
                pbar.update()
        except tf.errors.OutOfRangeError:
            pass

    if len(attacked_imgs) > 0:
        attacked_imgs = np.vstack(attacked_imgs)
    return attacked_imgs
Example #6
    def _get_pert(self, X, Y, eps: float, model):
        x = tf.placeholder(tf.float32, shape=([None] + list(self.n_features)))
        y = tf.placeholder(tf.float32, shape=(None, self.n_classes))

        wrap = KerasModelWrapper(model)
        pgd = ProjectedGradientDescent(wrap, ord=self.ord, sess=self.sess)
        # for very small eps, cap the per-iteration step at eps itself
        if eps >= 0.05:
            adv_x = pgd.generate(x, y=y, eps=eps)
        else:
            adv_x = pgd.generate(x, y=y, eps=eps, eps_iter=eps)
        adv_x = tf.stop_gradient(adv_x)
        ret = adv_x - x
        return ret.eval(feed_dict={x: X, y: Y}, session=self.sess)
Example #7
def pgd_attack():
    # Use tf for evaluation on adversarial data
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    x_op = tf.placeholder(tf.float32, shape=(
        None,
        3,
        32,
        32,
    ))
    y_op = tf.placeholder(tf.float32, shape=(None, 10))

    # Convert pytorch model to a tf_model and wrap it in cleverhans
    tf_model_fn = convert_pytorch_model_to_tf(model)
    cleverhans_model = CallableModelWrapper(tf_model_fn, output_layer='logits')

    # Create an PGD attack
    pgd = ProjectedGradientDescent(cleverhans_model, sess=sess)
    pgd_params = {
        'eps': args.eps,
        'eps_iter': args.ss,
        'nb_iter': args.ns,
        'clip_min': 0.,
        'clip_max': 1.,
        'y': y_op
    }

    adv_x_op = pgd.generate(x_op, **pgd_params)
    adv_preds_op = tf_model_fn(adv_x_op)

    # Evaluation against PGD attacks
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(test_loader):
        adv_preds = sess.run(adv_preds_op,
                             feed_dict={
                                 x_op: inputs,
                                 y_op:
                                 torch.nn.functional.one_hot(targets, 10)
                             })
        correct += (np.argmax(adv_preds, axis=1) == targets.numpy()).sum()
        total += len(inputs)

        sys.stdout.write("\rWhite-box PGD attack... Acc: %.3f%% (%d/%d)" %
                         (100. * correct / total, correct, total))
        sys.stdout.flush()

    print('Accuracy under PGD attack: %.3f%%' % (100. * correct / total))
Example #8
class PGDAttack(AdversarialAttack):
    def __init__(self,
                 model,
                 targeted=False,
                 step_size_iter=0.05,
                 max_perturbation=0.3,
                 n_iterations=10,
                 norm_order=np.inf,
                 rand_init=None,
                 rand_minmax=0.3,
                 clip_min=None,
                 clip_max=None,
                 sanity_checks=True):
        super().__init__(model=model, clip_min=clip_min, clip_max=clip_max)
        self._targeted = targeted
        self._step_size_iter = step_size_iter
        self._max_perturbation = max_perturbation
        self._n_iterations = n_iterations
        self._norm_order = norm_order
        self._rand_init = rand_init
        self._rand_minmax = rand_minmax
        self._sanity_checks = sanity_checks

        with self.graph.as_default():
            self._method = ProjectedGradientDescent(
                self._model,
                sess=self.session,
                eps=self._max_perturbation,
                eps_iter=self._step_size_iter,
                nb_iter=self._n_iterations,
                ord=self._norm_order,
                rand_init=self._rand_init,
                clip_min=self._clip_min,
                clip_max=self._clip_max,
                sanity_checks=self._sanity_checks)

    def attack_method(self, labels):
        if labels is not None:
            if self._targeted:
                return self._method.generate(x=self._x_clean,
                                             y_target=labels,
                                             rand_minmax=self._rand_minmax)
            else:
                return self._method.generate(x=self._x_clean,
                                             y=labels,
                                             rand_minmax=self._rand_minmax)
        return self._method.generate(x=self._x_clean,
                                     rand_minmax=self._rand_minmax)
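
A hypothetical usage sketch. It assumes the AdversarialAttack base class (not shown here) provides the graph/session plumbing and the _x_clean placeholder, and that labels arrive one-hot encoded:

attack = PGDAttack(model,
                   targeted=False,
                   step_size_iter=0.01,
                   max_perturbation=8 / 255,
                   n_iterations=40,
                   clip_min=0.,
                   clip_max=1.)
adv_x_op = attack.attack_method(labels=y_one_hot)  # y_one_hot: hypothetical label tensor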
Example #9
    def _get_pert(self, X, Y, eps):
        if eps == 0:
            return np.zeros_like(X)
        with self.sess.as_default():
            self.x = self.wrap.input

            # pass the model wrapper to the attack, not the input tensor
            pgd = ProjectedGradientDescent(self.wrap, sess=self.sess)
            adv_x = pgd.generate(self.x,
                                 y=self.y,
                                 eps=eps,
                                 ord=self.ord,
                                 eps_iter=0.01)
            adv_x = tf.stop_gradient(adv_x)
            pert_x = adv_x - self.x

            feed_dict = {self.x: X, self.y: Y}
            ret = pert_x.eval(feed_dict=feed_dict)
        return ret
Example #10
def evaluate_checkpoint(filename):
    if attack_method == 'BIM':
        bim = BasicIterativeMethod(model)
        bim_params = {
            'eps': 0.3,
            'clip_min': 0.,
            'clip_max': 1.,
            'nb_iter': 50,
            'eps_iter': .01
        }
        adv_x = bim.generate(x_image, **bim_params)
    elif attack_method == 'FGM':
        FGM_attack = FastGradientMethod(model)
        FGM_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
        adv_x = FGM_attack.generate(x_image, **FGM_params)
    elif attack_method == 'PGD':
        pgd = ProjectedGradientDescent(model)
        pgd_params = {
            'eps': 0.09,
            'clip_min': 0.,
            'clip_max': 1.,
            'nb_iter': 40,
            'eps_iter': .01
        }
        adv_x = pgd.generate(x_image, **pgd_params)
    preds_adv = model.get_probs(adv_x)

    with tf.Session() as sess:
        # Restore the checkpoint
        saver = tf.train.Saver(var_list=model.all_variables)
        saver.restore(sess, filename)

        eval_par = {'batch_size': batch_size}
        t1 = time.time()
        acc = model_eval(sess,
                         x_image,
                         y,
                         preds_adv,
                         X_test,
                         Y_test,
                         args=eval_par)
        t2 = time.time()
        print("Took", t2 - t1, "seconds")
        print('Test accuracy on adversarial examples: %0.4f\n' % acc)
Example #11
def single_run_max_confidence_recipe(sess, model, x, y, nb_classes, eps,
                                     clip_min, clip_max, eps_iter, nb_iter,
                                     report_path,
                                     batch_size=BATCH_SIZE):
  """A reasonable attack bundling recipe for a max norm threat model and
  a defender that uses confidence thresholding. This recipe uses both
  uniform noise and randomly-initialized PGD targeted attacks.

  References:
  https://openreview.net/forum?id=H1g0piA9tQ

  This version runs each attack (noise, targeted PGD for each class with
  nb_iter iterations, targeted PGD for each class with 25X more iterations)
  just once and then stops. See `basic_max_confidence_recipe` for a version
  that runs indefinitely.

  :param sess: tf.Session
  :param model: cleverhans.model.Model
  :param x: numpy array containing clean example inputs to attack
  :param y: numpy array containing true labels
  :param nb_classes: int, number of classes
  :param eps: float, maximum size of perturbation (measured by max norm)
  :param eps_iter: float, step size for one version of PGD attacks
    (will also run another version with 25X smaller step size)
  :param nb_iter: int, number of iterations for the cheaper PGD attacks
    (will also run another version with 25X more iterations)
  :param report_path: str, the path that the report will be saved to.
  :param batch_size: int, the total number of examples to run simultaneously
  """
  noise_attack = Noise(model, sess)
  pgd_attack = ProjectedGradientDescent(model, sess)
  threat_params = {"eps": eps, "clip_min" : clip_min, "clip_max" : clip_max}
  noise_attack_config = AttackConfig(noise_attack, threat_params, "noise")
  attack_configs = [noise_attack_config]
  pgd_attack_configs = []
  pgd_params = copy.copy(threat_params)
  pgd_params["eps_iter"] = eps_iter
  pgd_params["nb_iter"] = nb_iter
  assert batch_size % num_devices == 0
  dev_batch_size = batch_size // num_devices
  ones = tf.ones(dev_batch_size, tf.int32)
  expensive_pgd = []
  for cls in range(nb_classes):
    cls_params = copy.copy(pgd_params)
    cls_params['y_target'] = tf.to_float(tf.one_hot(ones * cls, nb_classes))
    cls_attack_config = AttackConfig(pgd_attack, cls_params, "pgd_" + str(cls))
    pgd_attack_configs.append(cls_attack_config)
    expensive_params = copy.copy(cls_params)
    expensive_params["eps_iter"] /= 25.
    expensive_params["nb_iter"] *= 25.
    expensive_config = AttackConfig(pgd_attack, expensive_params, "expensive_pgd_" + str(cls))
    expensive_pgd.append(expensive_config)
  attack_configs = [noise_attack_config] + pgd_attack_configs + expensive_pgd
  new_work_goal = {config: 1 for config in attack_configs}
  goals = [MaxConfidence(t=1., new_work_goal=new_work_goal)]
  bundle_attacks(sess, model, x, y, attack_configs, goals, report_path)
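
An invocation sketch with MNIST-like threat-model values; all hyperparameters here are illustrative assumptions, not taken from the recipe itself:

single_run_max_confidence_recipe(sess, model,
                                 x=x_test, y=y_test,
                                 nb_classes=10,
                                 eps=0.3, clip_min=0., clip_max=1.,
                                 eps_iter=0.01, nb_iter=40,
                                 report_path='max_confidence_report.joblib')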
Example #12
def build_attack(model, sess, eps=0.3, clip_min=0.0, clip_max=1.0):
    # Wrap model with cleverhans and init the attack method
    wrapped_model = KerasModelWrapper(model)
    pgd = ProjectedGradientDescent(wrapped_model, sess=sess)

    # Build acc and loss
    pgd_params = {"eps": eps, "clip_min": clip_min, "clip_max": clip_max}
    adv_acc_metric = get_adversarial_acc_metric(model, pgd, pgd_params)
    adv_loss = get_adversarial_loss(model, pgd, pgd_params)
    return pgd, adv_acc_metric, adv_loss
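
A sketch of wiring the returned loss and metric into Keras training, assuming get_adversarial_acc_metric and get_adversarial_loss follow the cleverhans Keras-tutorial contract (a loss function and a metric with the usual (y_true, y_pred) signature):

pgd, adv_acc_metric, adv_loss = build_attack(model, sess, eps=0.3)
model.compile(optimizer='adam',
              loss=adv_loss,
              metrics=['accuracy', adv_acc_metric])
model.fit(x_train, y_train, batch_size=128, epochs=5,
          validation_data=(x_test, y_test))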
Example #13
def init_attack(model, attack_params_dict, sess):
    """
    Initialize the adversarial attack using the cleverhans toolbox

    Parameters
    ----------
    model : Keras Model
        The model to attack

    attack_params_dict : dict
        Self-defined dictionary specifying the attack and its parameters

    sess : Session
        The current tf session

    Returns
    -------
    attack : cleverhans Attack
        The Attack object

    attack_params
        Dictionary with the value of the attack parameters, valid to generate
        adversarial examples with cleverhans.
    """

    # Wrapper for the Keras model
    model_wrap = KerasModelWrapper(model)

    # Initialize attack
    batch_size = None
    if attack_params_dict['attack'] == 'fgsm':
        attack = FastGradientMethod(model_wrap, sess=sess)
        attack_params = {'eps': attack_params_dict['eps'], 'clip_min': 0., 
                         'clip_max': 1.}
    elif attack_params_dict['attack'] == 'spsa':
        attack = SPSA(model_wrap, sess=sess)
        attack_params = {'epsilon': attack_params_dict['eps'], 
                         'num_steps': attack_params_dict['n_steps']}
        batch_size = 1
    elif attack_params_dict['attack'] == 'deepfool':
        attack = DeepFool(model_wrap, sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    elif attack_params_dict['attack'] == 'pgd':
        attack = ProjectedGradientDescent(model_wrap, sess=sess)
        attack_params = {'eps': attack_params_dict['eps'], 
                         'eps_iter': attack_params_dict['eps_iter'],
                         'nb_iter': attack_params_dict['n_steps'],
                         'clip_min': 0., 'clip_max': 1.}
    elif attack_params_dict['attack'] == 'carlini':
        attack = CarliniWagnerL2(model_wrap, sess=sess)
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    else:
        raise NotImplementedError()

    return attack, attack_params, batch_size
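
A usage sketch; the dictionary keys mirror the branches above, and the values and the input shape are illustrative assumptions:

attack_params_dict = {'attack': 'pgd', 'eps': 0.3,
                      'eps_iter': 0.01, 'n_steps': 40}
attack, attack_params, batch_size = init_attack(model, attack_params_dict,
                                                K.get_session())
x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))  # assumed input shape
x_adv = attack.generate(x, **attack_params)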
Example #14
def eval_cleverhans():

    # Set test phase
    learning_phase = K.learning_phase()
    K.set_learning_phase(0)

    # Pre-process images
    images_tf = images.astype(K.floatx())
    images_tf /= 255.

    # Wrapper for the Keras model
    model_wrap = KerasModelWrapper(model)

    # Initialize attack
    if attack_params_dict['attack'] == 'fgsm':
        attack = FastGradientMethod(model_wrap, sess=K.get_session())
        attack_params = {'eps': attack_params_dict['eps'], 'clip_min': 0., 
                         'clip_max': 1.}
    elif attack_params_dict['attack'] == 'deepfool':
        attack = DeepFool(model_wrap, sess=K.get_session())
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    elif attack_params_dict['attack'] == 'madry':
        attack = ProjectedGradientDescent(model_wrap, sess=K.get_session())
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    elif attack_params_dict['attack'] == 'carlini':
        attack = CarliniWagnerL2(model_wrap, sess=K.get_session())
        attack_params = {'clip_min': 0., 'clip_max': 1.}
    else:
        raise NotImplementedError()

    # Define input TF placeholder
    x = tf.placeholder(K.floatx(), shape=(None,) + images.shape[1:])
    y = tf.placeholder(K.floatx(), shape=(None,) + (labels.shape[-1],))

    # Define adversarial predictions symbolically
    x_adv = attack.generate(x, **attack_params)
    x_adv = tf.stop_gradient(x_adv)
    predictions_adv = model(x_adv)

    # Evaluate the accuracy of the model on adversarial examples
    eval_par = {'batch_size': batch_size}
    # feed_dict = {K.learning_phase(): attack_params_dict['learning_phase']}
    # acc_adv = model_eval(K.get_session(), x, y, predictions_adv, images, 
    #                      labels, feed=feed_dict, args=eval_par)
    acc_adv = model_eval(K.get_session(), x, y, predictions_adv, images_tf, 
                         labels, args=eval_par)

    print('Adversarial accuracy against %s: %.4f\n' %
          (attack_params_dict['attack'], acc_adv))

    # Set original phase
    K.set_learning_phase(learning_phase)

    return acc_adv
Example #15
def get_alp_loss(sess, x, y, logits, adv_logits, model, eps, eps_iter,
                 iterations):
    if adv_logits is None:
        pgd_params = {
            'ord': np.inf,
            'y': y,
            'eps': eps / 255,
            'eps_iter': eps_iter / 255,
            'nb_iter': iterations,
            'rand_init': True,
            'rand_minmax': eps / 255,
            'clip_min': 0.,
            'clip_max': 1.,
            'sanity_checks': True
        }

        pgd = ProjectedGradientDescent(model, sess=sess)
        adv_x = pgd.generate(x, **pgd_params)
        adv_logits = model.get_logits(adv_x)

    adv_pairing_loss = tf.losses.mean_squared_error(logits, adv_logits)

    return adv_pairing_loss
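
Adversarial logit pairing is usually added on top of the clean and adversarial cross-entropy terms. A sketch, assuming clean logits and a cross-entropy term ce_loss already exist, with the pairing weight (0.5 here) as an assumed hyperparameter:

at_loss, adv_logits = get_at_loss(sess, x, y, model, eps, eps_iter, iterations)
alp_loss = get_alp_loss(sess, x, y, logits, adv_logits, model,
                        eps, eps_iter, iterations)
total_loss = ce_loss + at_loss + 0.5 * alp_loss  # 0.5 is an assumption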
Example #16
class PGDUtil(object):
    def __init__(self, model, sess, log_file="log.txt"):
        self.pgd = ProjectedGradientDescent(model=model, sess=sess)
        self.log_file = log_file

    def create_adversaries(self, x_train, y_train, i, nb_of_adv=None):
        if nb_of_adv is None:
            nb_of_adv = len(x_train)

        adv_train_x, clean_train_x, adv_train_y, clean_train_y = divide_into_clean_and_adversarial_set(
            x_train, y_train, nb_of_adv, i)
        adv_x = self.pgd.generate_np(adv_train_x)
        return numpy.vstack((adv_x, clean_train_x)), numpy.vstack(
            (adv_train_y, clean_train_y))
Example #17
def test_callable_no_softmax():
    batch_size = 2
    nb_classes = 3

    def model(x):
        return tf.ones((batch_size, nb_classes)) / nb_classes

    sess = tf.Session()
    attack = ProjectedGradientDescent(model, sess=sess)
    x = tf.ones((batch_size, 3))
    # Currently ProjectedGradientDescent treats the output of a callable
    # as probs rather than logits.
    # Since our callable does not use a softmax, it is impossible to get
    # the logits back. The test confirms that this causes an error.
    assert_raises(TypeError, attack.generate, x)
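
The conventional fix for the error this test checks is to declare the callable's output explicitly with CallableModelWrapper, as several examples above do. A minimal sketch:

from cleverhans.model import CallableModelWrapper

wrapped = CallableModelWrapper(model, output_layer='logits')
attack = ProjectedGradientDescent(wrapped, sess=sess)
adv_x = attack.generate(x)  # no longer raises: the callable's output is treated as logits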
Example #18
def test_no_logits():
  """test_no_logits: Check that a model without logits causes an error"""
  batch_size = 2
  nb_classes = 3
  class NoLogitsModel(Model):
    """
    A model that neither defines logits nor makes it possible to find logits
    by inspecting the inputs to a softmax op.
    """
    def fprop(self, x, **kwargs):
      return {'probs': tf.ones((batch_size, nb_classes)) / nb_classes}
  model = NoLogitsModel()
  sess = tf.Session()
  attack = ProjectedGradientDescent(model, sess=sess)
  x = tf.ones((batch_size, 3))
  assert_raises(NotImplementedError, attack.generate, x)
Example #19
class PGDGenerator:
    def __init__( self, sess, in_tensor_name, out_tensor_name, mean=None, std=None ):
        callable_model = CleverHansWrapperWrapper( sess, in_tensor_name, out_tensor_name, mean, std )
        cleverhans_model = CallableModelWrapper( callable_model, 'logits' )
        self.sess = sess
        self.attack = ProjectedGradientDescent( cleverhans_model, sess=sess )
        self.output_ten = tf.get_default_graph().get_tensor_by_name( out_tensor_name )
        self.input_ten = tf.get_default_graph().get_tensor_by_name( in_tensor_name )
        self.output_shape = [ dim.value for dim in self.output_ten.shape ]
        self.mean = mean
        self.std = std

    def generate( self, img, **kwargs ):
        if 'y_target' in kwargs:
            y_target = kwargs[ 'y_target' ]
            if not isinstance( y_target, (np.ndarray, list) ):
                y_target = [ y_target ]
            # build a one-example one-hot target; the first entry of
            # output_shape may be None (unknown batch dim), so size it to 1
            target_arr = np.zeros( ( 1, self.output_shape[ -1 ] ) )
            target = y_target[ np.random.randint(0, len(y_target)) ]
            target_arr[ 0, target ] = 1
            kwargs[ 'y_target' ] = target_arr
        if 'eps_iter_size' in kwargs and 'eps_iter' not in kwargs and 'eps' in kwargs:
            kwargs['eps_iter'] = kwargs['eps'] * kwargs['eps_iter_size']
            del kwargs['eps_iter_size']
        check = True
        if 'check' in kwargs:
            check = kwargs['check']
            del kwargs['check'] 
        adv_example = self.attack.generate_np( img, **kwargs )
        if self.std is not None:
            adv_example = adv_example.reshape( -1, 1 )
            adv_example = ( adv_example - self.mean ) / self.std
            adv_example = adv_example.reshape( -1 )
        #assert ( np.all( adv_example <= kwargs['clip_max'] + 0.000001) and np.all( adv_example >= kwargs['clip_min'] - 0.000001 ) )
        real_cl = np.argmax( self.sess.run( self.output_ten, feed_dict={ self.input_ten: adv_example } ) )
        if not check:
            return adv_example

        if 'y_target' in kwargs and real_cl in y_target:
            return adv_example
        if 'y_target' not in kwargs:
            correct_cl = np.argmax( self.sess.run( self.output_ten, feed_dict={ self.input_ten: img } ) )
            if correct_cl != real_cl:
                return adv_example

        return None
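
A hypothetical call, assuming img is a single example matching in_tensor_name and recalling that generate returns None when the built-in check fails:

adv = generator.generate(img,
                         y_target=[3, 7],    # fool into any of these classes
                         eps=8 / 255,
                         eps_iter_size=0.1,  # expands to eps_iter = eps * 0.1
                         clip_min=0.,
                         clip_max=1.)
if adv is None:
    print('attack did not reach a target class')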
Example #20
def train_child(t, p, m, load_dict=False):
    # model = nn.DataParallel(TestCNN().cuda(1), device_ids=[1, 2, 3])
    raw_model = TestCNN().cuda(0)
    model = TestCNN().cuda(0)
    tf_model = convert_pytorch_model_to_tf(model)
    cleverhans_model = CallableModelWrapper(tf_model, output_layer='logits')
    session = tf.Session()
    x_op = tf.placeholder(tf.float32, shape=(None, 3, 32, 32))
    fgsm = FastGradientMethod(cleverhans_model, sess=session)
    # stm = SpatialTransformationMethod(cleverhans_model, sess=session)
    # cw2 = CarliniWagnerL2(cleverhans_model, sess=session)
    pgd = ProjectedGradientDescent(cleverhans_model, sess=session)
    noise = Noise(cleverhans_model, sess=session)
    mim = MomentumIterativeMethod(cleverhans_model, sess=session)
    df = DeepFool(cleverhans_model, sess=session)
    tf_raw_model = convert_pytorch_model_to_tf(raw_model)
    cleverhans_raw_model = CallableModelWrapper(tf_raw_model,
                                                output_layer='logits')
    # pgd_raw = ProjectedGradientDescent(cleverhans_raw_model, sess=session)
    noise_raw = Noise(cleverhans_raw_model, sess=session)

    def fgsm_op(x, eps):
        att = fgsm.generate(x_op, eps=eps)
        return session.run(att, feed_dict={x_op: x})

    # def stm_op(x, eps):
    #     att = stm.generate(x_op, batch_size=len(x), dx_min=-0.1*eps, dx_max=0.1*eps, dy_min=-0.1*eps, dy_max=0.1*eps, angle_min=-30*eps, angle_max=30*eps)
    #     return session.run(att, feed_dict={x_op: x})
    # def cw2_op(x, eps):
    #     att = cw2.generate(x_op, max_iterations=3)
    def pgd_op(x, eps):
        att = pgd.generate(x_op, eps=eps, eps_iter=eps * 0.2, nb_iter=3)
        return session.run(att, feed_dict={x_op: x})

    # def pgd_raw_op(x, eps):
    #     att = pgd_raw.generate(x_op, eps=eps, eps_iter=eps * 0.2, nb_iter=3)
    #     return session.run(att, feed_dict={x_op: x})
    def noise_op(x, eps):
        att = noise.generate(x_op, eps=eps)
        return session.run(att, feed_dict={x_op: x})

    def noise_raw_op(x, eps):
        att = noise_raw.generate(x_op, eps=eps)
        return session.run(att, feed_dict={x_op: x})

    def df_op(x):
        att = df.generate(x_op, nb_candidate=10, max_iter=3)
        return session.run(att, feed_dict={x_op: x})

    def mim_op(x, eps):
        att = mim.generate(x_op, eps=eps, eps_iter=eps * 0.2)
        return session.run(att, feed_dict={x_op: x})

    def attack_train(x):
        attacks = [fgsm_op, pgd_op, mim_op]
        attacks_name = ['FGSM', 'PGD', 'MIM']
        eps = [[0.03, 0.3], [0.03, 0.3], [0.03, 0.3]]
        train_x_adv = x.copy()
        adv_type = np.random.randint(SUBPOLICY_COUNT, size=len(train_x_adv))
        for i, (ti, pi, mi) in enumerate(
                tqdm(zip(t, p, m),
                     total=len(t),
                     desc='Subpolicy: ',
                     leave=False)):
            # indices of the examples assigned to subpolicy i
            idx_i = np.where(adv_type == i)[0]
            for j, (tj, pj, mj) in enumerate(
                    tqdm(zip(ti, pi, mi),
                         total=len(ti),
                         desc='Operation: ',
                         leave=False)):
                tj, pj, mj = (*tj, *pj, *mj)
                # apply operation j with probability pj: uniform draw in
                # [0, 1) so that pj acts as a probability
                idx_j = idx_i[np.random.rand(len(idx_i)) < pj]
                for k in tqdm(range(0, len(idx_j), BATCH_SIZE),
                              desc=attacks_name[tj] + ': ',
                              leave=False):
                    batch_idx = idx_j[k:k + BATCH_SIZE]
                    # write the attacked batch back into train_x_adv; fancy
                    # indexing returns copies, so in-place edits on a slice
                    # of a slice would otherwise be lost
                    train_x_adv[batch_idx] = attacks[tj](
                        train_x_adv[batch_idx], (mj + 1) / MAGN_COUNT *
                        (eps[tj][1] - eps[tj][0]) + eps[tj][0])
        return train_x_adv

    optimizer = optim.SGD(model.parameters(), lr=1e-3)
    raw_optimizer = optim.SGD(raw_model.parameters(), lr=1e-3)
    train_x_adv = attack_train(train_x)
    adv_trainset = torch.utils.data.TensorDataset(
        torch.tensor(train_x_adv, dtype=torch.float),
        torch.tensor(train_y, dtype=torch.long))
    adv_trainloader = torch.utils.data.DataLoader(adv_trainset,
                                                  batch_size=BATCH_SIZE,
                                                  shuffle=True,
                                                  num_workers=4)
    if load_dict:
        model.load_state_dict(torch.load('black_eval_runs/model.pt'))
        optimizer.load_state_dict(torch.load('black_eval_runs/optimizer.pt'))
        raw_model.load_state_dict(torch.load('black_eval_runs/raw_model.pt'))
        raw_optimizer.load_state_dict(
            torch.load('black_eval_runs/raw_optimizer.pt'))
    model.train()
    batch_tqdm = tqdm(adv_trainloader, leave=False)
    for x, y in batch_tqdm:
        optimizer.zero_grad()
        output = model(x.cuda(0))
        loss = criterion(output, y.cuda(0))
        loss.backward()
        optimizer.step()
        acc = torch.sum(output.cpu().argmax(axis=1) == y) / y.size(0)
        batch_tqdm.set_description(f'adv {loss:.3f} {acc:.3f}')
    batch_tqdm = tqdm(trainloader, leave=False)
    raw_model.train()
    for x, y in batch_tqdm:
        raw_optimizer.zero_grad()
        output = raw_model(x.cuda(0))
        loss = criterion(output, y.cuda(0))
        loss.backward()
        raw_optimizer.step()
        acc = torch.sum(output.cpu().argmax(axis=1) == y) / y.size(0)
        batch_tqdm.set_description(f'raw {loss:.3f} {acc:.3f}')
    with torch.no_grad():
        model.eval()
        batch_tqdm = tqdm(valloader, leave=False)
        tot_acc = 0
        for x, y in batch_tqdm:
            output = model(x.cuda(0))
            acc = float(torch.sum(output.cpu().argmax(axis=1) == y))
            tot_acc += acc
        adv_raw_acc = tot_acc / len(val_x)
        val_x_adv = np.zeros_like(val_x)
        for i in tqdm(range(0, len(val_x_adv), BATCH_SIZE),
                      desc='Noise: ',
                      leave=False):
            val_x_adv[i:][:BATCH_SIZE] = noise_op(val_x[i:][:BATCH_SIZE], 0.3)
        adv_valset = torch.utils.data.TensorDataset(
            torch.tensor(val_x_adv, dtype=torch.float),
            torch.tensor(val_y, dtype=torch.long))
        adv_valloader = torch.utils.data.DataLoader(adv_valset,
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=False,
                                                    num_workers=4)
        batch_tqdm = tqdm(adv_valloader, leave=False)
        tot_acc = 0
        for x, y in batch_tqdm:
            output = model(x.cuda(0))
            acc = float(torch.sum(output.cpu().argmax(axis=1) == y))
            tot_acc += acc
        adv_adv_acc = tot_acc / len(val_x)
        raw_model.eval()
        batch_tqdm = tqdm(valloader, leave=False)
        tot_acc = 0
        for x, y in batch_tqdm:
            output = raw_model(x.cuda(0))
            acc = float(torch.sum(output.cpu().argmax(axis=1) == y))
            tot_acc += acc
        raw_raw_acc = tot_acc / len(val_x)
        val_x_adv = np.zeros_like(val_x)
        for i in tqdm(range(0, len(val_x_adv), BATCH_SIZE),
                      desc='Noise: ',
                      leave=False):
            val_x_adv[i:][:BATCH_SIZE] = noise_raw_op(val_x[i:][:BATCH_SIZE],
                                                      0.3)
        adv_valset = torch.utils.data.TensorDataset(
            torch.tensor(val_x_adv, dtype=torch.float),
            torch.tensor(val_y, dtype=torch.long))
        adv_valloader = torch.utils.data.DataLoader(adv_valset,
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=False,
                                                    num_workers=4)
        batch_tqdm = tqdm(adv_valloader, leave=False)
        tot_acc = 0
        for x, y in batch_tqdm:
            output = raw_model(x.cuda(0))
            acc = float(torch.sum(output.cpu().argmax(axis=1) == y))
            tot_acc += acc
        raw_adv_acc = tot_acc / len(val_x)
    with open('black_eval_runs/acc.csv', 'a') as f:
        f.write(f'{adv_raw_acc},{adv_adv_acc},{raw_raw_acc},{raw_adv_acc}\n')
    print(
        f'adv {adv_raw_acc:.3f} -> {adv_adv_acc:.3f} | raw {raw_raw_acc:.3f} -> {raw_adv_acc:.3f}'
    )
    torch.save(model.state_dict(), 'black_eval_runs/model.pt')
    torch.save(optimizer.state_dict(), 'black_eval_runs/optimizer.pt')
    torch.save(raw_model.state_dict(), 'black_eval_runs/raw_model.pt')
    torch.save(raw_optimizer.state_dict(), 'black_eval_runs/raw_optimizer.pt')
Example #21
(X_train, y_train), (X_test, y_test) = cifar10.load_data()
if __name__ == '__main__':
    model_keras = keras.models.load_model('../models_test/model_cifar_2.h5')
    # model_keras = keras.models.load_model('../models_test/keras_cifar10_trained_model.h5')
    batch_size = 512
    success = 0

    data_size = X_train.shape[0]
    adv_train = []
    time_st = time.time()
    for st in range(0, data_size, batch_size):
        sample = np.array(X_train[st : st + batch_size].reshape(-1, 32 * 32 * 3) / 255, dtype=np.float32)
        # sample = np.array([sample])
        sess = keras.backend.get_session()
        model = KerasModelWrapper(model_keras)
        attack = ProjectedGradientDescent(model, sess=sess)
        # print(model.predict(panda.reshape(1, *panda.shape)))

        param = dict(
                eps=10 / 255,
                eps_iter=10 / 255 / 40,
                nb_iter=40,
                rand_init=True,
                )
        advs = attack.generate_np(sample, **param)
        # plt.imsave("sample.png", advs[0])
        adv_train.append(advs)
        pred = model_keras.predict(advs).argmax(axis=1).reshape((sample.shape[0], ))
        y_sample = model_keras.predict(sample).argmax(axis=1).reshape((sample.shape[0], ))
        success += (pred != y_sample).sum()
        print((pred != y_sample).sum())
Example #22
def impl(sess,
         model,
         dataset,
         factory,
         x_data,
         y_data,
         base_eps_iter=BASE_EPS_ITER,
         nb_iter=NB_ITER,
         batch_size=BATCH_SIZE):
    """
  The actual implementation of the evaluation.
  :param sess: tf.Session
  :param model: cleverhans.model.Model
  :param dataset: cleverhans.dataset.Dataset
  :param factory: the dataset factory corresponding to `dataset`
  :param x_data: numpy array of input examples
  :param y_data: numpy array of class labels
  :param base_eps_iter: step size for PGD if data were in [0, 1]
  :param nb_iter: number of PGD iterations
  :returns: dict mapping string adversarial example names to accuracies
  """

    center = dataset.kwargs['center']
    max_val = dataset.kwargs['max_val']
    value_range = max_val * (1. + center)
    min_value = 0. - center * max_val

    if 'CIFAR' in str(factory.cls):
        base_eps = 8. / 255.
        if base_eps_iter is None:
            base_eps_iter = 2. / 255.
    elif 'MNIST' in str(factory.cls):
        base_eps = .3
        if base_eps_iter is None:
            base_eps_iter = .1
    else:
        raise NotImplementedError(str(factory.cls))

    pgd_params = {
        'eps': base_eps * value_range,
        'eps_iter': base_eps_iter * value_range,
        'nb_iter': nb_iter,
        'clip_min': min_value,
        'clip_max': max_val
    }

    semantic = Semantic(model, center, max_val, sess)
    pgd = ProjectedGradientDescent(model, sess=sess)

    jobs = [('clean', None, None, None), ('Semantic', semantic, None, None),
            ('pgd', pgd, pgd_params, None)]

    out = {}

    for job in jobs:
        name, attack, attack_params, job_batch_size = job
        if job_batch_size is None:
            job_batch_size = batch_size
        t1 = time.time()
        acc = accuracy(sess,
                       model,
                       x_data,
                       y_data,
                       batch_size=job_batch_size,
                       devices=devices,
                       attack=attack,
                       attack_params=attack_params)
        t2 = time.time()
        out[name] = acc
        print("Accuracy on " + name + " examples: ", acc)
        print("Evaluation took", t2 - t1, "seconds")

    return out
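
An invocation sketch, assuming the cleverhans dataset/factory pattern; dataset.get_factory() is an assumption about how the factory was obtained:

factory = dataset.get_factory()
report = impl(sess, model, dataset, factory, x_test, y_test)
print(report)  # e.g. {'clean': ..., 'Semantic': ..., 'pgd': ...}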
Example #23
def main(argv):
    del argv

    if FLAGS.debug:
        logging.info('Running in debug mode!!!')

    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)
    TFRECORDS_DIR = FLAGS.tfrecords_dir
    HDF5_DATA_PATH = FLAGS.hdf5_data_path

    tfrecord_filepaths = glob(os.path.join(TFRECORDS_DIR, '*'))
    tf_dataset = tfutils.make_dataset(tfrecord_filepaths,
                                      batch_size=1,
                                      filter_label=FLAGS.label,
                                      preprocessing_fn=preprocess_input)

    hdf5_dataset = None
    if not FLAGS.debug:
        hdf5_file = h5py.File(HDF5_DATA_PATH, 'a')
        hdf5_group = get_attack_group_name(O_ATTACK_NAME, FLAGS.label)
        hdf5_dataset = hdf5utils.create_image_dataset(hdf5_file,
                                                      group=hdf5_group,
                                                      attrs={
                                                          'seed': FLAGS.seed,
                                                          'eps': FLAGS.eps,
                                                          'ord': FLAGS.ord,
                                                          'eps_iter':
                                                          FLAGS.eps_iter,
                                                          'nb_iter':
                                                          FLAGS.nb_iter,
                                                          'target':
                                                          FLAGS.target
                                                      })

    model = InceptionV1Model()
    iterator = tf_dataset.make_one_shot_iterator()
    x, y = iterator.get_next()

    with tf.Session() as sess:
        attack = ProjectedGradientDescent(model, sess=sess)
        target_one_hot_encoded = get_one_hot_encoded_targets(FLAGS.target)

        x_adv = attack.generate(
            x,
            eps=FLAGS.eps,
            nb_iter=FLAGS.nb_iter,
            eps_iter=FLAGS.eps_iter,
            ord=(int(FLAGS.ord) if FLAGS.ord != 'inf' else np.inf),
            y_target=target_one_hot_encoded)

        pbar = tqdm(unit='imgs')
        try:
            while True:
                attacked_imgs = sess.run(x_adv)

                if not FLAGS.debug:
                    hdf5utils.add_images_to_dataset(attacked_imgs,
                                                    hdf5_dataset)

                pbar.update()
        except tf.errors.OutOfRangeError:
            pass
Example #24
print(results)
print("results on target model: ")
results = metrics(model_target, X_adv, X_test, y_test, indices)
print(results)    

#####BIM
print("BIM")
bim_params = {'eps': 0.03,
              'nb_iter': 300,
              'eps_iter': 0.03/100,
              'ord': np.inf,
              'clip_min': 0.,
              'clip_max': 1.,
              'rand_init': False
              }
bim = ProjectedGradientDescent(wrap, sess=sess)
X_adv = np.zeros((len(indices),32,32,3))
for i in range(0,len(indices),batch_attack):
    X_adv[i:i+batch_attack] = bim.generate_np(X_test[indices[i:(i+batch_attack)]], **bim_params)
print("results on source model: ")
results = metrics(model, X_adv, X_test, y_test, indices)
print(results)    
print("results on target model: ")
results = metrics(model_target, X_adv, X_test, y_test, indices)
print(results)   

#####CWL2
print("CWL2")
cwl2_params = {'binary_search_steps': 10,
               'max_iterations': 100,
               'learning_rate': 0.1,
Example #25
def fixed_max_confidence_recipe(sess,
                                model,
                                x,
                                y,
                                nb_classes,
                                eps,
                                clip_min,
                                clip_max,
                                eps_iter,
                                nb_iter,
                                report_path,
                                batch_size=BATCH_SIZE):
    """A reasonable attack bundling recipe for a max norm threat model and
  a defender that uses confidence thresholding.

  References:
  https://openreview.net/forum?id=H1g0piA9tQ

  This version runs each attack a fixed number of times.
  It is more exhaustive than `single_run_max_confidence_recipe` but because
  it uses a fixed budget rather than running indefinitely it is more
  appropriate for making fair comparisons between two models.

  :param sess: tf.Session
  :param model: cleverhans.model.Model
  :param x: numpy array containing clean example inputs to attack
  :param y: numpy array containing true labels
  :param nb_classes: int, number of classes
  :param eps: float, maximum size of perturbation (measured by max norm)
  :param eps_iter: float, step size for one version of PGD attacks
    (will also run another version with 25X smaller step size)
  :param nb_iter: int, number of iterations for one version of PGD attacks
    (will also run another version with 25X more iterations)
  :param report_path: str, the path that the report will be saved to.
  :param batch_size: int, the total number of examples to run simultaneously
  """
    noise_attack = Noise(model, sess)
    pgd_attack = ProjectedGradientDescent(model, sess)
    threat_params = {"eps": eps, "clip_min": clip_min, "clip_max": clip_max}
    noise_attack_config = AttackConfig(noise_attack, threat_params)
    attack_configs = [noise_attack_config]
    pgd_attack_configs = []
    pgd_params = copy.copy(threat_params)
    pgd_params["eps_iter"] = eps_iter
    pgd_params["nb_iter"] = nb_iter
    assert batch_size % num_devices == 0
    dev_batch_size = batch_size // num_devices
    ones = tf.ones(dev_batch_size, tf.int32)
    expensive_pgd = []
    for cls in range(nb_classes):
        cls_params = copy.copy(pgd_params)
        cls_params['y_target'] = tf.to_float(tf.one_hot(
            ones * cls, nb_classes))
        cls_attack_config = AttackConfig(pgd_attack, cls_params,
                                         "pgd_" + str(cls))
        pgd_attack_configs.append(cls_attack_config)
        expensive_params = copy.copy(cls_params)
        expensive_params["eps_iter"] /= 25.
        expensive_params["nb_iter"] *= 25.
        expensive_config = AttackConfig(pgd_attack, expensive_params,
                                        "expensive_pgd_" + str(cls))
        expensive_pgd.append(expensive_config)
    attack_configs = [noise_attack_config] + pgd_attack_configs + expensive_pgd
    new_work_goal = {config: 5 for config in attack_configs}
    pgd_work_goal = {config: 5 for config in pgd_attack_configs}
    # TODO: lower priority: make sure bundler won't waste time running targeted
    # attacks on examples where the target class is the true class
    goals = [
        Misclassify(new_work_goal={noise_attack_config: 50}),
        Misclassify(new_work_goal=pgd_work_goal),
        MaxConfidence(t=0.5, new_work_goal=new_work_goal),
        MaxConfidence(t=0.75, new_work_goal=new_work_goal),
        MaxConfidence(t=0.875, new_work_goal=new_work_goal),
        MaxConfidence(t=0.9375, new_work_goal=new_work_goal),
        MaxConfidence(t=0.96875, new_work_goal=new_work_goal),
        MaxConfidence(t=0.984375, new_work_goal=new_work_goal),
        MaxConfidence(t=1., new_work_goal=new_work_goal)
    ]
    bundle_attacks(sess, model, x, y, attack_configs, goals, report_path)
Example #26
def eval(sess, model_name, X_train, Y_train, X_test, Y_test, cnn=False, rbf=False, fgsm=False, jsma=False, df=False, bim=False):
    """ Load model saved in model_name.json and model_name_weights.h5 and 
    evaluate its accuracy on legitimate test samples and adversarial samples.
    Use cnn=True if the model is CNN based.
    """

    # open text file and output accuracy results to it
    text_file = open("mnist_results.txt", "w")

    # load saved model
    print("Load model ... ")
    '''
    json = open('models/{}.json'.format(model_name), 'r')
    model = json.read()
    json.close()
    loaded_model = model_from_json(model)
    loaded_model.load_weights("models/{}_weights.h5".format(model_name))
    '''
    if rbf:
        loaded_model = load_model("rbfmodels/{}.h5".format(model_name), custom_objects={'RBFLayer': RBFLayer})
        text_file.write('Evaluating on rbfmodels/{}.h5\n\n'.format(model_name))
    else:
        loaded_model = load_model("models/{}.h5".format(model_name))
        text_file.write('Evaluating on models/{}.h5\n\n'.format(model_name))

    # Set placeholders
    if cnn:
        x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    else:
        x = tf.placeholder(tf.float32, shape=(None, 784))

    y = tf.placeholder(tf.float32, shape=(None, 10))

    predictions = loaded_model(x)

    accuracy = model_eval(sess, x, y, predictions, X_test, Y_test, args={ "batch_size" : 128 })
    text_file.write('Test accuracy on legitimate test examples: {0}\n'.format(str(accuracy)))
    #print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Craft adversarial examples depending on the input parameters
    wrap = KerasModelWrapper(loaded_model)
    
    # FGSM
    if fgsm:
        fgsm = FastGradientMethod(wrap, sess=sess)
        fgsm_params = {'eps': 0.3}
        adv_x = fgsm.generate(x, **fgsm_params)
        adv_x = tf.stop_gradient(adv_x)
        preds_adv = loaded_model(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128})
        text_file.write('Test accuracy on fgsm adversarial test examples: {0}\n'.format(str(accuracy)))
        #print('Test accuracy on fgsm adversarial test examples: ' + str(accuracy))

    # JSMA
    if jsma:
        jsma = SaliencyMapMethod(wrap, sess=sess)
        jsma_params = {'theta': 2., 'gamma': 0.145,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}
        adv_x = jsma.generate(x, **jsma_params)
        adv_x = tf.stop_gradient(adv_x)
        preds_adv = loaded_model(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128})
        text_file.write('Test accuracy on jsma adversarial test examples: {0}\n'.format(str(accuracy)))
        #print('Test accuracy on jsma adversarial test examples: ' + str(accuracy))

    # DeepFool
    if df:
        df = DeepFool(wrap, sess=sess)
        df_params = {'nb_candidate': 10,
                 'max_iter': 50}
        adv_x = df.generate(x, **df_params)
        adv_x = tf.stop_gradient(adv_x)
        preds_adv = loaded_model(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128})
        text_file.write('Test accuracy on df adversarial test examples: {0}\n'.format(str(accuracy)))
        #print('Test accuracy on df adversarial test examples: ' + str(accuracy))

    # Basic Iterative Method
    if bim:
        bim = ProjectedGradientDescent(wrap, sess=sess)
        bim_params = {'eps': 0.3}
        adv_x = bim.generate(x, **bim_params)
        adv_x = tf.stop_gradient(adv_x)
        preds_adv = loaded_model(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_adv, X_test, Y_test, args={ "batch_size" : 128})
        text_file.write('Test accuracy on bim adversarial test examples: {0}\n'.format(str(accuracy)))
        #print('Test accuracy on bim adversarial test examples: ' + str(accuracy))

    print('Accuracy results outputted to mnist_results.txt')
    text_file.close()

    # Close TF session
    sess.close()
Example #27
def gen_adv_data(model, x, y, method, dataset, batch=2048):
    sess = K.get_session()
    model_wrap = KerasModelWrapper(model)
    if method.upper() == 'CW':
        params = {
            'binary_search_steps': 1,
            'y': y,
            'learning_rate': .1,
            'max_iterations': 50,
            'initial_const': 10,
            'batch_size': batch,
            # 'clip_min': -0.5,
            # 'clip_max': 0.5
        }
        attack = CarliniWagnerL2(model_wrap, sess=sess)

        data_num = x.shape[0]
        begin, end = 0, batch
        adv_x_all = np.zeros_like(x)
        # every time process batch_size
        while end < data_num:
            start_time = time.time()
            params['y'] = y[begin:end]
            adv_x = attack.generate_np(x[begin:end], **params)
            adv_x_all[begin:end] = adv_x
            print(begin, end, "done")
            begin += batch
            end += batch
            end_time = time.time()
            print("time: ", end_time - start_time)

        # process the remaining
        if begin < data_num:
            start_time = time.time()
            params['y'] = y[begin:]
            params['batch_size'] = data_num - begin
            adv_x = attack.generate_np(x[begin:], **params)
            adv_x_all[begin:] = adv_x
            print(begin, data_num, "done")
            end_time = time.time()
            print("time: ", end_time - start_time)

    elif method.upper() == 'PGD':
        if dataset == 'cifar':
            params = {
                'eps': 16. / 255.,
                'eps_iter': 2. / 255.,
                'nb_iter': 30,
                # 'clip_min': -0.5,
                # 'clip_max': 0.5,
                'y': y
            }
            attack = ProjectedGradientDescent(model_wrap, sess=sess)
        elif dataset == 'mnist':
            params = {
                'eps': .3,
                'eps_iter': .03,
                'nb_iter': 20,
                'clip_min': -0.5,
                'clip_max': 0.5,
                'y': y
            }
            attack = ProjectedGradientDescent(model_wrap, sess=sess)
        elif dataset == 'svhn':
            params = {
                'eps': 8. / 255.,
                'eps_iter': 0.01,
                'nb_iter': 30,
                'clip_min': -0.5,
                'clip_max': 0.5,
                'y': y
            }
            attack = ProjectedGradientDescent(model_wrap, sess=sess)

        data_num = x.shape[0]
        begin, end = 0, batch
        adv_x_all = np.zeros_like(x)
        # every time process batch_size
        while end < data_num:
            start_time = time.time()
            params['y'] = y[begin:end]
            adv_x = attack.generate_np(x[begin:end], **params)
            adv_x_all[begin:end] = adv_x
            print(begin, end, "done")
            begin += batch
            end += batch
            end_time = time.time()
            print("time: ", end_time - start_time)

        # process the remaining
        if begin < data_num:
            start_time = time.time()
            params['y'] = y[begin:]
            adv_x = attack.generate_np(x[begin:], **params)
            adv_x_all[begin:] = adv_x
            print(begin, data_num, "done")
            end_time = time.time()
            print("time: ", end_time - start_time)

    else:
        print('Unsupported attack')
        sys.exit(1)

    return adv_x_all
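
A usage sketch; labels are assumed to be one-hot encoded, since the 'y' entry is passed straight to the attack:

y_test_onehot = keras.utils.to_categorical(y_test, 10)
adv_x = gen_adv_data(model, x_test, y_test_onehot,
                     method='PGD', dataset='mnist', batch=2048)
np.save('adv_mnist_pgd.npy', adv_x)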
Example #28
def train(ARGS):
    # Define helper function for evaluating on test data during training
    def eval(epoch):
        from train_utils import clean_eval
        test_accuracy, test_loss, _ = clean_eval(sess, x, y, is_training,
                                                 testloader, n_classes, logits,
                                                 preds)
        # Write tensorboard summary
        acc_summary = tf.Summary()
        acc_summary.value.add(tag='Evaluation/accuracy/test',
                              simple_value=test_accuracy)
        writer_test.add_summary(acc_summary, epoch)

        # Write tensorboard summary
        err_summary = tf.Summary()
        err_summary.value.add(tag='Evaluation/error/test',
                              simple_value=1.0 - test_accuracy)
        writer_test.add_summary(err_summary, epoch)

        # Write tensorboard summary
        loss_summary = tf.Summary()
        loss_summary.value.add(tag='Evaluation/loss/test',
                               simple_value=test_loss)
        writer_test.add_summary(loss_summary, epoch)

    # Define helper function for evaluating on adversarial test data during training
    def adv_eval(epoch):
        from train_utils import adversarial_eval
        adv_accuracy, adv_loss = adversarial_eval(sess,
                                                  x,
                                                  y,
                                                  is_training,
                                                  adv_testloader,
                                                  n_classes,
                                                  preds,
                                                  adv_preds,
                                                  eval_all=True)

        # Write tensorboard summary
        acc_summary = tf.Summary()
        acc_summary.value.add(tag='Evaluation/adversarial-accuracy/test',
                              simple_value=adv_accuracy)
        writer_test.add_summary(acc_summary, epoch)

        # Write tensorboard summary
        err_summary = tf.Summary()
        err_summary.value.add(tag='Evaluation/adversarial-error/test',
                              simple_value=1.0 - adv_accuracy)
        writer_test.add_summary(err_summary, epoch)

        # Write tensorboard summary
        loss_summary = tf.Summary()
        loss_summary.value.add(tag='Evaluation/adversarial-loss/test',
                               simple_value=adv_loss)
        writer_test.add_summary(loss_summary, epoch)

    # Define computational graph
    with tf.Graph().as_default() as g:
        # Define placeholders
        with tf.device('/gpu:0'):
            with tf.name_scope('Placeholders'):
                x = tf.placeholder(dtype=tf.float32,
                                   shape=input_shape,
                                   name='inputs')
                x_pair1 = tf.placeholder(dtype=tf.float32,
                                         shape=input_shape,
                                         name='x-pair1')
                x_pair2 = tf.placeholder(dtype=tf.float32,
                                         shape=input_shape,
                                         name='x-pair2')
                y = tf.placeholder(dtype=tf.float32,
                                   shape=(None, n_classes),
                                   name='labels')
                is_training = tf.placeholder_with_default(True,
                                                          shape=(),
                                                          name='is-training')

        # Define TF session
        config = tf.ConfigProto(log_device_placement=False,
                                allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(graph=g, config=config)

        # Define model
        with tf.name_scope('Model'):
            with tf.device('/gpu:0'):
                model = Model(nb_classes=n_classes,
                              input_shape=input_shape,
                              is_training=is_training)

                # Define forward-pass
                with tf.name_scope('Logits'):
                    logits = model.get_logits(x)
                with tf.name_scope('Probs'):
                    preds = tf.nn.softmax(logits)

                with tf.name_scope('Accuracy'):
                    ground_truth = tf.argmax(y, axis=1)
                    predicted_label = tf.argmax(preds, axis=1)
                    correct_prediction = tf.equal(predicted_label,
                                                  ground_truth)
                    acc = tf.reduce_mean(tf.to_float(correct_prediction),
                                         name='accuracy')
                    tf.add_to_collection('accuracies', acc)

                    err = tf.identity(1.0 - acc, name='error')
                    tf.add_to_collection('accuracies', err)

                # Define losses
                with tf.name_scope('Losses'):
                    ce_loss, wd_loss, clp_loss, lsq_loss, at_loss, alp_loss = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
                    adv_logits = None

                    if ARGS.ct:
                        with tf.name_scope('Cross-Entropy-Loss'):
                            ce_loss = tf.reduce_mean(
                                tf.nn.softmax_cross_entropy_with_logits(
                                    logits=logits, labels=y),
                                name='cross-entropy-loss')

                            tf.add_to_collection('losses', ce_loss)

                    if ARGS.at:
                        with tf.name_scope('Adversarial-Cross-Entropy-Loss'):
                            at_loss, adv_logits = get_at_loss(
                                sess, x, y, model, ARGS.eps, ARGS.eps_iter,
                                ARGS.nb_iter)
                            at_loss = tf.identity(at_loss, name='at-loss')
                            tf.add_to_collection('losses', at_loss)

                    with tf.name_scope('Regularizers'):
                        if ARGS.wd:
                            with tf.name_scope('Weight-Decay'):
                                for var in tf.trainable_variables():
                                    if 'beta' in var.op.name:
                                        # Do not regularize bias of batch normalization
                                        continue
                                    # print('regularizing: ', var.op.name)
                                    wd_loss += tf.nn.l2_loss(var)

                                reg_loss = tf.identity(wd_loss, name='wd-loss')
                                tf.add_to_collection('losses', reg_loss)

                        if ARGS.alp:
                            with tf.name_scope('Adversarial-Logit-Pairing'):
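                                # Note: adv_logits is populated above only when
                                # ARGS.at is enabled; otherwise None is passed
                                # through to get_alp_loss.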
                                alp_loss = get_alp_loss(
                                    sess, x, y, logits, adv_logits, model,
                                    ARGS.eps, ARGS.eps_iter, ARGS.nb_iter)

                                alp_loss = tf.identity(alp_loss,
                                                       name='alp-loss')
                                tf.add_to_collection('losses', alp_loss)

                        if ARGS.clp:
                            with tf.name_scope('Clean-Logit-Pairing'):
                                clp_loss = get_clp_loss(
                                    x_pair1, x_pair2, model)
                                clp_loss = tf.identity(clp_loss,
                                                       name='clp-loss')
                                tf.add_to_collection('losses', clp_loss)

                        if ARGS.lsq:
                            with tf.name_scope('Logit-Squeezing'):
                                lsq_loss = get_lsq_loss(x, model)
                                lsq_loss = tf.identity(lsq_loss,
                                                       name='lsq-loss')
                                tf.add_to_collection('losses', lsq_loss)

                    with tf.name_scope('Total-Loss'):
                        # Define objective function
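                        # The weighted sum implements
                        #   L = ct_lambda*CE + at_lambda*AT + wd_lambda*WD
                        #     + clp_lambda*CLP + lsq_lambda*LSQ + alp_lambda*ALP
                        # where disabled terms stay at their 0.0 defaults and
                        # contribute nothing.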
                        total_loss = (ARGS.ct_lambda * ce_loss +
                                      ARGS.at_lambda * at_loss +
                                      ARGS.wd_lambda * wd_loss +
                                      ARGS.clp_lambda * clp_loss +
                                      ARGS.lsq_lambda * lsq_loss +
                                      ARGS.alp_lambda * alp_loss)

                        total_loss = tf.identity(total_loss, name='total-loss')
                        tf.add_to_collection('losses', total_loss)

                # Define PGD adversary
                with tf.name_scope('PGD-Attacker'):
                    pgd_params = {
                        'ord': np.inf,
                        'y': y,
                        'eps': ARGS.eps / 255,
                        'eps_iter': ARGS.eps_iter / 255,
                        'nb_iter': ARGS.nb_iter,
                        'rand_init': True,
                        'rand_minmax': ARGS.eps / 255,
                        'clip_min': 0.,
                        'clip_max': 1.,
                        'sanity_checks': True
                    }
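                    # ARGS.eps and ARGS.eps_iter are specified in [0, 255]
                    # pixel units; dividing by 255 rescales them to the [0, 1]
                    # input range implied by clip_min/clip_max.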

                    pgd = ProjectedGradientDescent(model, sess=sess)
                    adv_x = pgd.generate(x, **pgd_params)

                    with tf.name_scope('Logits'):
                        adv_logits = model.get_logits(adv_x)
                    with tf.name_scope('Probs'):
                        adv_preds = tf.nn.softmax(adv_logits)
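                    # adv_x, adv_logits and adv_preds back the adversarial
                    # evaluation helper (adv_eval) defined above.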

        # Define optimizer
        with tf.device('/gpu:0'):
            with tf.name_scope('Optimizer'):
                # Define global step variable
                global_step = tf.get_variable(
                    name='global_step',
                    shape=[],  # scalar
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer(),
                    trainable=False)

                optimizer = tf.train.AdamOptimizer(learning_rate=ARGS.lr,
                                                   beta1=0.9,
                                                   beta2=0.999,
                                                   epsilon=1e-6,
                                                   use_locking=False,
                                                   name='Adam')
                trainable_vars = tf.trainable_variables()

                # tf.GraphKeys.UPDATE_OPS holds the moving_mean and
                # moving_variance update ops for batch normalization.
                update_bn_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                with tf.control_dependencies(update_bn_ops):
                    grads_and_vars = optimizer.compute_gradients(
                        total_loss, trainable_vars)
                    train_step = optimizer.apply_gradients(
                        grads_and_vars, global_step=global_step)

        # Add Tensorboard summaries
        with tf.device('/gpu:0'):
            # Create file writers
            writer_train = tf.summary.FileWriter(ARGS.log_dir + '/train',
                                                 graph=g)
            writer_test = tf.summary.FileWriter(ARGS.log_dir + '/test')

            # Add summary for input images
            with tf.name_scope('Image-Summaries'):
                # Create image summary ops
                tf.summary.image('input',
                                 x,
                                 max_outputs=2,
                                 collections=['training'])

            # Add summaries for the training losses
            losses = tf.get_collection('losses')
            for entry in losses:
                tf.summary.scalar(entry.name, entry, collections=['training'])

            # Add summaries for the training accuracies
            accs = tf.get_collection('accuracies')
            for entry in accs:
                tf.summary.scalar(entry.name, entry, collections=['training'])

            # Add summaries for all trainable vars
            for var in trainable_vars:
                tf.summary.histogram(var.op.name,
                                     var,
                                     collections=['training'])
                var_norm = tf.norm(var, ord='euclidean')
                tf.summary.scalar(var.op.name + '/l2norm',
                                  var_norm,
                                  collections=['training'])

            # Add summaries for variable gradients
            for grad, var in grads_and_vars:
                if grad is not None:
                    tf.summary.histogram(var.op.name + '/gradients',
                                         grad,
                                         collections=['training'])
                    grad_norm = tf.norm(grad, ord='euclidean')
                    tf.summary.scalar(var.op.name + '/gradients/l2norm',
                                      grad_norm,
                                      collections=['training'])

            # Add summaries for the logits and model predictions
            with tf.name_scope('Logits-Summaries'):
                variable_summaries(tf.identity(logits, name='logits'),
                                   name='logits',
                                   collections=['training', 'test'],
                                   histo=True)
            with tf.name_scope('Predictions-Summaries'):
                variable_summaries(tf.identity(preds, name='predictions'),
                                   name='predictions',
                                   collections=['training', 'test'],
                                   histo=True)

        # Initialize all variables
        with sess.as_default():
            tf.global_variables_initializer().run()

        # Collect training params
        train_params = {
            'epochs': ARGS.epochs,
            'eval_step': ARGS.eval_step,
            'adv_eval_step': ARGS.adv_eval_step,
            'n_classes': n_classes,
            'clp': ARGS.clp
        }

        # Start training loop
        model_train(sess,
                    x,
                    y,
                    x_pair1,
                    x_pair2,
                    is_training,
                    trainloader,
                    train_step,
                    args=train_params,
                    evaluate=eval,
                    adv_evaluate=adv_eval,
                    writer_train=writer_train)

        # Save the trained model
        if ARGS.save:
            save_path = os.path.join(ARGS.save_dir, ARGS.filename)
            saver = tf.train.Saver(var_list=tf.global_variables())
            saver.save(sess, save_path)
            print("Saved model at {:s}".format(str(ARGS.save_dir)))
Exemplo n.º 29
    def attack(self, path, session):
        print_and_log(self.logfile, "")  # add a blank line
        print_and_log(self.logfile, 'Attacking model {0:}: '.format(path))
        self.model = self.init_model()
        self.model.load_state_dict(torch.load(path))
        pgd_parameters = self.pgd_params()

        class_index = 0
        context_images, target_images, context_labels, target_labels, context_images_np = None, None, None, None, None

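        # model_wrapper exposes the few-shot model to CleverHans as a
        # function of a single context image: it splices context_point_x
        # into the context set at class_index and returns the logits for
        # the target set.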
        def model_wrapper(context_point_x):
            # Insert context_point at correct spot
            context_images_attack = torch.cat(
                [context_images[0:class_index], context_point_x,
                 context_images[class_index + 1:]],
                dim=0)

            target_logits = self.model(context_images_attack, context_labels,
                                       target_images)
            return target_logits[0]

        tf_model_conv = convert_pytorch_model_to_tf(model_wrapper,
                                                    out_dims=self.args.way)
        tf_model = cleverhans.model.CallableModelWrapper(
            tf_model_conv, 'logits')
        pgd = ProjectedGradientDescent(tf_model,
                                       sess=session,
                                       dtypestr='float32')

        for item in self.test_set:

            for t in range(self.args.attack_tasks):

                task_dict = self.dataset.get_test_task(item, session)
                context_images, target_images, context_labels, target_labels, context_images_np = self.prepare_task(
                    task_dict, shuffle=False)
                # Use clone() for an independent copy; detach() would share
                # storage with the original tensor, which isn't what we want.
                context_images_attack_all = context_images.clone()
                # Is requires_grad True here for context_images?

                for c in torch.unique(context_labels):
                    # Adversarial input context image
                    class_index = extract_class_indices(context_labels,
                                                        c)[0].item()
                    context_x = np.expand_dims(context_images_np[class_index],
                                               0)

                    # Input to the model wrapper is automatically converted to Torch tensor for us

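                    # Note: building a fresh placeholder and attack op on each
                    # loop iteration keeps the code simple but grows the TF
                    # graph every pass; hoisting these ops out of the loops
                    # would be more economical.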
                    x = tf.placeholder(tf.float32, shape=context_x.shape)

                    adv_x_op = pgd.generate(x, **pgd_parameters)
                    preds_adv_op = tf_model.get_logits(adv_x_op)

                    feed_dict = {x: context_x}
                    adv_x, preds_adv = session.run((adv_x_op, preds_adv_op),
                                                   feed_dict=feed_dict)

                    context_images_attack_all[class_index] = torch.from_numpy(
                        adv_x)

                    save_image(adv_x,
                               os.path.join(self.checkpoint_dir, 'adv.png'))
                    save_image(context_x,
                               os.path.join(self.checkpoint_dir, 'in.png'))

                    acc_after = torch.mean(
                        torch.eq(
                            target_labels,
                            torch.argmax(torch.from_numpy(preds_adv).to(
                                self.device),
                                         dim=-1)).float()).item()

                    with torch.no_grad():
                        logits = self.model(context_images, context_labels,
                                            target_images)
                        acc_before = torch.mean(
                            torch.eq(target_labels,
                                     torch.argmax(logits,
                                                  dim=-1)).float()).item()
                        del logits

                    diff = acc_before - acc_after
                    print_and_log(
                        self.logfile,
                        "Task = {}, Class = {} \t Diff = {}".format(
                            t, c, diff))

                print_and_log(self.logfile,
                              "Accuracy before {}".format(acc_before))
                logits = self.model(context_images_attack_all, context_labels,
                                    target_images)
                acc_all_attack = torch.mean(
                    torch.eq(target_labels,
                             torch.argmax(logits, dim=-1)).float()).item()
                print_and_log(self.logfile,
                              "Accuracy after {}".format(acc_all_attack))
Exemplo n.º 30
    def setUp(self):
        super(TestProjectedGradientDescent, self).setUp()
        self.attack = ProjectedGradientDescent(self.model, sess=self.sess)
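
    # A test method built on this setUp might look like the following sketch.
    # It uses the standard CleverHans generate_np interface; the input shape,
    # attack budget, and tolerance are illustrative assumptions.
    def test_generate_np_respects_linf_bound(self):
        import numpy as np
        # Adversarial examples should stay inside the L-inf ball of radius
        # eps around the clean inputs.
        x_val = np.random.rand(10, 2).astype(np.float32)
        x_adv = self.attack.generate_np(x_val, eps=0.1, eps_iter=0.02,
                                        nb_iter=10, ord=np.inf,
                                        clip_min=0.0, clip_max=1.0)
        self.assertLessEqual(np.max(np.abs(x_adv - x_val)), 0.1 + 1e-6)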