model.copy(sess, model_fix)
saver = tf.train.Saver()
attack_denoiser = LinfPGDAttack(model, config['epsilon'], config['k'],
                                config['a'], config['random_start'],
                                config['loss_func'])

# progressive feature matching
fea_matching = init_fea(sess, model, model_fix, distance_flag='L_inf')
# saver.save(sess, os.path.join(model_dir, 'checkpoint'), global_step=adv_ep)

for ii in range(max_num_training_steps):  # over all adversarial data
    x_batch_nat, y_batch = mnist.train.next_batch(batch_size)
    x_batch_adv = attack_denoiser.perturb(x_batch_nat, y_batch, sess)
    adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

    for i, tag_i in enumerate(fea_matching.tag_list):  # layer by layer
        fea_matching.apply(sess, x_batch_nat, x_batch_adv, y_batch, tag_i)

    # monitor the accuracy
    if ii % 100 == 0:
        ######## training error
        nat_dict = {model.x_input: x_batch_nat, model.y_input: y_batch}
        nat_acc = sess.run(model.accuracy, feed_dict=nat_dict)
        hist_nat_acc += [nat_acc]
        adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}
        adv_acc = sess.run(model.accuracy, feed_dict=adv_dict)
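# The snippets in this section call attack.perturb(x, y, sess) without
# showing the attack itself. Below is a minimal sketch of what an L-inf PGD
# perturb typically looks like in this TF1 style; the constructor mirrors the
# LinfPGDAttack(model, epsilon, k, a, random_start, loss_func) calls above,
# but the class body is an illustrative reconstruction, not the repository's
# actual implementation.
import numpy as np
import tensorflow as tf

class LinfPGDAttackSketch:
    """Illustrative L-inf PGD attack; assumes `model` exposes
    x_input, y_input, and a scalar cross-entropy loss `model.xent`."""

    def __init__(self, model, epsilon, k, a, random_start, loss_func='xent'):
        self.model = model
        self.epsilon = epsilon      # L-inf radius
        self.k = k                  # number of PGD steps
        self.a = a                  # step size
        self.random_start = random_start
        # gradient of the loss w.r.t. the input image
        self.grad = tf.gradients(model.xent, model.x_input)[0]

    def perturb(self, x_nat, y, sess):
        if self.random_start:
            x = x_nat + np.random.uniform(-self.epsilon, self.epsilon,
                                          x_nat.shape)
        else:
            x = np.copy(x_nat)
        for _ in range(self.k):
            g = sess.run(self.grad, feed_dict={self.model.x_input: x,
                                               self.model.y_input: y})
            x = x + self.a * np.sign(g)                       # ascent step
            x = np.clip(x, x_nat - self.epsilon, x_nat + self.epsilon)
            x = np.clip(x, 0.0, 1.0)                          # valid pixels
        return x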
def advs_train(dataset='cifar-10', loss_name='ce', epochs=120,
               dynamic_epoch=100, batch_size=128, fosc_max=0.5,
               epsilon=0.031):
    """
    Adversarial training with PGD attack.
    """
    print('DynamicAdvsTrain - Data set: %s, loss: %s, epochs: %s, '
          'dynamic_epoch: %s, batch: %s, epsilon: %s' %
          (dataset, loss_name, epochs, dynamic_epoch, batch_size, epsilon))

    X_train, Y_train, X_test, Y_test = get_data(dataset, clip_min=0.,
                                                clip_max=1., onehot=True)
    n_images = X_train.shape[0]
    image_shape = X_train.shape[1:]
    n_class = Y_train.shape[1]
    print("n_images:", n_images, "n_class:", n_class,
          "image_shape:", image_shape)

    model = get_model(dataset, input_shape=image_shape, n_class=n_class,
                      softmax=True)
    # model.summary()

    # create loss
    if loss_name == 'ce':
        loss = cross_entropy
    else:
        print("New loss function should be defined first.")
        return

    optimizer = SGD(lr=0.01, decay=1e-4, momentum=0.9)
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

    # data augmentation
    if dataset in ['mnist']:
        datagen = ImageDataGenerator()
    elif dataset in ['cifar-10']:
        datagen = ImageDataGenerator(rotation_range=10, width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     horizontal_flip=True)
    else:
        datagen = ImageDataGenerator(width_shift_range=0.2,
                                     height_shift_range=0.2,
                                     horizontal_flip=True)
    datagen.fit(X_train)

    # PGD attack for training
    attack = LinfPGDAttack(model, epsilon=epsilon, eps_iter=epsilon / 4,
                           nb_iter=10, random_start=True, loss_func='xent',
                           clip_min=np.min(X_train), clip_max=np.max(X_train))

    # initialize logger
    mylogger = Logger(K.get_session(), model, X_train, Y_train, X_test,
                      Y_test, dataset, loss_name, epochs,
                      suffix='%s' % epsilon)

    batch_iterator = datagen.flow(X_train, Y_train, batch_size=batch_size)

    start_time = time.time()
    for ep in range(epochs):
        # learning rate decay
        if (ep + 1) == 60:
            lr = float(K.get_value(model.optimizer.lr))
            K.set_value(model.optimizer.lr, lr / 10.0)
        if (ep + 1) == 100:
            lr = float(K.get_value(model.optimizer.lr))
            K.set_value(model.optimizer.lr, lr / 10.0)
        lr = float(K.get_value(model.optimizer.lr))

        # a simple linear decrease of fosc
        fosc = fosc_max - fosc_max * (ep * 1.0 / dynamic_epoch)
        fosc = np.max([fosc, 0.0])

        steps_per_epoch = int(X_train.shape[0] / batch_size)
        pbar = tqdm(range(steps_per_epoch))
        for it in pbar:
            batch_x, batch_y = batch_iterator.next()
            batch_advs, fosc_batch = attack.perturb(K.get_session(), batch_x,
                                                    batch_y, batch_size, ep,
                                                    fosc)
            probs = model.predict(batch_advs)
            loss_weight = np.max(-batch_y * np.log(probs + 1e-12), axis=1)

            if it == 0:
                fosc_all = fosc_batch
            else:
                fosc_all = np.concatenate((fosc_all, fosc_batch), axis=0)

            # batch_loss/batch_acc avoid shadowing the `loss` function
            # selected above
            if ep == 0:
                batch_loss, batch_acc = model.train_on_batch(batch_advs,
                                                             batch_y)
            else:
                batch_loss, batch_acc = model.train_on_batch(
                    batch_advs, batch_y, sample_weight=loss_weight)
            pbar.set_postfix(acc='%.4f' % batch_acc,
                             loss='%.4f' % batch_loss)

        print('All time:', time.time() - start_time)

        log_path = './log'
        file_name = os.path.join(
            log_path, 'BatchSize_{}_Epoch_{}_fosc.npy'.format(batch_size, ep))
        np.save(file_name, fosc_all)

        val_loss, val_acc = model.evaluate(X_test, Y_test,
                                           batch_size=batch_size, verbose=0)
        logs = {
            'acc': batch_acc,
            'loss': batch_loss,
            'val_acc': val_acc,
            'val_loss': val_loss
        }
        print("Epoch %s - loss: %.4f - acc: %.4f - val_loss: %.4f - "
              "val_acc: %.4f" % (ep, batch_loss, batch_acc, val_loss,
                                 val_acc))

        # save the log and model every epoch
        mylogger.on_epoch_end(epoch=ep, logs=logs)
        model.save_weights("model/advs_%s_%s_%s_%s.hdf5" %
                           (dataset, loss_name, epsilon, ep))
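# advs_train drives the attack with a linearly annealed `fosc` target. FOSC
# (first-order stationary condition) measures how close an adversarial
# example is to a local maximum of the loss inside the epsilon-ball. One
# common formulation for an L-inf ball around the clean input is sketched
# below; the helper name and signature are assumptions, not the attack's
# actual API.
import numpy as np

def fosc_value(x_adv, x_nat, grad, epsilon):
    """FOSC for an L-inf ball of radius epsilon around x_nat:

        c(x) = epsilon * ||grad||_1 - <x_adv - x_nat, grad>

    c(x) >= 0, and c(x) == 0 exactly at a first-order stationary point of
    the inner maximization. `grad` is the loss gradient at x_adv.
    """
    b = x_adv.shape[0]
    g = grad.reshape(b, -1)
    d = (x_adv - x_nat).reshape(b, -1)
    return epsilon * np.abs(g).sum(axis=1) - (d * g).sum(axis=1)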
shutil.copy('config.json', model_dir)

with tf.Session() as sess:
    # Initialize the summary writer, global variables, and our time counter.
    summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
    sess.run(tf.global_variables_initializer())
    training_time = 0.0

    # Main training loop
    for ii in range(max_num_training_steps):
        x_batch, y_batch = mnist.train.next_batch(batch_size)

        # Compute Adversarial Perturbations
        start = timer()
        x_batch_adv = attack.perturb(x_batch, y_batch, sess,
                                     trades=args.trades)
        end = timer()
        training_time += end - start

        full_dict = {
            x_nat_input: x_batch,
            x_adv_input: x_batch_adv,
            y_input: y_batch
        }

        # Output to stdout
        if ii % num_output_steps == 0:
            (nat_acc_batch, adv_acc_batch, xent_batch, kl_batch,
             grad_reg_loss_batch) = sess.run([
                 nat_acc, adv_acc, adv_mean_xent
def train(tf_seed, np_seed, train_steps, only_finetune, finetune_train_steps,
          out_steps, summary_steps, checkpoint_steps, step_size_schedule,
          weight_decay, momentum, train_batch_size, do_advtrain, do_advreg,
          epsilon, pgd_steps, step_size, random_start, loss_func, replay_m,
          model_dir, source_model_dir, dataset, data_dir, beta, gamma,
          disc_update_steps, adv_update_steps_per_iter, disc_layers,
          disc_base_channels, steps_before_adv_opt,
          steps_before_adv_training, adv_encoder_type,
          enc_output_activation, sep_opt_version, grad_image_ratio,
          final_grad_image_ratio, num_grad_image_ratios,
          normalize_zero_mean, eval_adv_attack, same_optimizer,
          only_fully_connected, disc_avg_pool_hw,
          finetuned_source_model_dir, train_finetune_source_model,
          finetune_img_random_pert, img_random_pert, model_suffix,
          model_type, **kwargs):
    tf.set_random_seed(tf_seed)
    np.random.seed(np_seed)

    # Add pgd params to model name
    if do_advtrain:
        model_dir = model_dir + '_AdvTrain'
        if epsilon != 8:
            model_dir = model_dir + '_ep%d' % (epsilon)
        if random_start != True:
            model_dir = model_dir + '_norandstart'
        if pgd_steps != 7:
            model_dir = model_dir + '_%dsteps' % (pgd_steps)
        if step_size != 2:
            model_dir = model_dir + '_stepsize%d' % (step_size)
    model_dir = model_dir + '-{}-'.format(model_type)
    model_dir = model_dir + 'IGAM-%s_b%d' % (dataset, train_batch_size)

    # TODO Replace with not defaults
    if tf_seed != 451760341:
        model_dir = model_dir + '_tf_seed%d' % (tf_seed)
    if np_seed != 216105420:
        model_dir = model_dir + '_np_seed%d' % (np_seed)
    model_dir = model_dir + model_suffix

    # Setting up the data and the model
    data_path = data_dir  # "./datasets/tiny-imagenet/tiny-imagenet-200"
    raw_data = tinyimagenet_input.TinyImagenetData(data_path)
    global_step = tf.train.get_or_create_global_step()
    increment_global_step_op = tf.assign(global_step, global_step + 1)
    reset_global_step_op = tf.assign(global_step, 0)
    if model_type == "igamsource":
        model = ModelTinyImagenetSource(
            mode='train', dataset='tinyimagenet',
            train_batch_size=train_batch_size,
            normalize_zero_mean=normalize_zero_mean)
    else:
        model = ModelTinyImagnet(mode='train', dataset='tinyimagenet',
                                 train_batch_size=train_batch_size,
                                 normalize_zero_mean=normalize_zero_mean)

    # Setting up the optimizers
    boundaries = [int(sss[0]) for sss in step_size_schedule][1:]
    values = [sss[1] for sss in step_size_schedule]
    learning_rate = tf.train.piecewise_constant(
        tf.cast(global_step, tf.int32), boundaries, values)
    c_optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)

    t_vars = tf.trainable_variables()
    C_vars = [var for var in t_vars if 'classifier' in var.name]

    classification_c_loss = (model.mean_xent +
                             weight_decay * model.weight_decay_loss)
    total_loss = classification_c_loss

    classification_final_grads = c_optimizer.compute_gradients(
        classification_c_loss, var_list=t_vars)
    classification_no_pert_grad = [
        (tf.zeros_like(v), v) if 'perturbation' in v.name else (g, v)
        for g, v in classification_final_grads
    ]
    c_classification_min_step = c_optimizer.apply_gradients(
        classification_no_pert_grad)

    # Setting up the Tensorboard and checkpoint outputs
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    saver = tf.train.Saver(max_to_keep=1)
    tf.summary.scalar('C accuracy', model.accuracy)
    tf.summary.scalar('C xent', model.xent / train_batch_size)
    merged_summaries = tf.summary.merge_all()

    # Set up adversary
    attack = LinfPGDAttack(model, epsilon, pgd_steps, step_size,
                           random_start, loss_func, dataset=dataset)

    with tf.Session() as sess:
        print('important params >>> \n model dir: %s \n dataset: %s \n '
              'training batch size: %d \n' % (model_dir, dataset,
                                              train_batch_size))
        # initialize data augmentation
        data = tinyimagenet_input.AugmentedTinyImagenetData(raw_data, sess,
                                                            model)

        # Initialize the summary writer, global variables, and our time
        # counter.
        summary_writer = tf.summary.FileWriter(model_dir + '/train',
                                               sess.graph)
        eval_summary_writer = tf.summary.FileWriter(model_dir + '/eval')
        sess.run(tf.global_variables_initializer())

        # Main training loop
        for ii in tqdm(range(train_steps)):
            x_batch, y_batch = data.train_data.get_next_batch(
                train_batch_size, multiple_passes=True)
            if img_random_pert and not (do_advtrain and random_start and
                                        ii >= steps_before_adv_training):
                x_batch = x_batch + np.random.uniform(-epsilon, epsilon,
                                                      x_batch.shape)
                x_batch = np.clip(x_batch, 0, 255)  # valid pixel range

            labels_source_modelgrad_disc = np.ones_like(y_batch,
                                                        dtype=np.int64)
            nat_dict = {model.x_input: x_batch, model.y_input: y_batch}

            # Generate adversarial training examples
            if do_advtrain and ii >= steps_before_adv_training:
                x_batch_adv = attack.perturb(x_batch, y_batch, sess)
                train_dict = {model.x_input: x_batch_adv,
                              model.y_input: y_batch}
            else:
                train_dict = nat_dict

            # Output to stdout
            if ii % summary_steps == 0:
                train_acc, train_c_loss, summary = sess.run(
                    [model.accuracy, total_loss, merged_summaries],
                    feed_dict=train_dict)
                summary_writer.add_summary(summary, global_step.eval(sess))

                x_eval_batch, y_eval_batch = data.eval_data.get_next_batch(
                    train_batch_size, multiple_passes=True)
                if img_random_pert and not (do_advtrain and random_start):
                    x_eval_batch = x_eval_batch + np.random.uniform(
                        -epsilon, epsilon, x_eval_batch.shape)
                    x_eval_batch = np.clip(x_eval_batch, 0,
                                           255)  # valid pixel range
                labels_source_modelgrad_disc = np.ones_like(y_eval_batch,
                                                            dtype=np.int64)
                eval_nat_dict = {model.x_input: x_eval_batch,
                                 model.y_input: y_eval_batch}
                if do_advtrain:
                    x_eval_batch_adv = attack.perturb(x_eval_batch,
                                                      y_eval_batch, sess)
                    eval_dict = {model.x_input: x_eval_batch_adv,
                                 model.y_input: y_eval_batch}
                else:
                    eval_dict = eval_nat_dict
                val_acc, val_c_loss, summary = sess.run(
                    [model.accuracy, total_loss, merged_summaries],
                    feed_dict=eval_dict)
                eval_summary_writer.add_summary(summary,
                                                global_step.eval(sess))

                print('Step {}: ({})'.format(ii, datetime.now()))
                print('    training nat accuracy {:.4}% -- validation nat '
                      'accuracy {:.4}%'.format(train_acc * 100,
                                               val_acc * 100))
                print('    training nat c loss: {}'.format(train_c_loss))
                print('    validation nat c loss: {}'.format(val_c_loss))
                sys.stdout.flush()
            # Tensorboard summaries
            elif ii % out_steps == 0:
                nat_acc, nat_c_loss = sess.run([model.accuracy, total_loss],
                                               feed_dict=train_dict)
                print('Step {}: ({})'.format(ii, datetime.now()))
                print('    training nat accuracy {:.4}%'.format(
                    nat_acc * 100))
                print('    training nat c loss: {}'.format(nat_c_loss))

            # Write a checkpoint
            if (ii + 1) % checkpoint_steps == 0:
                saver.save(sess, os.path.join(model_dir, 'checkpoint'),
                           global_step=global_step)

            sess.run(c_classification_min_step, feed_dict=train_dict)
            sess.run(increment_global_step_op)

        # full test evaluation
        raw_data = tinyimagenet_input.TinyImagenetData(data_path)
        data_size = raw_data.eval_data.n
        if data_size % train_batch_size == 0:
            eval_steps = data_size // train_batch_size
        else:
            # one extra partial batch so every example is evaluated
            eval_steps = data_size // train_batch_size + 1
        total_num_correct = 0
        for ii in tqdm(range(eval_steps)):
            x_eval_batch, y_eval_batch = raw_data.eval_data.get_next_batch(
                train_batch_size, multiple_passes=False)
            eval_dict = {
                model.x_input: x_eval_batch,
                model.y_input: y_eval_batch
            }
            num_correct = sess.run(model.num_correct, feed_dict=eval_dict)
            total_num_correct += num_correct
        eval_acc = total_num_correct / data_size

        clean_eval_file_path = os.path.join(model_dir,
                                            'full_clean_eval_acc.txt')
        with open(clean_eval_file_path, "a+") as f:
            f.write("Full clean eval_acc: {}%".format(eval_acc * 100))
        print("Full clean eval_acc: {}%".format(eval_acc * 100))

        devices = sess.list_devices()
        print("sess' device names:")
        for d in devices:
            print(d.name)

    return model_dir
##
# Load the network that has not been adversarially trained
model_path_raw = '/home/zrs/Desktop/adversarial_attacks/cnn_model/model.ckpt'  # checkpoint path
load_path = saver.restore(sess, model_path_raw)

for step in range(5):
    # print(step)
    # epoch = int(epoch / 2)
    for i in range(epoch):
        p = (i / epoch)
        print(p, step)
        batch_x = train_X[i * batch_size:(i + 1) * batch_size]
        batch_y = train_Y[i * batch_size:(i + 1) * batch_size]
        batch_x_adv = attack.perturb(batch_x, batch_y, sess)

        # train on the union of clean and adversarial examples
        X_final = np.concatenate([batch_x, batch_x_adv])
        y_final = np.concatenate([batch_y, batch_y])

        lr = 0.0001 / (1. + 10 * p)**0.75
        # sess.run(model.optimizer, feed_dict={model.x_input: batch_x,
        #          model.y_input: batch_y, model.learning_rate_1: lr})
        sess.run(model.optimizer,
                 feed_dict={
                     model.x_input: X_final,
                     model.y_input: y_final,
                     model.learning_rate_1: lr
                 })
        # tensorboard summaries
        # tensorboard_num = i + epoch * step
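# The inner loop anneals the learning rate with
# lr = 0.0001 / (1 + 10p)**0.75, where p runs from 0 to 1 over an epoch
# (the annealing schedule popularized by the DANN domain-adaptation paper).
# A quick way to see the decay it produces:
import numpy as np

lr0 = 0.0001
p = np.linspace(0.0, 1.0, 5)           # fraction of the epoch completed
lr = lr0 / (1.0 + 10.0 * p) ** 0.75    # same schedule as the loop above
print(np.round(lr / lr0, 3))           # -> [1.    0.391 0.261 0.201 0.166]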
class SpatialAttack:
    def __init__(self, model, config, method=None, worstofk=None,
                 attack_limits=None, fo_epsilon=2.0, fo_step_size=2.,
                 fo_num_steps=5):
        self.model = model
        self.grid_store = []

        if config.use_linf:
            self.linf_attack = LinfPGDAttack(model, config, fo_epsilon,
                                             fo_step_size, fo_num_steps)
        else:
            self.linf_attack = None

        self.use_spatial = config.use_spatial
        if config.use_spatial:
            # Attack method
            if method is None:
                self.method = config.spatial_method
            else:
                self.method = method

            # Attack parameters
            if attack_limits is None:
                self.limits = config.spatial_limits
            else:
                self.limits = attack_limits

            if config.only_rotation:
                self.limits = [0, 0, self.limits[2]]
            if config.only_translation:
                self.limits = [self.limits[0], self.limits[1], 0]

            # Attack method parameters
            if self.method == 'grid':
                self.granularity = config.grid_granularity
            elif self.method == 'random':
                if worstofk is None:
                    self.random_tries = config.random_tries
                else:
                    self.random_tries = worstofk
            elif self.method == 'fo':
                self.fo_attack = SpatialPGDAttack(model, config, fo_epsilon,
                                                  fo_step_size, fo_num_steps)
            else:
                raise NotImplementedError

    def perturb(self, x_nat, y, sess):
        if not self.use_spatial:
            t = np.zeros([len(x_nat), 3])
            if self.linf_attack:
                x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
            else:
                x = x_nat
            return x, t
        if self.method == 'grid':
            return self.perturb_grid(x_nat, y, sess, -1)
        elif self.method == 'fo':
            return self.fo_attack.perturb(x_nat, y, sess)
        else:  # random
            return self.perturb_grid(x_nat, y, sess, self.random_tries)

    def perturb_grid(self, x_nat, y, sess, random_tries=-1):
        n = len(x_nat)
        if random_tries > 0:
            # subsampling this list from the grid is a bad idea, instead we
            # will randomize each example from the full continuous range
            grid = [(42, 42, 42) for _ in range(random_tries)]  # dummy list
        else:  # exhaustive grid
            grid = product(*list(np.linspace(-l, l, num=g)
                                 for l, g in zip(self.limits,
                                                 self.granularity)))

        worst_x = np.copy(x_nat)
        worst_t = np.zeros([n, 3])
        max_xent = np.zeros(n)
        all_correct = np.ones(n).astype(bool)

        for tx, ty, r in grid:
            if random_tries > 0:
                # randomize each example separately
                t = np.stack([np.random.uniform(-l, l, n)
                              for l in self.limits], axis=1)
            else:
                t = np.stack(list(repeat([tx, ty, r], n)))

            if self.linf_attack:
                x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
            else:
                x = x_nat

            curr_dict = {self.model.x_input: x,
                         self.model.y_input: y,
                         self.model.is_training: False,
                         self.model.transform: t}
            cur_xent, cur_correct = sess.run(
                [self.model.y_xent, self.model.correct_prediction],
                feed_dict=curr_dict)  # shape (bsize,)
            cur_xent = np.asarray(cur_xent)
            cur_correct = np.asarray(cur_correct)

            # Select indices to update: we choose the misclassified
            # transformation of maximum xent (or just the highest xent if
            # everything else is correct).
            idx = (cur_xent > max_xent) & (cur_correct == all_correct)
            idx = idx | (cur_correct < all_correct)
            max_xent = np.maximum(cur_xent, max_xent)
            all_correct = cur_correct & all_correct

            idx = np.expand_dims(idx, axis=-1)   # shape (bsize, 1)
            worst_t = np.where(idx, t, worst_t)  # shape (bsize, 3)

            idx = np.expand_dims(idx, axis=-1)
            idx = np.expand_dims(idx, axis=-1)   # shape (bsize, 1, 1, 1)
            worst_x = np.where(idx, x, worst_x)  # shape (bsize, 32, 32, 3)

        return worst_x, worst_t
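# The bookkeeping at the end of perturb_grid keeps, per example, the
# transformation that flips the prediction, breaking ties by cross-entropy.
# A small self-contained NumPy rehearsal of that selection rule (toy
# numbers, no model) to check the masking logic:
import numpy as np

n = 3
max_xent = np.array([1.0, 2.0, 0.5])
all_correct = np.array([True, False, True])

# candidate transformation: per-example xent and correctness
cur_xent = np.array([1.5, 1.0, 0.4])
cur_correct = np.array([True, False, False])

# higher xent while the correctness status is unchanged...
idx = (cur_xent > max_xent) & (cur_correct == all_correct)
# ...or any transformation that newly breaks a correct prediction
idx = idx | (cur_correct < all_correct)
print(idx)  # [ True False  True] -> examples 0 and 2 get updated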
# Initialize the summary writer, global variables, and our time counter.
summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
sess.run(tf.global_variables_initializer())
saver.restore(
    sess,
    '/home/hope-yao/Documents/mnist_challenge/models/a_very_robust_model_madry/checkpoint-99900'
)
training_time = 0.0

# Main training loop
for ii in range(max_num_training_steps):
    x_batch, y_batch = mnist.train.next_batch(batch_size)

    # Compute Adversarial Perturbations
    start = timer()
    x_batch_adv = attack.perturb(x_batch, y_batch, sess)
    end = timer()
    training_time += end - start

    nat_dict = {model.x_input: x_batch, model.y_input: y_batch}
    adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

    # Output to stdout
    if ii % num_output_steps == 0:
        nat_acc = sess.run(model.accuracy, feed_dict=nat_dict)
        adv_acc = sess.run(model.accuracy, feed_dict=adv_dict)
        print('Step {}: ({})'.format(ii, datetime.now()))
        print('    training nat accuracy {:.4}%'.format(nat_acc * 100))
        print('    training adv accuracy {:.4}%'.format(adv_acc * 100))
        if ii != 0:
if i % report_batch == 1:
    np_adv_image = []
    np_benign_image = []
    np_label = []
    np_pgd_image = []
    np_pred_normal = []
    np_detection_normal = []
    np_pred_adv = []
    np_detection_adv = []
    np_pred_pgd = []
    np_detection_pgd = []

    x_train_val, y_train_val = get_data(sess)
    # print(x_train_val[0])
    # exit()
    x_train_perturbed = pgd_attack.perturb(x_train_val, y_train_val, sess)
    fdict = {content: x_train_perturbed, label: y_train_val}
    _acc = sess.run(norm_acc, feed_dict=fdict)

    fdict = {content: x_train_val, label: y_train_val}
    grad_attack()
    x_train_style = sess.run(adv_img, feed_dict=fdict)

    print("result normal:")
    _, p_set_normal, p_det_normal = gaussdetect.detect(
        x_train_val, y_train_val, batch_size=BATCH_SIZE)
    print("result pgd:")
    _, p_set_pgd, p_det_pgd = gaussdetect.detect(
        x_train_perturbed, y_train_val, batch_size=BATCH_SIZE)
cur_ckpt = args.ckpt
with tf.Session() as sess:
    for i in range(args.atta_loop):
        x_batch = mnist.test.images[batch_start:batch_start + 500]
        y_batch = mnist.test.labels[batch_start:batch_start + 500]
        x_batch_adv = x_batch.copy()

        path = args.log_prefix + str(i + 1) + ".log"
        print(path)
        log_file = open(path, 'w')

        print(os.path.join(model_dir, "checkpoint-" + str(cur_ckpt)))
        model_ckpt = os.path.join(model_dir, "checkpoint-" + str(cur_ckpt))
        saver.restore(sess, model_ckpt)

        x_batch_adv = attack.perturb(x_batch, y_batch, sess, log_file)

        nat_dict = {model.x_input: x_batch, model.y_input: y_batch}
        adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

        nat_loss = sess.run(model.mean_xent, feed_dict=nat_dict)
        loss = sess.run(model.mean_xent, feed_dict=adv_dict)

        print("adv loss: {}".format(loss))
        print("nat-loss: {}".format(nat_loss))
        log_file.close()
        batch_start += 500
def train_model(dataset, config, plotter, adversarial, mixed):
    clear_session()

    # Set seeds
    tf.set_random_seed(config['random_seed'])
    np.random.seed(config['random_seed'])

    # Set save directory
    model_dir = ""
    if adversarial:
        if mixed:
            model_dir = config['model_dir_adv_mixed']
        else:
            model_dir = config['model_dir_adv']
    else:
        if mixed:
            model_dir = config['model_dir_mixed']
        else:
            model_dir = config['model_dir']
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Set dataset
    x_vals = dataset['X_train']
    y_vals = dataset['Y_train']

    # Get parameters
    batch_size = config['batch_size']
    weight_decay = config['weight_decay']
    C = 1.0 / (batch_size * weight_decay)
    # C = config['C']
    learning_rate = config['learning_rate']

    # Setup tensorflow objects
    svm_model = Model(batch_size, C=C)
    global_step = tf.compat.v1.train.get_or_create_global_step()
    attack = LinfPGDAttack(svm_model, config['epsilon'], config['k'],
                           config['a'], config['random_start'],
                           config['momentum'], config['beta'],
                           config['random_seed'], plotter=plotter)

    # Set optimizer for model training
    my_opt = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    train_step = my_opt.minimize(svm_model.loss)
    init = tf.initialize_all_variables()

    # Variables used during training
    X = None
    Y = None
    clean_loss_history = []
    clean_accuracy_history = []
    robust_loss_history = []
    robust_accuracy_history = []
    train_history = {}
    X_adv = None
    X_adv_save = None
    Y_save = None
    Y = None
    A = None
    b = None

    # Start tensorflow session
    with tf.Session() as sess:
        sess.run(init)

        # Training: Batch Gradient Descent
        for i in range(100):  # Orig: 100
            # Create randomly selected batch
            rand_index = np.random.choice(len(x_vals), size=batch_size)
            X = x_vals[rand_index]
            Y = np.transpose([y_vals[rand_index]])

            # In case of adversarial training we perturb the batch data
            X_adv = None
            if adversarial:
                X_adv = attack.perturb(X, Y, sess, debug=i == 40)
                X = X_adv

            # Storing batch set performance
            clean_loss = sess.run(svm_model.loss,
                                  feed_dict={svm_model.x_input: X,
                                             svm_model.y_input: Y})
            clean_acc = sess.run(svm_model.accuracy,
                                 feed_dict={svm_model.x_input: X,
                                            svm_model.y_input: Y,
                                            svm_model.prediction_grid: X})
            robust_loss = 0
            robust_acc = 0
            if adversarial:
                robust_loss = sess.run(svm_model.loss,
                                       feed_dict={svm_model.x_input: X_adv,
                                                  svm_model.y_input: Y})
                robust_acc = sess.run(
                    svm_model.accuracy,
                    feed_dict={svm_model.x_input: X_adv,
                               svm_model.y_input: Y,
                               svm_model.prediction_grid: X_adv})

            if (i + 1) % 1 == 0:
                print('\nStep #' + str(i + 1))
                print('Clean Loss = ' + str(clean_loss))
                print('Clean Accuracy = ' + str(clean_acc))
                if adversarial:
                    print('Robust Loss = ' + str(robust_loss))
                    print('Robust Accuracy = ' + str(robust_acc))

            clean_loss_history.append(str(clean_loss[0][0]))
            clean_accuracy_history.append(str(clean_acc))
            if adversarial:
                robust_loss_history.append(str(robust_loss[0][0]))
                robust_accuracy_history.append(str(robust_acc))

            # Train model
            if adversarial:
                if not mixed:
                    X_adv_save = X_adv
                    Y_save = Y
                # print(X_adv - X)
                sess.run(train_step, feed_dict={svm_model.x_input: X_adv,
                                                svm_model.y_input: Y})
            else:
                sess.run(train_step, feed_dict={svm_model.x_input: X,
                                                svm_model.y_input: Y})

            plotter.plot(sess, model=svm_model, X=X, Y=Y, train_iter=i,
                         pgd_attack=False)

        # Save model
        A = sess.run(svm_model.A)
        b = sess.run(svm_model.b)
        saver = tf.train.Saver(max_to_keep=3)
        if adversarial:
            if mixed:
                saver.save(
                    sess,
                    os.path.join(
                        config['model_dir_adv_mixed'],
                        'model_' + 'batch-size-' +
                        str(config['batch_size']) + '_C-' +
                        str(config['C']) + '_learning-rate-' +
                        str(config['learning_rate'])),
                    global_step=global_step)
            else:
                saver.save(
                    sess,
                    os.path.join(
                        config['model_dir_adv'],
                        'model_' + 'batch-size-' +
                        str(config['batch_size']) + '_C-' +
                        str(config['C']) + '_learning-rate-' +
                        str(config['learning_rate'])),
                    global_step=global_step)
        else:
            if mixed:
                saver.save(
                    sess,
                    os.path.join(
                        config['model_dir_mixed'],
                        'model_' + 'batch-size-' +
                        str(config['batch_size']) + '_C-' +
                        str(config['C']) + '_learning-rate-' +
                        str(config['learning_rate'])),
                    global_step=global_step)
            else:
                saver.save(
                    sess,
                    os.path.join(
                        config['model_dir'],
                        'model_' + 'batch-size-' +
                        str(config['batch_size']) + '_C-' +
                        str(config['C']) + '_learning-rate-' +
                        str(config['learning_rate'])),
                    global_step=global_step)

    train_history['clean loss'] = clean_loss_history
    train_history['clean accuracy'] = clean_accuracy_history
    train_history['robust loss'] = robust_loss_history
    train_history['robust accuracy'] = robust_accuracy_history
    train_history['A'] = A
    train_history['b'] = b
    # print(train_history)

    data = {'X': X_adv_save, 'Y': Y_save}
    # print(data)
    with open('gaussian_perturbed_train_test.npz', 'wb') as f:
        pickle.dump(data, f, protocol=2)

    return train_history
    targeted=False, num_classes=10, elementwise_best=True)

n_total = 0
n_correct = 0
train_adv_data = []
train_adv_labels = []

for i, (img, label) in enumerate(train_dataloader):
    # img = img.expand(img.data.shape[0], 3, 28, 28)
    batch_size = img.shape[0]
    img = img.cuda()
    label = label.cuda()

    adv_img = attacker.perturb(img, label)
    train_adv_data.extend(adv_img.cpu().numpy())
    train_adv_labels.extend(label.cpu().numpy())

    adv_output = model(input_data=adv_img)
    pred = adv_output.data.max(1, keepdim=True)[1]
    n_correct += pred.eq(label.data.view_as(pred)).cpu().sum()
    n_total += batch_size
    print('Process {}'.format(n_total))

accu = n_correct.data.numpy() * 1.0 / n_total
print('Adv acc:', accu)

adv_data_save_path_train = 'dataset/adv_mnist/train'
os.makedirs(adv_data_save_path_train, exist_ok=True)
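# attacker.perturb(img, label) above operates on CUDA tensors. A minimal
# PyTorch sketch of such an attack, assuming a standard classifier and
# cross-entropy loss; the elementwise_best option above (keep the
# per-example worst iterate across steps) is omitted for brevity.
import torch
import torch.nn.functional as F

def pgd_perturb(model, img, label, epsilon=0.3, step_size=0.01, steps=40):
    """L-inf PGD on image tensors in [0, 1]; returns detached adversarials."""
    x = img.detach() + torch.empty_like(img).uniform_(-epsilon, epsilon)
    x = x.clamp(0.0, 1.0)
    for _ in range(steps):
        x.requires_grad_(True)
        loss = F.cross_entropy(model(x), label)
        grad, = torch.autograd.grad(loss, x)
        with torch.no_grad():
            x = x + step_size * grad.sign()                  # ascent step
            x = torch.min(torch.max(x, img - epsilon), img + epsilon)
            x = x.clamp(0.0, 1.0)                            # valid pixels
    return x.detach()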
def evaluate_checkpoint(filename, weight_prune, tolerance, relu_prune,
                        relu_prune_frac):
    with tf.Session() as sess:
        # Restore the checkpoint
        saver.restore(sess, filename)
        print('restored checkpoint for {}'.format(filename))

        print('First eval - no changes')
        x_single_train = mnist.train.images[0:1, :]
        y_single_train = mnist.train.labels[0:1]
        dict_nat_single = {
            model.x_input: x_single_train,
            model.x_input_natural: x_single_train,
            model.y_input: y_single_train
        }

        # Get the variables
        c1_v = [x for x in tf.global_variables()
                if x.op.name == 'Variable'][0]
        c1_b = [x for x in tf.global_variables()
                if x.op.name == 'Variable_1'][0]
        c2_v = [x for x in tf.global_variables()
                if x.op.name == 'Variable_2'][0]
        c2_b = [x for x in tf.global_variables()
                if x.op.name == 'Variable_3'][0]
        fc_v = [x for x in tf.global_variables()
                if x.op.name == 'Variable_4'][0]
        fc_b = [x for x in tf.global_variables()
                if x.op.name == 'Variable_5'][0]
        sm_v = [x for x in tf.global_variables()
                if x.op.name == 'Variable_6'][0]
        sm_b = [x for x in tf.global_variables()
                if x.op.name == 'Variable_7'][0]

        # Save values in the final variables
        c1, c1b, c2, c2b, fc, fcb, sm, smb = sess.run(
            [c1_v, c1_b, c2_v, c2_b, fc_v, fc_b, sm_v, sm_b],
            feed_dict=dict_nat_single)

        if do_eval:
            # Iterate over the eval samples batch-by-batch
            num_batches = int(math.ceil(num_eval_examples / eval_batch_size))
            total_corr_nat = 0
            total_corr_adv = 0
            tot_unstable1n = 0
            tot_unstable2n = 0
            tot_unstable3n = 0
            for ibatch in range(num_batches):
                bstart = ibatch * eval_batch_size
                bend = min(bstart + eval_batch_size, num_eval_examples)
                x_batch = mnist.test.images[bstart:bend, :]
                y_batch = mnist.test.labels[bstart:bend]
                dict_nat = {
                    model.x_input: x_batch,
                    model.x_input_natural: x_batch,
                    model.y_input: y_batch
                }
                x_batch_adv = attack.perturb(x_batch, y_batch, sess)
                dict_adv = {
                    model.x_input: x_batch_adv,
                    model.x_input_natural: x_batch,
                    model.y_input: y_batch
                }
                cur_corr_nat = sess.run(model.num_correct,
                                        feed_dict=dict_nat)
                cur_corr_adv = sess.run(model.num_correct,
                                        feed_dict=dict_adv)
                total_corr_nat += cur_corr_nat
                total_corr_adv += cur_corr_adv
                un1n, un2n, un3n = sess.run(
                    [model.unstable1, model.unstable2, model.unstable3],
                    feed_dict=dict_nat)
                tot_unstable1n += np.sum(un1n)
                tot_unstable2n += np.sum(un2n)
                tot_unstable3n += np.sum(un3n)

            avg_un1n = tot_unstable1n / num_eval_examples
            avg_un2n = tot_unstable2n / num_eval_examples
            avg_un3n = tot_unstable3n / num_eval_examples
            acc_nat = total_corr_nat / num_eval_examples
            acc_adv = total_corr_adv / num_eval_examples
            print('natural: {:.2f}%'.format(100 * acc_nat))
            print('adversarial: {:.2f}%'.format(100 * acc_adv))
            print('  un1n, un2n, un3n: {}, {}, {}'.format(
                avg_un1n, avg_un2n, avg_un3n))

        if weight_prune:
            print('Second eval - prune small weights')
            # Hardcoded variables
            prune_small_weights([c1_v, c2_v, fc_v], sess, tolerance)
            # These are the correct values (no need to re-fix nonzeros) for
            # the masked models
            c1, c1b, c2, c2b, fc, fcb, sm, smb = sess.run(
                [c1_v, c1_b, c2_v, c2_b, fc_v, fc_b, sm_v, sm_b],
                feed_dict=dict_nat_single)

            if do_eval:
                # Iterate over the eval samples batch-by-batch
                num_batches = int(
                    math.ceil(num_eval_examples / eval_batch_size))
                total_corr_nat = 0
                total_corr_adv = 0
                tot_unstable1n = 0
                tot_unstable2n = 0
                tot_unstable3n = 0
                for ibatch in range(num_batches):
                    bstart = ibatch * eval_batch_size
                    bend = min(bstart + eval_batch_size, num_eval_examples)
                    x_batch = mnist.test.images[bstart:bend, :]
                    y_batch = mnist.test.labels[bstart:bend]
                    dict_nat = {
                        model.x_input: x_batch,
                        model.x_input_natural: x_batch,
                        model.y_input: y_batch
                    }
                    x_batch_adv = attack.perturb(x_batch, y_batch, sess)
                    dict_adv = {
                        model.x_input: x_batch_adv,
                        model.x_input_natural: x_batch,
                        model.y_input: y_batch
                    }
                    cur_corr_nat = sess.run(model.num_correct,
                                            feed_dict=dict_nat)
                    cur_corr_adv = sess.run(model.num_correct,
                                            feed_dict=dict_adv)
                    total_corr_nat += cur_corr_nat
                    total_corr_adv += cur_corr_adv
                    un1n, un2n, un3n = sess.run(
                        [model.unstable1, model.unstable2, model.unstable3],
                        feed_dict=dict_nat)
                    tot_unstable1n += np.sum(un1n)
                    tot_unstable2n += np.sum(un2n)
                    tot_unstable3n += np.sum(un3n)

                avg_un1n = tot_unstable1n / num_eval_examples
                avg_un2n = tot_unstable2n / num_eval_examples
                avg_un3n = tot_unstable3n / num_eval_examples
                acc_nat = total_corr_nat / num_eval_examples
                acc_adv = total_corr_adv / num_eval_examples
                print('natural: {:.2f}%'.format(100 * acc_nat))
                print('adversarial: {:.2f}%'.format(100 * acc_adv))
                print('  un1n, un2n, un3n: {}, {}, {}'.format(
                    avg_un1n, avg_un2n, avg_un3n))

        if relu_prune:
            print('Third eval - prune relus')
            # Get locations of where relus are equal (or close) to 0 or 55000
            h1_rc = tf.reduce_sum(tf.cast(model.h_1 > 0, tf.int32), axis=0)
            h2_rc = tf.reduce_sum(tf.cast(model.h_2 > 0, tf.int32), axis=0)
            hfc_rc = tf.reduce_sum(tf.cast(model.h_fc_pre_relu > 0,
                                           tf.int32), axis=0)

            # Iterate over the training samples batch-by-batch to do the
            # relu count
            num_training_batches = int(
                math.ceil(num_training_examples / eval_batch_size))
            # Only count relus on adv training examples, since the DNN is
            # trained on adv
            tot_rc1 = 0
            tot_rc2 = 0
            tot_rfc = 0
            for ibatch in range(num_training_batches):
                bstart = ibatch * eval_batch_size
                bend = min(bstart + eval_batch_size, num_training_examples)
                x_batch = mnist.train.images[bstart:bend, :]
                y_batch = mnist.train.labels[bstart:bend]
                x_batch_adv = attack.perturb(x_batch, y_batch, sess)
                dict_adv = {
                    model.x_input: x_batch_adv,
                    model.x_input_natural: x_batch,
                    model.y_input: y_batch
                }
                rc1_adv = sess.run(h1_rc, feed_dict=dict_adv)
                rc2_adv = sess.run(h2_rc, feed_dict=dict_adv)
                rfc_adv = sess.run(hfc_rc, feed_dict=dict_adv)
                tot_rc1 += rc1_adv
                tot_rc2 += rc2_adv
                tot_rfc += rfc_adv

            def get_ops(adv, relu_prune_frac):
                num_to_remove = int(num_training_examples * relu_prune_frac)
                assert (num_to_remove <= num_training_examples / 2 + 1)
                linear_relus = adv >= (num_training_examples - num_to_remove)
                zero_relus = adv <= num_to_remove
                ops = np.zeros(adv.shape)
                ops[linear_relus] = 1
                ops[zero_relus] = -1
                print("number of relus left: ", len(ops[ops == 0]))
                return ops

            c1_ops = get_ops(tot_rc1, relu_prune_frac)
            c2_ops = get_ops(tot_rc2, relu_prune_frac)
            fc_ops = get_ops(tot_rfc, relu_prune_frac)

            if do_eval:
                mask_model = models.MNIST_naive_ia_masked.Model(
                    config, c1_ops, c2_ops, fc_ops)
                mask_model_attack = LinfPGDAttack(mask_model,
                                                  config['epsilon'],
                                                  config['k'], config['a'],
                                                  config['random_start'],
                                                  config['loss_func'])
                print("Created masked model")

                # Copy variables over from main model
                new_c1_v = [x for x in tf.global_variables()
                            if x.op.name == 'Variable_8'][0]
                new_c1_b = [x for x in tf.global_variables()
                            if x.op.name == 'Variable_9'][0]
                new_c2_v = [x for x in tf.global_variables()
                            if x.op.name == 'Variable_10'][0]
                new_c2_b = [x for x in tf.global_variables()
                            if x.op.name == 'Variable_11'][0]
                new_fc_v = [x for x in tf.global_variables()
                            if x.op.name == 'Variable_12'][0]
                new_fc_b = [x for x in tf.global_variables()
                            if x.op.name == 'Variable_13'][0]
                new_sm_v = [x for x in tf.global_variables()
                            if x.op.name == 'Variable_14'][0]
                new_sm_b = [x for x in tf.global_variables()
                            if x.op.name == 'Variable_15'][0]
                new_c1_v.assign(c1).eval()
                new_c1_b.assign(c1b).eval()
                new_c2_v.assign(c2).eval()
                new_c2_b.assign(c2b).eval()
                new_fc_v.assign(fc).eval()
                new_fc_b.assign(fcb).eval()
                new_sm_v.assign(sm).eval()
                new_sm_b.assign(smb).eval()

                # Iterate over the eval samples batch-by-batch
                num_batches = int(
                    math.ceil(num_eval_examples / eval_batch_size))
                total_corr_nat = 0
                total_corr_adv = 0
                tot_unstable1n = 0
                tot_unstable2n = 0
                tot_unstable3n = 0
                for ibatch in range(num_batches):
                    bstart = ibatch * eval_batch_size
                    bend = min(bstart + eval_batch_size, num_eval_examples)
                    x_batch = mnist.test.images[bstart:bend, :]
                    y_batch = mnist.test.labels[bstart:bend]
                    dict_nat = {
                        mask_model.x_input: x_batch,
                        mask_model.x_input_natural: x_batch,
                        mask_model.y_input: y_batch
                    }
                    x_batch_adv = mask_model_attack.perturb(x_batch,
                                                            y_batch, sess)
                    dict_adv = {
                        mask_model.x_input: x_batch_adv,
                        mask_model.x_input_natural: x_batch,
                        mask_model.y_input: y_batch
                    }
                    cur_corr_nat = sess.run(mask_model.num_correct,
                                            feed_dict=dict_nat)
                    cur_corr_adv = sess.run(mask_model.num_correct,
                                            feed_dict=dict_adv)
                    total_corr_nat += cur_corr_nat
                    total_corr_adv += cur_corr_adv
                    un1n, un2n, un3n = sess.run(
                        [mask_model.unstable1, mask_model.unstable2,
                         mask_model.unstable3],
                        feed_dict=dict_nat)
                    tot_unstable1n += np.sum(un1n)
                    tot_unstable2n += np.sum(un2n)
                    tot_unstable3n += np.sum(un3n)

                avg_un1n = tot_unstable1n / num_eval_examples
                avg_un2n = tot_unstable2n / num_eval_examples
                avg_un3n = tot_unstable3n / num_eval_examples
                acc_nat = total_corr_nat / num_eval_examples
                acc_adv = total_corr_adv / num_eval_examples
                print('natural: {:.2f}%'.format(100 * acc_nat))
                print('adversarial: {:.2f}%'.format(100 * acc_adv))
                print('  un1n, un2n, un3n: {}, {}, {}'.format(
                    avg_un1n, avg_un2n, avg_un3n))

        new_model_weights = {
            'c1_w': c1,
            'c1_b': c1b,
            'c2_w': c2,
            'c2_b': c2b,
            'fc_w': fc,
            'fc_b': fcb,
            'sm_w': sm,
            'sm_b': smb,
        }
        if relu_prune:
            new_model_weights['c1_m'] = c1_ops
            new_model_weights['c2_m'] = c2_ops
            new_model_weights['fc_m'] = fc_ops
        return new_model_weights
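# get_ops above turns per-unit activation counts into a three-way mask:
# 1 for ReLUs active on (almost) every training example (treated as linear),
# -1 for ReLUs (almost) never active (treated as zero), and 0 for ReLUs that
# are kept. A toy run with hypothetical counts makes the thresholds concrete:
import numpy as np

num_training_examples = 100
relu_prune_frac = 0.1
num_to_remove = int(num_training_examples * relu_prune_frac)  # 10

# activation counts for five hypothetical units
adv = np.array([98, 55, 3, 100, 10])
ops = np.zeros(adv.shape)
ops[adv >= num_training_examples - num_to_remove] = 1  # effectively linear
ops[adv <= num_to_remove] = -1                         # effectively zero
print(ops)  # [ 1.  0. -1.  1. -1.]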
model_number = i + 1
path = args.log_prefix + str(model_number) + ".log"
print(path)
log_file = open(path, 'w')
log_loss = [0 for x in range(args.atta_max_step + 1)]
total_nat_loss = 0
total_adv_loss = 0

for batch_start in range(s, s + 256, 64):
    x_batch = cifar.train_data.xs[batch_start:batch_start + 64]
    y_batch = cifar.train_data.ys[batch_start:batch_start + 64]

    saver.restore(sess, model_ckpt)
    x_batch_adv = attack.perturb(x_batch, y_batch, sess, log_loss,
                                 step=args.atta_max_step)

    # nat_dict = {model.x_input: x_batch,
    #             model.y_input: y_batch}
    # adv_dict = {model.x_input: x_batch_adv,
    #             model.y_input: y_batch}
    #
    # nat_loss = sess.run(model.mean_xent, feed_dict=nat_dict)
    # loss = sess.run(model.mean_xent, feed_dict=adv_dict)
    #
    # print("adv loss: {}".format(loss))
    # print("nat-loss: {}".format(nat_loss))
    # print("per: {}%".format(loss / nat_loss * 100))

for ii in range(args.atta_max_step):
                       global_step=global_step)

# saver_pretrained = tf.train.Saver(var_list=[v for v in
#     tf.trainable_variables() if v.name in ['Variable_8:0', 'Variable_9:0']])
# saver_pretrained.restore(sess,
#     './models/pretrained_robust_model/95000/checkpoint_0-95000')
# saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())
# saver.restore(sess, '/home/hope-yao/Documents/mnist_challenge_voting/denoiser')
training_time = 0.0

for ii in range(5000):
    x_batch, y_batch = mnist.train.next_batch(batch_size)
    nat_dict = {model.x_input: x_batch, model.y_input: y_batch}

    if 0:  # adversarial branch currently disabled
        # Compute Adversarial Perturbations
        start = timer()
        x_batch_adv = attack.perturb(x_batch, y_batch, sess)
        end = timer()
        training_time += end - start
        adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

    sess.run(pre_train_step, feed_dict=nat_dict)

    if ii % 100 == 0:
        x_batch, y_batch = mnist.test.next_batch(batch_size)
        nat_dict = {model.x_input: x_batch, model.y_input: y_batch}

        # Output to stdout
        nat_acc = sess.run(model.accuracy, feed_dict=nat_dict)
        # adv_acc = sess.run(model.accuracy, feed_dict=adv_dict)
        print('Step {}: ({})'.format(ii, datetime.now()))
        print('    test nat accuracy {:.4}%'.format(nat_acc * 100))
        # print('    test adv accuracy {:.4}%'.format(adv_acc * 100))
y_batch = np.eye(num_classes)[y_batch]  # one-hot coding

# Compute Adversarial Perturbations
start = timer()
if config['AVmixup']:
    x_batch_adv, y_batch_adv = attack.perturb_avmixup(
        x_batch, y_batch, config['gamma'], config['lambda1'],
        config['lambda2'], sess, is_training=True)
else:
    x_batch_adv = attack.perturb(x_batch, y_batch, sess, is_training=True)
    y_batch_adv = y_batch
end = timer()
training_time += end - start

nat_dict = {
    model.x_input: x_batch,
    model.is_training: True,
    model.y_input: y_batch
}
adv_dict = {
    model.x_input: x_batch_adv,
    model.is_training: True,
    model.y_input: y_batch_adv
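# perturb_avmixup returns both interpolated inputs and soft labels. A
# minimal NumPy sketch of the AVmixup recipe (Lee et al., 2020) under common
# assumptions: the adversarial vertex is the PGD perturbation scaled by
# gamma, a per-example interpolation weight is drawn uniformly, and labels
# are smoothed with lambda1 on the clean side and lambda2 on the adversarial
# side. The helper names here are hypothetical, not the attack's actual API.
import numpy as np

def label_smooth(y_onehot, lam):
    """True class keeps weight lam; the rest is spread uniformly."""
    k = y_onehot.shape[1]
    return y_onehot * lam + (1.0 - y_onehot) * (1.0 - lam) / (k - 1)

def avmixup(x, y_onehot, delta, gamma, lambda1, lambda2, rng):
    """x: clean batch, delta: PGD perturbation, gamma: vertex scale."""
    x_av = x + gamma * delta                      # adversarial vertex
    w = rng.uniform(size=(x.shape[0], 1, 1, 1))   # per-example weight
    x_mix = w * x + (1.0 - w) * x_av
    w_lab = w.reshape(-1, 1)
    y_mix = (w_lab * label_smooth(y_onehot, lambda1) +
             (1.0 - w_lab) * label_smooth(y_onehot, lambda2))
    return x_mix, y_mix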
class SpatialAttack:
    def __init__(self, model, config):
        self.model = model
        self.grid_store = []

        if config.use_linf:
            self.linf_attack = LinfPGDAttack(model, config)
        else:
            self.linf_attack = None

        self.use_spatial = config.use_spatial
        self.attack_method = config.attack_method
        if config.use_spatial:
            self.method = config.spatial_method
            self.limits = config.spatial_limits

            if self.method == 'grid':
                self.granularity = config.grid_granularity
            elif self.method == 'random':
                self.random_tries = config.random_tries
            elif self.method == 'max':
                self.random_tries = config.random_tries

    def perturb(self, x_nat, y, sess):
        if not self.use_spatial:
            t = np.zeros([len(x_nat), 3])
            if self.linf_attack:
                x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
            else:
                x = x_nat
            return x, t
        if self.method == 'grid':
            return self.perturb_grid(x_nat, y, sess, -1)
        else:  # random
            return self.perturb_grid(x_nat, y, sess, self.random_tries)

    def perturb_grid(self, x_nat, y, sess, random_tries=-1):
        n = len(x_nat)
        if random_tries > 0:
            # subsampling this list from the grid is a bad idea, instead we
            # will randomize each example from the full continuous range
            grid = [(42, 42, 42) for _ in range(random_tries)]  # dummy list
        else:  # exhaustive grid
            grid = product(*list(np.linspace(-l, l, num=g)
                                 for l, g in zip(self.limits,
                                                 self.granularity)))

        worst_x = np.copy(x_nat)
        worst_t = np.zeros([n, 3])
        max_xent = np.zeros(n)
        all_correct = np.ones(n).astype(bool)

        for tx, ty, r in grid:
            if random_tries > 0:
                if self.method == 'max':
                    # In config, specify limits as [0 0 90] for 0 translation
                    # but 90 rotation (either 0 or 90 is selected, nothing in
                    # between)
                    t = np.stack([np.random.randint(0, 1 + 1, n) * l
                                  for l in self.limits], axis=1)
                else:
                    # Allows setting spatial limits in different ways:
                    #   limits = [3,3,30]          - original [low, high) per
                    #                                element
                    #   limits = [[-3,3],[0,3],[20,30]] - within range
                    #   limits = [3,[3],[20,30]]   - mix; if list_len == 1,
                    #                                do the original
                    temp = []
                    for l in self.limits:
                        if isinstance(l, list):
                            if len(l) == 2:
                                temp.append(np.random.uniform(l[0], l[1], n))
                            elif len(l) == 1:
                                temp.append(np.random.uniform(-l[0], l[0],
                                                              n))
                            else:
                                raise ValueError
                        else:
                            temp.append(np.random.uniform(-l, l, n))
                    t = np.stack(temp, axis=1)
            else:
                t = np.stack(list(repeat([tx, ty, r], n)))

            if self.linf_attack:
                x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
            else:
                if self.attack_method == 'invert':
                    # IPython.embed()
                    x = v_invert_image(x_nat)
                elif self.attack_method == 'edge':
                    x = canny_image(x_nat)
                else:
                    x = x_nat

            curr_dict = {self.model.x_input: x,
                         self.model.y_input: y,
                         self.model.is_training: False,
                         self.model.transform: t}
            cur_xent, cur_correct = sess.run(
                [self.model.y_xent, self.model.correct_prediction],
                feed_dict=curr_dict)  # shape (bsize,)
            cur_xent = np.asarray(cur_xent)
            cur_correct = np.asarray(cur_correct)

            # Select indices to update: we choose the misclassified
            # transformation of maximum xent (or just the highest xent if
            # everything else is correct).
            idx = (cur_xent > max_xent) & (cur_correct == all_correct)
            idx = idx | (cur_correct < all_correct)
            max_xent = np.maximum(cur_xent, max_xent)
            all_correct = cur_correct & all_correct

            idx = np.expand_dims(idx, axis=-1)   # shape (bsize, 1)
            worst_t = np.where(idx, t, worst_t)  # shape (bsize, 3)

            idx = np.expand_dims(idx, axis=-1)
            idx = np.expand_dims(idx, axis=-1)   # shape (bsize, 1, 1, 1)
            worst_x = np.where(idx, x, worst_x)  # shape (bsize, 32, 32, 3)

        return worst_x, worst_t
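# This variant accepts mixed limit formats. A quick NumPy demonstration of
# what each spelling samples (toy n = 4; column order is tx, ty, rotation):
import numpy as np

n = 4
limits = [3, [0, 3], [30]]  # scalar, explicit range, symmetric one-element
cols = []
for l in limits:
    if isinstance(l, list):
        lo, hi = (l[0], l[1]) if len(l) == 2 else (-l[0], l[0])
        cols.append(np.random.uniform(lo, hi, n))
    else:
        cols.append(np.random.uniform(-l, l, n))
t = np.stack(cols, axis=1)
print(t.shape)  # (4, 3): one (tx, ty, rot) triple per example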
summary_writer_eval = tf.summary.FileWriter(eval_dir)
sess.run(tf.global_variables_initializer())
# checkpoint = tf.train.latest_checkpoint(model_dir)
# saver.restore(sess, checkpoint)
training_time = 0.0

# Main training loop
for ii in range(max_num_training_steps + 1):
    x_batch, y_batch = training_data.get_next_batch(batch_size,
                                                    multiple_passes=True)

    # Compute Adversarial Perturbations
    start = timer()
    if adv_training:
        x_batch_adv = attack.perturb(x_batch, y_batch, sess,
                                     ii / max_num_training_steps)
    else:
        x_batch_adv = x_batch
    end = timer()
    training_time += end - start

    nat_dict = {
        model.x_input: x_batch,
        model.x_input_natural: x_batch,
        model.y_input: y_batch
    }
    adv_dict = {
        model.x_input: x_batch_adv,
        model.x_input_natural: x_batch,
        model.y_input: y_batch
def main(cfg):
    img_size = cfg['img_size']
    batch_size = cfg['batch_size']
    num_glimpse = cfg['num_glimpse']
    glimpse_size = cfg['glimpse_size']
    lr = cfg['lr']

    input_images = tf.placeholder(tf.float32,
                                  shape=(batch_size, img_size, img_size, 1))
    input_label = tf.placeholder(tf.int64, shape=(batch_size))

    # build classifier
    # model = Model_att(input_images, input_label, glimpse_size, num_glimpse)
    # model = Model_madry(input_images, input_label)
    model = Model_crop(input_images, input_label)

    # setup attacker
    attack = LinfPGDAttack(model, epsilon=0.3, k=40, a=0.01,
                           random_start=True, loss_func='xent')

    ## OPTIMIZER ##
    learning_rate = tf.Variable(lr)  # learning rate for optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.5)
    grads = optimizer.compute_gradients(model.xent)
    train_op = optimizer.apply_gradients(grads)
    saver = tf.train.Saver()

    ## training starts ##
    FLAGS = tf.app.flags.FLAGS
    tfconfig = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=True,
    )
    tfconfig.gpu_options.allow_growth = True
    sess = tf.Session(config=tfconfig)
    init = tf.global_variables_initializer()
    sess.run(init)

    mnist = input_data.read_data_sets('MNIST_data', one_hot=False)
    hist = {
        'train_acc': [],
        'train_adv_acc': [],
        'test_acc': [],
        'test_adv_acc': [],
        'train_loss': [],
        'test_loss': [],
        'train_adv_loss': [],
        'test_adv_loss': []
    }

    train_iters = 500000
    for itr in tqdm(range(train_iters)):
        x_batch_train, y_batch_train = mnist.train.next_batch(batch_size)
        if 1:  # adv train
            x_batch_train_adv = attack.perturb(
                x_batch_train.reshape(batch_size, img_size, img_size, 1),
                y_batch_train, sess)
            adv_dict_train = {
                input_images: x_batch_train_adv.reshape(
                    batch_size, img_size, img_size, 1),
                input_label: y_batch_train
            }
            nat_dict_train = {
                input_images: x_batch_train.reshape(batch_size, img_size,
                                                    img_size, 1),
                input_label: y_batch_train
            }
            sess.run(train_op, feed_dict=adv_dict_train)
        else:  # nat train
            nat_dict_train = {
                input_images: x_batch_train.reshape(batch_size, img_size,
                                                    img_size, 1),
                input_label: y_batch_train
            }
            sess.run(train_op, feed_dict=nat_dict_train)

        if itr % 100 == 0:
            y_pred, train_loss_i = sess.run([model.y_pred, model.xent],
                                            feed_dict=nat_dict_train)
            counts = np.asarray([np.argmax(np.bincount(y_pred[:, i]))
                                 for i in range(batch_size)])
            train_acc_i = np.mean(counts == nat_dict_train[input_label])

            x_batch_test, y_batch_test = mnist.test.next_batch(batch_size)
            nat_dict_test = {
                input_images: x_batch_test.reshape(batch_size, img_size,
                                                   img_size, 1),
                input_label: y_batch_test
            }
            y_pred, test_loss_i = sess.run([model.y_pred, model.xent],
                                           feed_dict=nat_dict_test)
            counts = np.asarray([np.argmax(np.bincount(y_pred[:, i]))
                                 for i in range(batch_size)])
            test_acc_i = np.mean(counts == nat_dict_test[input_label])
            print("iter: {}, train_acc:{} test_acc:{} train_loss:{} "
                  "test_loss:{} ".format(itr, train_acc_i, test_acc_i,
                                         train_loss_i, test_loss_i))

            x_batch_train_adv = attack.perturb(
                x_batch_train.reshape(batch_size, img_size, img_size, 1),
                y_batch_train, sess)
            adv_dict_train = {
                input_images: x_batch_train_adv.reshape(
                    batch_size, img_size, img_size, 1),
                input_label: y_batch_train
            }
            y_pred, train_adv_loss_i = sess.run([model.y_pred, model.xent],
                                                feed_dict=adv_dict_train)
            counts = np.asarray([np.argmax(np.bincount(y_pred[:, i]))
                                 for i in range(batch_size)])
            train_adv_acc_i = np.mean(counts == adv_dict_train[input_label])

            x_batch_test_adv = attack.perturb(
                x_batch_test.reshape(batch_size, img_size, img_size, 1),
                y_batch_test, sess)
            adv_dict_test = {
                input_images: x_batch_test_adv.reshape(
                    batch_size, img_size, img_size, 1),
                input_label: y_batch_test
            }
            y_pred, test_adv_loss_i = sess.run([model.y_pred, model.xent],
                                               feed_dict=adv_dict_test)
            counts = np.asarray([np.argmax(np.bincount(y_pred[:, i]))
                                 for i in range(batch_size)])
            test_adv_acc_i = np.mean(counts == adv_dict_test[input_label])
            print("iter: {}, train_adv_acc:{} test_adv_acc:{} "
                  "train_adv_loss:{} test_adv_loss:{} ".format(
                      itr, train_adv_acc_i, test_adv_acc_i,
                      train_adv_loss_i, test_adv_loss_i))

            hist['train_acc'] += [train_acc_i]
            hist['train_adv_acc'] += [train_adv_acc_i]
            hist['test_acc'] += [test_acc_i]
            hist['test_adv_acc'] += [test_adv_acc_i]
            hist['train_loss'] += [train_loss_i]
            hist['test_loss'] += [test_loss_i]
            hist['train_adv_loss'] += [train_adv_loss_i]
            hist['test_adv_loss'] += [test_adv_loss_i]
            np.save('hist', hist)
            saver.save(sess, 'crop_ckpt')

    print('done')
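# Model_crop emits one prediction per crop, so accuracy above is computed by
# a per-example majority vote over y_pred[:, i]. A tiny standalone rehearsal
# of that aggregation:
import numpy as np

# y_pred has shape (num_crops, batch_size): one vote per crop per example
y_pred = np.array([[3, 7, 1],
                   [3, 2, 1],
                   [5, 7, 1]])
counts = np.asarray([np.argmax(np.bincount(y_pred[:, i]))
                     for i in range(y_pred.shape[1])])
print(counts)  # [3 7 1] -> the most common vote per column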
saver.restore(sess, model_path)

total_nat_corr = 0
total_adv_corr = 0
nat_acc = 0
adv_acc = 0
# print(cifar.eval_data.xs.shape)
for batch_start in range(0, data_size, batch_size):
    # print(batch_start)
    batch_end = min(batch_start + batch_size, data_size)
    # size = batch_end - batch_start
    # print(size)
    x_batch = cifar.eval_data.xs[batch_start:batch_end]
    y_batch = cifar.eval_data.ys[batch_start:batch_end]
    # x_batch, y_batch = cifar.eval_data.get_next_batch(batch_size,
    #                                                   multiple_passes=True)
    x_batch_adv = attack.perturb(x_batch, y_batch, sess, step=100)
    batch_s = x_batch.shape[0]
    # print(batch_s)

    nat_dict = {model.x_input: x_batch, model.y_input: y_batch}
    adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}

    nat_corr = sess.run(model.num_correct, feed_dict=nat_dict)
    adv_corr = sess.run(model.num_correct, feed_dict=adv_dict)
    print("batch nat corr: {}, adv corr: {}".format(nat_corr, adv_corr))

    total_nat_corr += nat_corr
    total_adv_corr += adv_corr

nat_acc = total_nat_corr / data_size
    os.makedirs(model_dir)
shutil.copy('config.json', model_dir)

training_time = 0.0
for epoch in range(max_num_training_steps):
    print("Epoch: {}".format(epoch))
    running_loss = 0.0
    for data in tqdm(trainloader):
        inputs, labels = data
        inputs, labels = Variable(inputs), Variable(labels)
        optimizer.zero_grad()

        # Compute Adversarial Perturbations
        start = timer()
        x_adv = attack.perturb(inputs.data.numpy(), labels.data.numpy())
        x_adv_v = Variable(torch.FloatTensor(x_adv))
        end = timer()
        training_time += end - start

        # adversarial model: train on the perturbed batch
        outputs = net(x_adv_v)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.data[0]  # pre-0.4 PyTorch; loss.item() today

        # natural baseline: train on the clean batch
        natural_outputs = natural_net(inputs)
        loss = criterion(natural_outputs, labels)
        loss.backward()
        natural_optimizer.step()
shutil.copy('config.json', model_dir)

with tf.Session() as sess:
    # Initialize the summary writer, global variables, and our time counter.
    summary_writer = tf.summary.FileWriter(model_dir, sess.graph)
    sess.run(tf.global_variables_initializer())
    training_time = 0.0

    # Main training loop
    for ii in range(max_num_training_steps):
        x_batch, y_batch = mnist.train.next_batch(batch_size)

        # Compute Adversarial Perturbations
        start = timer()
        x_batch_adv = attack.perturb(x_batch, y_batch, sess)
        sp_x_batch_adv = sp_attack.perturb(x_batch, y_batch, sess)
        convention_adv_test = attack.perturb(x_batch, y_batch, sess, False)
        spatial_adv_test = sp_attack.perturb(x_batch, y_batch, sess, False)
        end = timer()
        training_time += end - start

        nat_dict = {model.x_input: x_batch, model.y_input: y_batch}
        adv_dict = {model.x_input: x_batch_adv, model.y_input: y_batch}
        sp_nat_dict = {sp_model.x_input: x_batch,
                       sp_model.y_input: y_batch}
        sp_adv_dict = {
class SpatialAttack:
    def __init__(self, model, config, method=None, worstofk=None,
                 attack_limits=None, fo_epsilon=2.0, fo_step_size=2.,
                 fo_num_steps=5):
        self.model = model
        self.grid_store = []

        if config.use_linf:
            self.linf_attack = LinfPGDAttack(model, config, fo_epsilon,
                                             fo_step_size, fo_num_steps)
        else:
            self.linf_attack = None

        self.use_spatial = config.use_spatial
        if config.use_spatial:
            # Attack method
            if method is None:
                self.method = config.spatial_method
            else:
                self.method = method

            # Attack parameters
            if attack_limits is None:
                self.limits = config.spatial_limits
            else:
                self.limits = attack_limits

            if config.only_rotation:
                self.limits = [0, 0, self.limits[2]]
            if config.only_translation:
                self.limits = [self.limits[0], self.limits[1], 0]

            # Attack method parameters
            if self.method == 'grid':
                self.granularity = config.grid_granularity
            elif self.method == 'random':
                if worstofk is None:
                    self.random_tries = config.random_tries
                else:
                    self.random_tries = worstofk
            elif self.method == 'fo':
                self.fo_attack = SpatialPGDAttack(model, config, fo_epsilon,
                                                  fo_step_size, fo_num_steps)
            else:
                raise NotImplementedError

    def perturb(self, x_nat, y, max_func, sess):
        if not self.use_spatial:
            t = np.zeros([len(x_nat), 3])
            if self.linf_attack:
                x = self.linf_attack.perturb(x_nat, y, sess, trans=t)
            else:
                x = x_nat
            return x, t
        if self.method == 'grid':
            # pass max_func through so the grid search maximizes the same
            # objective as the random search
            return self.perturb_grid(x_nat, y, max_func, sess, -1)
        elif self.method == 'fo':
            return self.fo_attack.perturb(x_nat, y, sess)
        else:  # random
            return self.perturb_grid(x_nat, y, max_func, sess,
                                     self.random_tries)

    def perturb_grid(self, x_nat, y, max_func, sess, random_tries=-1):
        n = len(x_nat)
        if random_tries > 0:
            # subsampling this list from the grid is a bad idea, instead we
            # will randomize each example from the full continuous range
            grid = [(42, 42, 42) for _ in range(random_tries)]  # dummy list
        else:  # exhaustive grid
            grid = product(*list(np.linspace(-l, l, num=g)
                                 for l, g in zip(self.limits,
                                                 self.granularity)))

        worst_x = np.copy(x_nat)
        worst_t = np.zeros([n, 3])
        k = 0

        if self.linf_attack:
            raise NotImplementedError
        else:
            x = x_nat

        no_op = np.zeros([n, 3])
        # computing pre_softmax of f(x); notice f(x) is not the true label y
        if max_func == "cce":
            pass
        else:
            nat_dict = {self.model.x_input: x,
                        self.model.y_input: y,
                        self.model.is_training: False,
                        self.model.transform: no_op}
            f_x_nat_presoftmax = sess.run(self.model.pre_softmax,
                                          feed_dict=nat_dict)

        for tx, ty, r in grid:
            if random_tries > 0:
                # randomize each example separately
                t = np.stack([np.random.uniform(-l, l, n)
                              for l in self.limits], axis=1)
            else:
                t = np.stack(list(repeat([tx, ty, r], n)))

            adv_dict = {self.model.x_input: x,
                        self.model.y_input: y,
                        self.model.is_training: False,
                        self.model.transform: t}
            if max_func == "cce":
                # maximize w.r.t. the cce, not the regularizer
                adv_loss = sess.run(self.model.y_xent,
                                    feed_dict=adv_dict)  # shape (bsize,)
            elif max_func == "l2":
                # maximize w.r.t. the regularizer
                f_x_adv_presoftmax = sess.run(self.model.pre_softmax,
                                              feed_dict=adv_dict)
                adv_loss = self.l2_reg_loss(
                    f_x_nat_presoftmax, f_x_adv_presoftmax)  # shape (bsize,)
            elif max_func == "kl":
                f_x_adv_presoftmax = sess.run(self.model.pre_softmax,
                                              feed_dict=adv_dict)
                adv_loss = self.kl_reg_loss(
                    f_x_nat_presoftmax, f_x_adv_presoftmax)  # shape (bsize,)
            else:
                raise NotImplementedError

            adv_loss = np.asarray(adv_loss)

            # update indices if adv_loss is larger than the previous
            # max_adv_loss
            if k == 0:  # in the first iteration update all
                idx = np.ones(n).astype(bool)
            else:
                idx = adv_loss > max_adv_loss
            idx = np.expand_dims(idx, axis=-1)   # shape (bsize, 1)

            if k == 0:
                max_adv_loss = adv_loss
            else:
                max_adv_loss = np.maximum(adv_loss, max_adv_loss)
            worst_t = np.where(idx, t, worst_t)  # shape (bsize, 3)

            idx = np.expand_dims(idx, axis=-1)
            idx = np.expand_dims(idx, axis=-1)   # shape (bsize, 1, 1, 1)
            worst_x = np.where(idx, x, worst_x)  # shape (bsize, 32, 32, 3)
            k += 1

        return worst_x, worst_t

    def l2_reg_loss(self, dist_a, dist_b):
        assert dist_a.shape == dist_b.shape
        return np.sum(np.square(dist_a - dist_b), axis=1)

    # pass the pre-softmax logits in
    def kl_reg_loss(self, dist_nat, dist_adv):
        assert dist_nat.shape == dist_adv.shape
        # compute the KL divergence of f(x) and f(x')
        epsilon = np.zeros(dist_nat.shape)
        epsilon.fill(1e-08)
        prob_adv = scipy.special.softmax(dist_adv, axis=1) + epsilon
        prob_nat = scipy.special.softmax(dist_nat, axis=1) + epsilon
        # scipy.stats.entropy computes the KL divergence when given two
        # distributions (despite its name)
        return scipy.stats.entropy(np.transpose(prob_nat),
                                   np.transpose(prob_adv))
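# scipy.stats.entropy(p, q) with two arguments returns KL(p || q) computed
# columnwise, which is why kl_reg_loss transposes both probability matrices
# to shape (classes, batch). A quick sanity check against the definition:
import numpy as np
from scipy.stats import entropy

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.5, 0.3, 0.2])
kl_scipy = entropy(p, q)
kl_manual = np.sum(p * np.log(p / q))
print(np.isclose(kl_scipy, kl_manual))  # True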