def cw_attack(model, dataloader, mean, std):
    """Run Carlini-Wagner L2 attacks (untargeted, then targeted) on batches
    drawn from `dataloader`.

    Args:
        model: the classifier under attack.
        dataloader: iterable yielding (inputs, targets) batches.
        mean, std: per-channel normalization statistics; used to derive the
            valid input box in normalized space.

    Returns:
        Adversarial examples produced by the targeted attack.
    """
    # Valid input range after (x - mean) / std normalization.
    inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
                  max((1 - m) / s for m, s in zip(mean, std)))

    # An untargeted adversary.
    adversary = cw.L2Adversary(targeted=False,
                               confidence=0.0,
                               search_steps=10,
                               box=inputs_box,
                               optimizer_lr=5e-4)
    inputs, targets = next(iter(dataloader))
    adversarial_examples = adversary(model, inputs, targets, to_numpy=False)

    # A targeted adversary.
    adversary = cw.L2Adversary(targeted=True,
                               confidence=0.0,
                               search_steps=10,
                               box=inputs_box,
                               optimizer_lr=5e-4)
    inputs, _ = next(iter(dataloader))
    # A batch of attack targets: every sample is pushed toward class 3.
    # BUG FIX: cast to long — class indices must be an integer tensor.
    attack_targets = (torch.ones(inputs.size(0)) * 3).long()
    # BUG FIX: the original referenced the undefined global `net`; use the
    # `model` parameter that was passed in.
    adversarial_examples = adversary(model, inputs, attack_targets, to_numpy=False)
    return adversarial_examples
        # (Fragment: tail of a Dataset __getitem__ whose def lies before this chunk.)
        return img, label

    def __len__(self):
        # Dataset size = number of image paths collected elsewhere in the class.
        return len(self.imgpath_list)


# --- Script: evaluate CW L2 attacks for every network in net_dict at each
# confidence level, creating an output directory per (net, confidence) pair.
testloader = torch.utils.data.DataLoader(testdata(), batch_size=bs, shuffle=False, num_workers=1)
# Valid input range after (x - mean) / std normalization.
inputs_box = (min((0 - m) / s for m, s in zip(mean, std)), max((1 - m) / s for m, s in zip(mean, std)))
for k in net_dict.keys():
    print(k)
    net = net_dict[k]
    for c in confidences:
        # Output directory for this network/confidence combination.
        os.mkdir('CW/' + k + '_' + str(int(c)))
        # Untargeted CW L2 adversary at confidence c.
        adversary = cw.L2Adversary(targeted=False, confidence=c, search_steps=30, box=inputs_box, optimizer_lr=5e-4)
        total = 0
        correct = 0
        count = 0
        norm = np.array([])  # accumulator for perturbation norms
        for data in testloader:
            inputs, targets = data
            inputs = inputs.cuda()
            targets = targets.type(torch.LongTensor).cuda()
            # Craft adversarial examples for this batch.
            adversarial_examples = adversary(net, inputs, targets, to_numpy=False)
            # Predictions on clean and adversarial inputs.
            outputs_norm = net(inputs)
            outputs_ae = net(adversarial_examples)
            _, labels_norm = outputs_norm.max(1)
            _, labels_ae = outputs_ae.max(1)
            # (Fragment: the loop body continues beyond this chunk.)
def test_robust(opt, model, classifier, attack_method, c, norm=None):
    """Measure robust accuracy of `classifier` under the attack named by
    `opt.attack` at strength `c`.

    Robust accuracy is computed only over samples the classifier predicts
    correctly on clean inputs (the `common_id` subset of each batch).

    Args:
        opt: options object; uses opt.attack (attack name) and opt.bs (batch size).
        model: torch model, passed to the CW adversary directly.
        classifier: ART-style wrapper exposing .predict().
        attack_method: unused here; attack selection comes from opt.attack.
        c: attack strength (eps for FGSM/PGD/BIM, gamma for JSMA, c-range lower
            bound for CW).
        norm: Lp norm parameter forwarded to the gradient-based ART attacks.

    Raises:
        ValueError: if opt.attack is not one of the supported names.
    """
    adv_crafter = None
    if opt.attack == 'FGSM':
        adv_crafter = FastGradientMethod(classifier, norm=norm, eps=c, targeted=False, num_random_init=0, batch_size=opt.bs)
    if opt.attack == 'PGD':
        adv_crafter = ProjectedGradientDescent(classifier, norm=norm, eps=c, eps_step=c / 10., max_iter=10, targeted=False, num_random_init=1, batch_size=opt.bs)
    if opt.attack == 'BIM':
        # BUG FIX: the original passed the undefined global `bs`; use opt.bs
        # like every other branch.
        adv_crafter = ProjectedGradientDescent(classifier, norm=norm, eps=c, eps_step=c / 10., max_iter=10, targeted=False, num_random_init=0, batch_size=opt.bs)
    if opt.attack == 'JSMA':
        adv_crafter = SaliencyMapMethod(classifier, theta=0.1, gamma=c, batch_size=opt.bs)
    if opt.attack == 'CW':
        adv_crafter = cw.L2Adversary(targeted=False, confidence=0.01, c_range=(c, 1e10), max_steps=1000, abort_early=False, search_steps=5, box=(0., 1.0), optimizer_lr=0.01)
    if adv_crafter is None:
        # Previously an unknown name fell through to a NameError later on.
        raise ValueError('unsupported attack: %s' % opt.attack)

    correct = 0
    total = 0
    total_sum = 0
    common_id = []
    for batch_idx, (inputs, targets) in enumerate(testloader):
        inputs, targets = inputs.cuda(), targets.cuda()
        # Clean predictions: only originally-correct samples count toward
        # robust accuracy.
        output = classifier.predict(inputs.cpu().numpy(), batch_size=opt.bs)
        output = torch.tensor(output).cuda()
        init_pred = output.max(1, keepdim=False)[1]
        common_id = np.where(init_pred.cpu().numpy() == targets.cpu().numpy())[0]
        # Craft adversarial examples. The CW adversary is a callable that
        # takes tensors; the ART crafters expose .generate() on numpy arrays.
        if opt.attack == 'CW':
            x_test_adv = adv_crafter(model, inputs, targets, to_numpy=True)
        else:
            x_test_adv = adv_crafter.generate(x=inputs.cpu().numpy())
        perturbed_output = classifier.predict(x_test_adv)
        perturbed_output = torch.tensor(perturbed_output).cuda()
        final_pred = perturbed_output.max(1, keepdim=False)[1]
        total_sum += targets.size(0)
        total += len(common_id)
        correct += final_pred[common_id].eq(targets[common_id].data).cpu().sum()
        # BUG FIX: guard against division by zero when no clean sample in
        # the batches seen so far was classified correctly.
        attack_acc = 100. * float(correct) / total if total else 0.0
        progress.progress_bar(
            batch_idx, len(testloader),
            'Attack Strength:%.3f, robust accuracy: %.3f%% (%d/%d)'
            '' % (c, attack_acc, correct, total))
def attack(model, criterion, img, label, eps, attack_type, iters, mean, std, dataset='mnist', black_box=False):
    """Craft adversarial examples for `img` with a gradient attack or CW.

    Supported attack_type values: 'fgsm' (single step), 'pgd' (projected,
    fixed 2/255 step), 'mim' (momentum), 'cw' (Carlini-Wagner L2), or any
    other name for plain iterative FGSM with step eps/iters.

    Args:
        model: white-box models return (logits, _, _); black-box models
            return logits only (selected by `black_box`).
        criterion: classification loss used to obtain input gradients.
        img: clean input batch in [0, 1].
        label: ground-truth labels.
        eps: L-inf budget (ignored by 'cw').
        attack_type: attack name, see above.
        iters: gradient steps (used as CW search_steps for 'cw').
        mean, std: per-channel normalization stats applied before the model.
        dataset: 'mnist'/'fmnist' select single-channel handling.
        black_box: selects the model's output signature.

    Returns:
        Detached adversarial batch clamped to [0, 1] (CW output for 'cw').
    """
    adv = img.clone()
    adv.requires_grad = True

    # The single-channel (mnist/fmnist) and color paths run the same loop;
    # they differ only in the MIM normalization dims and the CW optimizer
    # learning rate, so parameterize instead of duplicating the loop.
    single_channel = dataset == 'mnist' or dataset == 'fmnist'
    mim_dims = (1,) if single_channel else (1, 2, 3)
    cw_lr = 5e-4 if single_channel else 1e-2

    iterations = 1 if attack_type in ('fgsm', 'cw') else iters
    step = 2 / 255 if attack_type == 'pgd' else eps / iterations

    noise = 0
    for _ in range(iterations):
        if not black_box:
            out_adv, _, _ = model(normalize(adv.clone(), mean, std, dataset))
        else:
            out_adv = model(normalize(adv.clone(), mean, std, dataset))
        loss = criterion(out_adv, label)
        loss.backward()

        if attack_type == 'mim':
            # Momentum update: normalize the gradient by its mean absolute
            # value over the channel (and, for color inputs, spatial) dims.
            adv_mean = torch.mean(torch.abs(adv.grad), dim=mim_dims, keepdim=True)
            adv.grad = adv.grad / adv_mean
            noise = noise + adv.grad
        else:
            noise = adv.grad

        # Optimization step
        adv.data = adv.data + step * noise.sign()
        if attack_type == 'pgd':
            # Project back into the eps L-inf ball around the clean image.
            adv.data = torch.where(adv.data > img.data + eps, img.data + eps, adv.data)
            adv.data = torch.where(adv.data < img.data - eps, img.data - eps, adv.data)
        adv.data.clamp_(0.0, 1.0)
        # NOTE(review): adv.grad is not zeroed between iterations, so
        # gradients accumulate across steps — behavior preserved from the
        # original; confirm this is intentional for pgd/mim.

    if attack_type == 'cw':
        # Valid input range after (x - mean) / std normalization.
        inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
                      max((1 - m) / s for m, s in zip(mean, std)))
        # An untargeted adversary.
        adversary = cw.L2Adversary(targeted=False, confidence=0.0, search_steps=iters,
                                   box=inputs_box, optimizer_lr=cw_lr)
        adv = adversary(model, img, label, to_numpy=False)
        # BUG FIX: the adversary's result carries no populated .grad; the
        # original unconditional adv.grad.data.zero_() raised AttributeError.
        if adv.grad is not None:
            adv.grad.data.zero_()

    return adv.detach()