import torch

import cw  # assumes a PyTorch Carlini-Wagner L2 implementation exposing cw.L2Adversary


def cw_attack(model, dataloader, mean, std):
    # Valid pixel range after (x - mean) / std normalization.
    inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
                  max((1 - m) / s for m, s in zip(mean, std)))
    # an untargeted adversary
    adversary = cw.L2Adversary(targeted=False,
                               confidence=0.0,
                               search_steps=10,
                               box=inputs_box,
                               optimizer_lr=5e-4)

    inputs, targets = next(iter(dataloader))
    adversarial_examples = adversary(model, inputs, targets, to_numpy=False)

    # a targeted adversary
    adversary = cw.L2Adversary(targeted=True,
                               confidence=0.0,
                               search_steps=10,
                               box=inputs_box,
                               optimizer_lr=5e-4)

    inputs, _ = next(iter(dataloader))
    # a batch of attack targets (class 3 for every sample); the adversary expects integer labels
    attack_targets = (torch.ones(inputs.size(0)) * 3).long()
    adversarial_examples = adversary(model, inputs, attack_targets, to_numpy=False)
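
The inputs_box bounds come straight from the normalization: a pixel value x in [0, 1] maps to (x - mean) / std, so the widest scalar box that still covers every channel's valid range is (min((0 - m) / s), max((1 - m) / s)). A minimal sketch with hypothetical CIFAR-10-style statistics:

# Hypothetical per-channel statistics (CIFAR-10-style values), for illustration only.
mean = (0.4914, 0.4822, 0.4465)
std = (0.2470, 0.2435, 0.2616)
inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
              max((1 - m) / s for m, s in zip(mean, std)))
# -> roughly (-1.99, 2.13): a single scalar interval containing every normalized pixel value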
Example no. 2
# Truncated in the original: only the tail of the dataset class survives.
class testdata(torch.utils.data.Dataset):
    def __getitem__(self, idx):
        ...  # image loading and label lookup elided in the source
        return img, label

    def __len__(self):
        return len(self.imgpath_list)

testloader = torch.utils.data.DataLoader(testdata(), batch_size=bs, shuffle=False, num_workers=1)
inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
              max((1 - m) / s for m, s in zip(mean, std)))

for k in net_dict.keys():
    print(k)
    net = net_dict[k]
    for c in confidences:
        os.mkdir('CW/'+k+'_'+str(int(c)))
        adversary = cw.L2Adversary(targeted=False,
                                   confidence=c,
                                   search_steps=30,
                                   box=inputs_box,
                                   optimizer_lr=5e-4)
        total = 0
        correct = 0
        count = 0
        norm = np.array([])
        for data in testloader:
            inputs, targets = data
            inputs = inputs.cuda()
            targets = targets.type(torch.LongTensor).cuda()
            adversarial_examples = adversary(net, inputs, targets, to_numpy=False)
            outputs_norm = net(inputs)
            outputs_ae   = net(adversarial_examples)
            _, labels_norm = outputs_norm.max(1)
            _, labels_ae   = outputs_ae.max(1)
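
The per-batch loop is truncated at this point. A self-contained sketch of the bookkeeping it appears to be building toward; the helper name and exact logic below are assumptions, not part of the original:

import numpy as np
import torch


def robust_stats(inputs, adversarial_examples, targets, labels_ae):
    """Hypothetical helper: robust-accuracy terms plus per-sample L2 perturbation norms."""
    correct = labels_ae.eq(targets).sum().item()  # adversarial predictions that stay correct
    total = targets.size(0)
    delta = (adversarial_examples.to(inputs.device) - inputs).view(total, -1)
    l2 = delta.norm(p=2, dim=1).cpu().numpy()     # L2 distortion per example
    return correct, total, l2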
Example no. 3
# Assumes the ART toolbox for the gradient-based attacks; `testloader` and `progress`
# come from the surrounding script and are not defined in this example.
from art.attacks.evasion import (FastGradientMethod, ProjectedGradientDescent,
                                 SaliencyMapMethod)
import numpy as np
import torch

import cw  # PyTorch Carlini-Wagner L2 implementation exposing cw.L2Adversary


def test_robust(opt, model, classifier, attack_method, c, norm=None):
    if opt.attack == 'FGSM':
        adv_crafter = FastGradientMethod(classifier,
                                         norm=norm,
                                         eps=c,
                                         targeted=False,
                                         num_random_init=0,
                                         batch_size=opt.bs)
    if opt.attack == 'PGD':
        adv_crafter = ProjectedGradientDescent(classifier,
                                               norm=norm,
                                               eps=c,
                                               eps_step=c / 10.,
                                               max_iter=10,
                                               targeted=False,
                                               num_random_init=1,
                                               batch_size=opt.bs)
    if opt.attack == 'BIM':
        adv_crafter = ProjectedGradientDescent(classifier,
                                               norm=norm,
                                               eps=c,
                                               eps_step=c / 10.,
                                               max_iter=10,
                                               targeted=False,
                                               num_random_init=0,
                                               batch_size=opt.bs)
    if opt.attack == 'JSMA':
        adv_crafter = SaliencyMapMethod(classifier,
                                        theta=0.1,
                                        gamma=c,
                                        batch_size=opt.bs)
    if opt.attack == 'CW':
        adv_crafter = cw.L2Adversary(targeted=False,
                                     confidence=0.01,
                                     c_range=(c, 1e10),
                                     max_steps=1000,
                                     abort_early=False,
                                     search_steps=5,
                                     box=(0., 1.0),
                                     optimizer_lr=0.01)

    correct = 0
    total = 0
    total_sum = 0
    common_id = []
    for batch_idx, (inputs, targets) in enumerate(testloader):
        inputs, targets = inputs.cuda(), targets.cuda()
        output = classifier.predict(inputs.cpu().numpy(), batch_size=opt.bs)
        output = torch.tensor(output)
        output = output.cuda()
        init_pred = output.max(1, keepdim=False)[1]
        common_id = np.where(
            init_pred.cpu().numpy() == targets.cpu().numpy())[0]

        if opt.attack == 'CW':
            x_test_adv = adv_crafter(model, inputs, targets, to_numpy=True)
        else:
            x_test_adv = adv_crafter.generate(x=inputs.cpu().numpy())

        perturbed_output = classifier.predict(x_test_adv)
        perturbed_output = torch.tensor(perturbed_output)
        perturbed_output = perturbed_output.cuda()
        final_pred = perturbed_output.max(1, keepdim=False)[1]
        total_sum += targets.size(0)
        total += len(common_id)
        correct += final_pred[common_id].eq(
            targets[common_id].data).cpu().sum()
        attack_acc = 100. * float(correct) / total

        progress.progress_bar(
            batch_idx, len(testloader),
            'Attack Strength:%.3f, robust accuracy: %.3f%% (%d/%d)'
            '' % (c, attack_acc, correct, total))
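
A hypothetical driver for test_robust: `opt` is assumed to expose at least `attack` and `bs`, and `model`, `classifier`, and the global `testloader` are assumed to have been built earlier (that setup is not shown in the original):

from argparse import Namespace
import numpy as np

# Sweep a few attack strengths for one attack type (values are illustrative).
opt = Namespace(attack='PGD', bs=128)
for c in (2 / 255, 4 / 255, 8 / 255):
    test_robust(opt, model, classifier, opt.attack, c, norm=np.inf)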
Example no. 4
def attack(model, criterion, img, label, eps, attack_type, iters, mean, std, dataset='mnist', black_box=False):
    adv = img.clone().detach()
    adv.requires_grad = True

    if attack_type == 'fgsm' or attack_type == 'cw':
        iterations = 1
    else:
        iterations = iters

    if attack_type == 'pgd':
        step = 2 / 255
    else:
        step = eps / iterations

    noise = 0

    if dataset == 'mnist' or dataset == 'fmnist':
        for j in range(iterations):
            if not black_box:
                out_adv, _, _ = model(normalize(adv.clone(), mean, std, dataset))
            else:
                out_adv = model(normalize(adv.clone(), mean, std, dataset))
            loss = criterion(out_adv, label)
            # loss = F.nll_loss(out_adv, label)
            loss.backward()

            if attack_type == 'mim':
                adv_mean = torch.mean(torch.abs(adv.grad), dim=1, keepdim=True)
                adv.grad = adv.grad / adv_mean
                noise = noise + adv.grad
            else:
                noise = adv.grad

            # Optimization step
            adv.data = adv.data + step * noise.sign()
            #        adv.data = adv.data + step * adv.grad.sign()

            if attack_type == 'pgd':
                adv.data = torch.where(adv.data > img.data + eps, img.data + eps, adv.data)
                adv.data = torch.where(adv.data < img.data - eps, img.data - eps, adv.data)

            adv.data.clamp_(0.0, 1.0)

            if attack_type == 'cw':
                inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
                              max((1 - m) / s for m, s in zip(mean, std)))
                # an untargeted adversary
                adversary = cw.L2Adversary(targeted=False,
                                           confidence=0.0,
                                           search_steps=iters,
                                           box=inputs_box,
                                           optimizer_lr=5e-4)

                adv = adversary(model, img, label, to_numpy=False)

            if adv.grad is not None:  # the CW branch returns a fresh tensor with no grad
                adv.grad.data.zero_()
    else:
        for j in range(iterations):
            if not black_box:
                out_adv, _, _ = model(normalize(adv.clone(), mean, std, dataset))
            else:
                out_adv = model(normalize(adv.clone(), mean, std, dataset))
            loss = criterion(out_adv, label)
            loss.backward()

            if attack_type == 'mim':
                adv_mean = torch.mean(torch.abs(adv.grad), dim=1, keepdim=True)
                adv_mean = torch.mean(torch.abs(adv_mean), dim=2, keepdim=True)
                adv_mean = torch.mean(torch.abs(adv_mean), dim=3, keepdim=True)
                adv.grad = adv.grad / adv_mean
                noise = noise + adv.grad
            else:
                noise = adv.grad

            # Optimization step
            adv.data = adv.data + step * noise.sign()
            #        adv.data = adv.data + step * adv.grad.sign()

            if attack_type == 'pgd':
                adv.data = torch.where(adv.data > img.data + eps, img.data + eps, adv.data)
                adv.data = torch.where(adv.data < img.data - eps, img.data - eps, adv.data)
            adv.data.clamp_(0.0, 1.0)

            if attack_type == 'cw':
                inputs_box = (min((0 - m) / s for m, s in zip(mean, std)),
                              max((1 - m) / s for m, s in zip(mean, std)))
                # an untargeted adversary
                adversary = cw.L2Adversary(targeted=False,
                                           confidence=0.0,
                                           search_steps=iters,
                                           box=inputs_box,
                                           optimizer_lr=1e-2)

                adv = adversary(model, img, label, to_numpy=False)

            if adv.grad is not None:  # the CW branch returns a fresh tensor with no grad
                adv.grad.data.zero_()

    return adv.detach()
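
attack() relies on a normalize helper that never appears in these examples. A minimal per-channel sketch consistent with how it is called; the body is an assumption, not the original implementation:

import torch


def normalize(x, mean, std, dataset='mnist'):
    """Sketch of the assumed helper: maps a [0, 1] batch to (x - mean) / std per channel.

    `dataset` is accepted only to match the call sites; this sketch ignores it.
    """
    mean = torch.tensor(mean, dtype=x.dtype, device=x.device).view(1, -1, 1, 1)
    std = torch.tensor(std, dtype=x.dtype, device=x.device).view(1, -1, 1, 1)
    return (x - mean) / std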