def test_attack_convergence(bn_model, bn_criterion, bn_images, bn_labels):
    attack = BoundaryAttack(bn_model, bn_criterion)
    advs = attack(bn_images, bn_labels, unpack=False, verbose=True)
    for adv in advs:
        assert adv.perturbed is not None
        assert adv.distance.value < np.inf

    attack2 = BoundaryAttack(bn_model, bn_criterion)
    bn_images2 = np.array([adv.perturbed for adv in advs])
    advs2 = attack2(bn_images2, bn_labels, unpack=False, iterations=5000)
    for adv in advs2:
        # should converge
        assert adv.perturbed is not None
        assert adv.distance.value < np.inf
Example #2
def test_attack_convergence(bn_adversarial):
    adv = bn_adversarial
    attack1 = DeepFoolAttack()
    attack1(adv)
    attack2 = BoundaryAttack()
    attack2(adv, iterations=5000, verbose=True)
    # should converge
    assert adv.image is not None
    assert adv.distance.value < np.inf
Example #3
def boundary_attack(model, img, target):
    # Scale the image to [0, 1] for the foolbox model, run a targeted
    # BoundaryAttack, and return the adversarial rescaled to [0, 255].
    img_01 = (img / 255).astype(np.float32)
    atk = BoundaryAttack(model, TargetClass(target))

    # binary setting: the original class is the one other than the target
    label = 1 - target
    adv = atk(img_01, label, iterations=1000, verbose=False,
              log_every_n_steps=100)
    if adv is not None:
        adv = np.clip(adv * 255, 0, 255)
    return adv
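A minimal usage sketch for the helper above, assuming the foolbox 1.x/2.x API used throughout these examples; `net` (a trained two-class torch.nn.Module) and `raw_image` (a uint8 image in [0, 255]) are placeholders, not names from the snippet:

import numpy as np
import foolbox
from foolbox.attacks import BoundaryAttack
from foolbox.criteria import TargetClass

# Wrap the classifier so foolbox can query it; bounds match the [0, 1]
# scaling done inside boundary_attack.
fmodel = foolbox.models.PyTorchModel(net, bounds=(0, 1), num_classes=2)

# Push the image toward target class 1; returns an image in [0, 255]
# or None if no adversarial was found.
adv_image = boundary_attack(fmodel, raw_image, target=1)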
Example #4
def test_attack_continue(bn_adversarial):
    adv = bn_adversarial
    attack1 = BlendedUniformNoiseAttack()
    attack1(adv)
    d1 = adv.distance.value
    attack2 = BoundaryAttack()
    attack2(adv, iterations=200, verbose=True)
    assert adv.image is not None
    assert adv.distance.value < np.inf
    assert adv.distance.value < d1
Example #5
def test_attack(bn_model, bn_criterion, bn_images, bn_labels):
    attack = BoundaryAttack(bn_model, bn_criterion)
    advs = attack(bn_images,
                  bn_labels,
                  unpack=False,
                  iterations=200,
                  verbose=True)
    for adv in advs:
        assert adv.perturbed is not None
        assert adv.distance.value < np.inf
Example #6
def test_attack_parameters3(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    o = adv.original_image
    starting_point = np.random.uniform(0, 1, size=o.shape).astype(o.dtype)
    attack(adv,
           iterations=200,
           starting_point=starting_point,
           log_every_n_steps=2,
           tune_batch_size=30,
           threaded_rnd=False,
           threaded_gen=False,
           verbose=True)
    assert adv.image is not None
    assert adv.distance.value < np.inf
Example #7
def test_attack_parameters(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    o = adv.unperturbed
    np.random.seed(2)
    starting_point = np.random.uniform(0, 1, size=o.shape).astype(o.dtype)
    attack(adv,
           iterations=200,
           starting_point=starting_point,
           log_every_n_steps=2,
           tune_batch_size=False,
           threaded_rnd=False,
           threaded_gen=False,
           alternative_generator=True,
           verbose=True)
    assert adv.perturbed is not None
    assert adv.distance.value < np.inf
Example #8
def test_attack_parameters3(bn_model, bn_criterion, bn_images, bn_labels):
    attack = BoundaryAttack(bn_model, bn_criterion)
    np.random.seed(2)
    starting_point = np.random.uniform(0, 1, size=bn_images[0].shape).astype(
        bn_images.dtype)
    advs = attack(
        bn_images,
        bn_labels,
        unpack=False,
        iterations=200,
        starting_point=starting_point,
        log_every_n_steps=2,
        tune_batch_size=30,
        threaded_rnd=False,
        threaded_gen=False,
        verbose=True,
    )

    for adv in advs:
        assert adv.perturbed is not None
        assert adv.distance.value < np.inf
Example #9
elif params.get('metric') == 'mahalanobis':
    psd_matrix = np.loadtxt(params.get('psd_matrix_path'))
    psd_matrix = torch.tensor(psd_matrix,
                              dtype=torch.float,
                              device=params.get('device'))

else:
    raise Exception('unsupported metric')

knn_module = MahalanobisKnnModule(X_train, y_train, params.getint('k'),
                                  psd_matrix)
knn_module.to(params.get('device'))
knn_module.eval()

fmodel = PyTorchModel(knn_module, bounds=(0, 1), device=params.get('device'))
attack = BoundaryAttack()

n_eval = params.getint('n_eval')
perturbations_list = []

for i, (X_eval, y_eval) in enumerate(
        zip(
            torch.split(X_test[:n_eval], params.getint('attack_batch_size')),
            torch.split(y_test[:n_eval], params.getint('attack_batch_size')),
        )):
    print(i)
    _, advs, successful = attack(
        fmodel,
        X_eval,
        y_eval,
        epsilons=None,
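The call above is cut off by the source page. For orientation, here is a minimal, self-contained sketch of the foolbox 3.x calling convention it relies on; the pretrained model, bundled sample images, and epsilon value are stand-ins, not part of the original snippet:

import torchvision.models as models
import foolbox as fb

# Hedged sketch: pretrained ImageNet classifier and foolbox's bundled samples.
model = models.resnet18(pretrained=True).eval()
preprocessing = dict(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], axis=-3)
fmodel = fb.PyTorchModel(model, bounds=(0, 1), preprocessing=preprocessing)

images, labels = fb.utils.samples(fmodel, dataset="imagenet", batchsize=4)

attack = fb.attacks.BoundaryAttack(steps=500)  # real runs use far more steps
raw_advs, clipped_advs, success = attack(fmodel, images, labels, epsilons=[10.0])
print("success rate:", success.float().mean(dim=-1).item())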
Example #10
fmodel = KerasModel(kmodel, bounds=(-1, 1))

# label of the target class
preds = kmodel.predict(dog_x)
dog_label = np.argmax(preds)

# label of the original class
preds = kmodel.predict(cat_x)
cat_label = np.argmax(preds)

criterion_1 = TopKMisclassification(k=5)
criterion_2 = TargetClass(dog_label)
criterion_3 = TargetClassProbability(dog_label, p=0.5)
criterion = criterion_1 & criterion_2 & criterion_3

attack = BoundaryAttack(model=fmodel, criterion=criterion)

iteration_size = 1000
global_iterations = 0
# Run boundary attack to generate an adversarial example
adversarial = attack(cat_img,
                     label=cat_label,
                     unpack=False,
                     iterations=iteration_size,
                     starting_point=dog_img,
                     log_every_n_steps=10,
                     verbose=True)
global_iterations += iteration_size

np.save('adversarial_image_{0}'.format(global_iterations), adversarial.image)
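Because the attack was called with unpack=False, it returns a foolbox Adversarial object, so the search can be resumed for additional iterations (the same pattern as test_attack_continue above). A hedged continuation sketch:

# Resume the boundary attack on the same Adversarial object and save again.
attack(adversarial,
       iterations=iteration_size,
       log_every_n_steps=10,
       verbose=True)
global_iterations += iteration_size
np.save('adversarial_image_{0}'.format(global_iterations), adversarial.image)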
Example #11
def main():
    X_train = np.load("./Data/sGrid/X_train.npy")
    X_test = np.load("./Data/sGrid/X_test.npy")
    X_vaild = np.load("./Data/sGrid/X_vaild.npy")
    Y_train = np.load("./Data/sGrid/Y_train.npy")
    Y_test = np.load("./Data/sGrid/Y_test.npy")
    Y_vaild = np.load("./Data/sGrid/Y_vaild.npy")

    torch.manual_seed(1)
    embedding = nn.Embedding(128, 5, max_norm=1)

    Y_train = torch.from_numpy(Y_train)
    Y_test = torch.from_numpy(Y_test)
    Y_vaild = torch.from_numpy(Y_vaild)

    input = Variable(torch.from_numpy(X_train * 128).long())
    X_train_embed = embedding(input)
    X_train_embed = X_train_embed.detach()

    input = Variable(torch.from_numpy(X_test * 128).long())
    X_test_embed = embedding(input)
    X_test_embed = X_test_embed.detach()

    input = Variable(torch.from_numpy(X_vaild * 128).long())
    X_vaild_embed = embedding(input)
    X_vaild_embed = X_vaild_embed.detach()

    dic = {}
    count = 0
    for i in range(X_train.shape[0]):
        for j in range(400):
            if chr(int(X_train[i, j] * 128)) not in dic.keys():
                dic[chr(int(X_train[i, j] * 128))] = X_train_embed[i, j]

    symbol_dict = dic

    args = Args()

    net = CNN_Text_dropout(args).cuda()
    print(net)

    pretrained_dict = torch.load(
        'Parameters/cnn_text_kernel3.5.7.9_128_embed_dropout.pkl').state_dict(
        )
    model_dict = net.state_dict()
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k in model_dict
    }
    # update the existing model_dict
    model_dict.update(pretrained_dict)
    # load only the state_dict entries we actually need
    net.load_state_dict(model_dict)

    batch_size = 500
    Train_data = Data.TensorDataset(X_train_embed, Y_train)
    Test_data = Data.TensorDataset(X_test_embed, Y_test)
    train_data = Data.DataLoader(dataset=Train_data,
                                 batch_size=batch_size,
                                 shuffle=False)
    test_data = Data.DataLoader(dataset=Test_data, batch_size=1, shuffle=False)
    optimizer = optim.Adam(net.parameters(), lr=0.0001, weight_decay=1e-9)
    loss_function = nn.CrossEntropyLoss()

    attack_log_list = None
    attack_log_string_list = []

    net.eval()

    # This is the beginning of the attack
    # model and boundary attack
    model = PyTorchModel(
        net,
        (-1, 1),
        2,
    )
    attack = BoundaryAttack(model)

    # find the nearest attack sample as the starting point
    X_test_string = find_string_from_tensor(X_test)
    dict_attack_string_tensor = {}
    for i in range(len(X_train)):
        x, label = X_train[i], int(Y_train[i].numpy()[0])
        # the model must also classify this attack sample (label 1) as an attack
        if label == 1 and np.argmax(model.predictions(
                X_train_embed[i].numpy())) == 1:
            string = ""
            for v in x:
                string += chr(int(v * 128))
            ''' duplication of attack
            if string in dict_attack_string_tensor:
                print(string)
            '''
            dict_attack_string_tensor[string] = X_train_embed[i]

    n_test = 100
    dict_nearest_str = find_nearest_adversial(
        X_test_string[:n_test], list(dict_attack_string_tensor.keys()),
        str_similarity)
    list_X_test_nearest_tensor = []
    for log in X_test_string[:n_test]:
        list_X_test_nearest_tensor.append(
            dict_attack_string_tensor[dict_nearest_str[log]])

    # begin the attack
    try_time = 1
    max_iteration = 50
    n_success = 0
    n_total = 0
    iterations = []

    file = open(
        f'./Data/boundary_attack_unfixed_iteration_nearest_starting_max_{max_iteration}_test_{n_test}.txt',
        "w")
    for i in tqdm.tqdm_notebook(range(n_test)):
        url, label = X_test_embed[i].numpy(), int(Y_test[i].numpy()[0])
        prediction = np.argmax(model.predictions(url))

        if label == 0 and prediction == 0:
            n_total += 1
            good_adversarial = None
            good_iteration = 0
            for iteration in range(max_iteration + 1):
                adversarial = attack(
                    url,
                    label,
                    starting_point=list_X_test_nearest_tensor[i].numpy(),
                    log_every_n_steps=20,
                    iterations=iteration)

                # adversarial log
                str_adversarial = Tensor_to_Log(symbol_dict,
                                                torch.from_numpy(adversarial))
                # need to change the adversarial string back to the tensor
                prediction = np.argmax(
                    model.predictions(
                        Log_to_Tensor(symbol_dict, str_adversarial).numpy()))
                if prediction == 1:
                    good_iteration = iteration
                    good_adversarial = adversarial

            if good_adversarial is not None:
                n_success += 1
                iterations.append(good_iteration)
                # original log
                file.write(X_test_string[i])
                file.write("\n")

                # adversarial log
                file.write(
                    Tensor_to_Log(symbol_dict,
                                  torch.from_numpy(good_adversarial)))
                file.write("\n\n")

    file.close()
Example #12
def test_attack_gl(gl_bn_adversarial):
    adv = gl_bn_adversarial
    attack = BoundaryAttack()
    attack(adv, iterations=200, verbose=True)
    assert adv.image is not None
    assert adv.distance.value < np.inf
Example #13
def test_attack_parameters2(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    attack(adv, iterations=200, alternative_generator=True, verbose=True)
    assert adv.image is not None
    assert adv.distance.value < np.inf
Example #14
def test_attack_non_verbose(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    attack(adv, iterations=200, verbose=False)
    assert adv.image is not None
    assert adv.distance.value < np.inf
Example #15
def test_attack(bn_adversarial):
    adv = bn_adversarial
    attack = BoundaryAttack()
    attack(adv, iterations=200, verbose=True)
    assert adv.perturbed is not None
    assert adv.distance.value < np.inf
Example #16
def test_attack_impossible(bn_impossible):
    adv = bn_impossible
    attack = BoundaryAttack()
    attack(adv, iterations=200, verbose=True)
    assert adv.image is None
    assert adv.distance.value == np.inf
Example #17
def attack_run_rejection_policy(model, hps):
    """
    An attack run with rejection policy.
    :param model: Pytorch model.
    :param adversary: Advertorch adversary.
    :param hps: hyperparameters
    :return:
    """
    model.eval()

    # Get thresholds
    threshold_list1 = []
    threshold_list2 = []
    for label_id in range(hps.n_classes):
        # No data augmentation (crop_flip=False) when computing in-distribution thresholds
        dataset = get_dataset(data_name=hps.problem,
                              train=True,
                              label_id=label_id,
                              crop_flip=False)
        in_test_loader = DataLoader(dataset=dataset,
                                    batch_size=hps.n_batch_test,
                                    shuffle=False)

        print('Inference on {}, label_id {}'.format(hps.problem, label_id))
        in_ll_list = []
        for batch_id, (x, y) in enumerate(in_test_loader):
            x = x.to(hps.device)
            y = y.to(hps.device)
            ll = model(x)

            correct_idx = ll.argmax(dim=1) == y

            # keep only the samples classified correctly by the clean model
            ll_, y_ = ll[correct_idx], y[correct_idx]
            in_ll_list += list(ll_[:, label_id].detach().cpu().numpy())

        thresh_idx = int(0.01 * len(in_ll_list))
        thresh1 = sorted(in_ll_list)[thresh_idx]
        thresh_idx = int(0.02 * len(in_ll_list))
        thresh2 = sorted(in_ll_list)[thresh_idx]
        threshold_list1.append(thresh1)  # 1st percentile as threshold
        threshold_list2.append(thresh2)  # 2nd percentile as threshold
        print('1st & 2nd percentile thresholds: {:.3f}, {:.3f}'.format(
            thresh1, thresh2))

    # Evaluation
    n_eval = 0  # total number of correctly classified samples by the clean classifier
    n_successful_adv = 0  # total number of successful adversarial examples generated
    n_rejected_adv1 = 0  # total number of successfully rejected (successful) adversarial examples, <= n_successful_adv
    n_rejected_adv2 = 0  # total number of successfully rejected (successful) adversarial examples, <= n_successful_adv

    attack_path = os.path.join(hps.attack_dir, hps.attack)
    if not os.path.exists(attack_path):
        os.mkdir(attack_path)

    thresholds1 = torch.tensor(threshold_list1).to(hps.device)
    thresholds2 = torch.tensor(threshold_list2).to(hps.device)

    l2_distortion_list = []

    fmodel = foolbox.models.PyTorchModel(model, bounds=(0, 1.), num_classes=10)

    hps.n_batch_test = 1
    dataset = get_dataset(data_name=hps.problem, train=False)
    test_loader = DataLoader(dataset=dataset,
                             batch_size=hps.n_batch_test,
                             shuffle=False)

    for batch_id, (x, y) in enumerate(test_loader):
        # Note that images are scaled to [0., 1.0]
        x, y = x.to(hps.device), y.to(hps.device)
        with torch.no_grad():
            output = model(x)

        pred = output.argmax(dim=1)
        if pred != y:
            continue

        n_eval += 1

        img, label = x[0], y[0]
        if hps.attack == 'boundary':
            attack = BoundaryAttack(fmodel)
            adv_x = attack(img.cpu().numpy(),
                           label.cpu().numpy(),
                           log_every_n_steps=10000)
        elif hps.attack == 'deepfool':
            attack = DeepFoolL2Attack(fmodel)
            adv_x = attack(img.cpu().numpy(), label.cpu().numpy())
        elif hps.attack == 'local':
            attack = LocalSearchAttack(fmodel)
            adv_x = attack(img.cpu().numpy(), label.cpu().numpy())
        elif hps.attack == 'spatial':
            attack = SpatialAttack(fmodel)
            adv_x = attack(img.cpu().numpy(), label.cpu().numpy())
        elif hps.attack == 'jsma':
            attack = SaliencyMapAttack(fmodel)  # Jacobian-based saliency map attack (JSMA)
            adv_x = attack(img.cpu().numpy(), label.cpu().numpy())
        else:
            raise ValueError('param attack {} not available.'.format(
                hps.attack))

        adv_x = torch.tensor(adv_x).unsqueeze(dim=0).to(hps.device)

        with torch.no_grad():
            output = model(adv_x)

        logit, pred = output.max(dim=1)

        if pred != label:
            n_successful_adv += 1

        diff = adv_x - x
        # per-image L2 distortion, averaged over the batch
        l2_distortion = diff.reshape(diff.size(0), -1).norm(p=2, dim=1).mean().item()
        l2_distortion_list.append(l2_distortion)

        if logit < thresholds1[pred]:
            n_rejected_adv1 += 1

        if logit < thresholds2[pred]:
            n_rejected_adv2 += 1

        if batch_id == 100:
            print('Evaluating on {}-th batch ...'.format(batch_id))
            break  # stop after the first 100 test samples (batch size is 1)

    reject_rate1 = n_rejected_adv1 / n_successful_adv
    reject_rate2 = n_rejected_adv2 / n_successful_adv
    success_adv_rate = n_successful_adv / n_eval
    print('success rate of adv examples generation: {}/{}={:.4f}'.format(
        n_successful_adv, n_eval, success_adv_rate))
    print('Mean L2 distortion of Adv Examples: {:.4f}'.format(
        np.mean(l2_distortion_list)))
    print('1st percentile, reject success rate: {}/{}={:.4f}'.format(
        n_rejected_adv1, n_successful_adv, reject_rate1))
    print('2nd percentile, reject success rate: {}/{}={:.4f}'.format(
        n_rejected_adv2, n_successful_adv, reject_rate2))