Example #1
def iterative_gradient(model,
                       x0,
                       pred_label,
                       step_size,
                       epsilon,
                       max_iters=80):
    x0_np = x0.cpu().numpy()
    x_after_np = np.copy(x0_np)
    # iterative perturbation
    x_after = x0.detach()
    cnt = 0
    while np.linalg.norm(x_after_np - x0_np) <= epsilon and cnt <= max_iters:
        _, _, x_after, _ = vanilla_gradient(model, x_after, pred_label,
                                            step_size)
        x_after = x_after.detach()
        x_after_np = x_after.cpu().numpy()
        cnt += 1
    x_delta = x_after - x0.cpu()
    grad_l2 = np.sum(x_delta.numpy()[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2)

    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    #print(changes_pred)

    return x_delta.numpy(), importance_score, x_after, changes_pred
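Example #1 (like most of the examples below) calls helpers that are not part of this excerpt: vanilla_gradient (shown later as Example #5), normalize_score, and logit2prob. A minimal sketch of the latter two, assuming a two-class classifier and min-max rescaling of per-token scores; the originals may differ:

import numpy as np

def logit2prob(logits):
    # softmax over the class logits (assumes a small 1-D logit vector)
    e = np.exp(logits - np.max(logits))
    return e / np.sum(e)

def normalize_score(score):
    # min-max rescale per-token scores into [0, 1] (assumption)
    score = score - np.min(score)
    denom = np.max(score)
    return score / denom if denom > 0 else score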
Example #2
def integrated_gradient(model, x, pred_label, step_size=0.02, n_iters=4):
    avg_grad = None
    for n in range(1, n_iters + 1):
        x_ = float(n) / n_iters * x
        x_ = x_.detach()
        gradient, _, _, _ = vanilla_gradient(model, x_, pred_label, step_size)
        if n == 1:
            avg_grad = gradient
        else:
            avg_grad += gradient
    avg_grad /= n_iters
    inte_grad = np.multiply(avg_grad, x.detach().cpu().data.numpy())
    scale = np.sum(inte_grad, axis=-1, keepdims=True)
    intp = np.multiply(avg_grad, scale)
    grad_l2 = np.sum(intp[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    intp /= np.sqrt(np.sum(intp[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after)
    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior

    return inte_grad, importance_score, x_after, changes_pred, avg_grad
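The loop above is the usual Riemann-sum approximation of integrated gradients with an all-zeros baseline: it averages vanilla gradients at n_iters points along the straight path from 0 to x and multiplies by x (the extra scale/intp step afterwards is specific to this code). A standalone restatement of that approximation, with a hypothetical grad_fn standing in for the gradient part of vanilla_gradient:

import numpy as np

def integrated_gradient_approx(grad_fn, x, n_iters=4):
    # IG(x) ~= x * mean_k grad_fn(k / n_iters * x), k = 1..n_iters (zero baseline)
    alphas = np.arange(1, n_iters + 1) / n_iters
    avg_grad = np.mean([grad_fn(a * x) for a in alphas], axis=0)
    return x * avg_grad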
Example #3
def smooth_gradient(model,
                    x0,
                    pred_label,
                    DEVICE,
                    step_size,
                    noise_range=0.02,
                    n_iters=20):
    smooth_grad = None
    for n in range(n_iters):
        x0_ = x0 + torch.randn(x0.shape).to(DEVICE) * noise_range
        gradient, _, _, _ = vanilla_gradient(model, x0_, pred_label)
        if n == 0:
            smooth_grad = gradient
        else:
            smooth_grad += gradient
    smooth_grad /= n_iters

    grad_l2 = np.sum(smooth_grad[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    smooth_grad /= np.sqrt(np.sum(
        smooth_grad[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, smooth_grad * step_size)
    x_after = torch.from_numpy(x_after)
    model.hidden = model.init_hidden()
    pred, _ = model(x_after)
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior

    return smooth_grad, importance_score, x_after, changes_pred
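The averaging loop here is plain SmoothGrad: the vanilla gradient is averaged over n_iters Gaussian-perturbed copies of the input, with noise standard deviation noise_range. A standalone restatement under that reading, again with a hypothetical grad_fn in place of vanilla_gradient:

import numpy as np

def smooth_gradient_approx(grad_fn, x, noise_range=0.02, n_iters=20, seed=0):
    # SmoothGrad: average the gradient over noisy copies of x
    rng = np.random.default_rng(seed)
    grads = [grad_fn(x + rng.standard_normal(x.shape) * noise_range)
             for _ in range(n_iters)]
    return np.mean(grads, axis=0)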
Example #4
def evaluate_word_2zero(model,
                        row,
                        pred_label,
                        inner1,
                        inner2,
                        DEVICE,
                        MAX_RM=4):
    x0, segments_ids, input_masks = row
    pred = model(inputs_embeds=x0,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    pred_change = np.zeros(MAX_RM)
    for n in range(1, MAX_RM + 1):
        x_after = word2zero_bert(x0.cpu().data.numpy(), input_masks, inner1,
                                 inner2, n)
        x_after = torch.from_numpy(x_after).to(DEVICE)
        pred = model(inputs_embeds=x_after,
                     token_type_ids=segments_ids,
                     attention_mask=input_masks,
                     labels=None)[0]
        p_after = logit2prob(pred[0].cpu().data.numpy())
        changes_pred = p_after - p_prior

        pred_change[n - 1] = -changes_pred[pred_label[0]]
    #print(pred_change)
    return pred_change
Example #5
def vanilla_gradient(model, x, pred_label, step_size=0.02):
    model.batch_size = 1
    model.hidden = model.init_hidden()
    x = x.cpu()
    x.requires_grad = True
    pred, _ = model(x)
    x_prior = x.data.numpy()
    p_prior = logit2prob(pred[0].data.numpy())

    one_hot = np.zeros((1, 2), dtype=np.float32)
    one_hot[0][pred_label[0]] = 1
    one_hot = torch.from_numpy(one_hot)
    one_hot.requires_grad = True
    one_hot = torch.sum(one_hot * pred[0])

    gradient = grad(one_hot, x)[0].numpy()
    grad_l2 = np.sum(gradient[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size
    gradient /= np.sqrt(np.sum(gradient[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x_prior)
    x_after = perturb_embedding(x_after, gradient * step_size)

    x_after = torch.from_numpy(x_after)
    model.hidden = model.init_hidden()
    pred, _ = model(x_after)
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    #print(pred_label)
    #print(importance_score)
    #print(changes_pred)

    return gradient, importance_score, x_after, changes_pred
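perturb_embedding is another helper that does not appear in this excerpt. Judging from how it is called (a numpy embedding tensor plus a unit-norm gradient scaled by step_size), a minimal stand-in could be a plain additive shift; the sign of the step and any extra clipping in the original may differ:

import numpy as np

def perturb_embedding(x, delta):
    # stand-in: shift the embeddings by delta (direction of the step is an
    # assumption; the original helper may subtract it instead)
    return (x + delta).astype(np.float32)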
Example #6
def gradient_times_input(model, row, pred_label, DEVICE, step_size=0.02):
    gradient, importance_score, x_after, changes_pred = vanilla_gradient(
        model, row, pred_label, DEVICE, step_size=step_size)
    x0, segments_ids, input_masks = row
    grad_times_input = np.multiply(gradient, x0.detach().cpu().data.numpy())
    scale = np.sum(grad_times_input, axis=-1, keepdims=True)
    intp = np.multiply(gradient, scale)
    grad_l2 = np.sum(intp[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    pred = model(inputs_embeds=x0,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())
    intp /= np.sqrt(np.sum(intp[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)
    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior

    return grad_times_input, importance_score, x_after, changes_pred
Example #7
def vanilla_gradient(model, row, pred_label, DEVICE, step_size=0.02):
    x, segments_ids, input_masks = row
    x.requires_grad = True
    pred = model(inputs_embeds=x,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    x_prior = x.cpu().data.numpy()
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    one_hot = np.zeros((1, 2), dtype=np.float32)
    one_hot[0][pred_label[0]] = 1
    one_hot = torch.from_numpy(one_hot).to(DEVICE)
    one_hot.requires_grad = True
    one_hot = torch.sum(one_hot * pred[0])

    gradient = grad(one_hot, x)[0].cpu().numpy()
    grad_l2 = np.sum(gradient[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size
    gradient_unit = gradient / np.sqrt(
        np.sum(gradient[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x_prior)
    x_after = perturb_embedding(x_after, gradient_unit * step_size)

    x_after = torch.from_numpy(x_after).to(DEVICE)
    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    # print(pred_label)
    # print(changes_pred)

    return gradient, importance_score, x_after, changes_pred
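In both vanilla_gradient variants, the one_hot tensor only serves to select the predicted-class logit, so the grad call differentiates a single scalar. A small self-contained check of that equivalence (a sketch, not part of the original code):

import torch
from torch.autograd import grad

torch.manual_seed(0)
x = torch.randn(1, 4, requires_grad=True)
logits = torch.nn.Linear(4, 2)(x)   # stand-in for the model's output logits
pred_label = 1
one_hot = torch.zeros(1, 2)
one_hot[0, pred_label] = 1
g1 = grad(torch.sum(one_hot * logits[0]), x, retain_graph=True)[0]
g2 = grad(logits[0, pred_label], x)[0]
assert torch.allclose(g1, g2)  # the two gradients match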
Example #8
def evaluate_word_removal(model, x0, pred_label, inner1, inner2, MAX_RM=4):
    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())

    pred_change = np.zeros(MAX_RM)
    for n in range(1, MAX_RM + 1):
        x_after = remove_word(x0.cpu().data.numpy(), inner1, inner2, n)
        x_after = torch.from_numpy(x_after)
        model.hidden = model.init_hidden()
        pred, _ = model(x_after.cpu())
        p_after = logit2prob(pred[0].data.numpy())
        changes_pred = p_after - p_prior

        pred_change[n - 1] = -changes_pred[pred_label[0]]

    return pred_change
Example #9
def integrated_gradient(model,
                        row,
                        pred_label,
                        DEVICE,
                        step_size=0.02,
                        n_iters=7):
    x, segments_ids, input_masks = row
    avg_grad = None
    for n in range(1, n_iters + 1):
        x_ = float(n) / n_iters * x
        x_ = x_.detach()
        gradient, _, _, _ = vanilla_gradient(model,
                                             [x_, segments_ids, input_masks],
                                             pred_label, DEVICE)
        if n == 1:
            avg_grad = gradient
        else:
            avg_grad += gradient
    avg_grad /= n_iters
    inte_grad = np.multiply(avg_grad, x.detach().cpu().data.numpy())
    scale = np.sum(inte_grad, axis=-1, keepdims=True)
    intp = np.multiply(avg_grad, scale)
    grad_l2 = np.sum(intp[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    pred = model(inputs_embeds=x,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())
    intp /= np.sqrt(np.sum(intp[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)
    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior

    return inte_grad, importance_score, x_after, changes_pred
Example #10
def evaluate_word_removal(model, row, inner1, inner2, DEVICE, MAX_RM=4):
    x0, segments_ids, input_masks = row
    pred = model(inputs_embeds=x0,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())

    pred_change = np.zeros(MAX_RM)
    for n in range(1, MAX_RM + 1):
        input_masks_after = remove_word_bert(input_masks.cpu(), inner1, inner2,
                                             n)
        pred = model(inputs_embeds=x0,
                     token_type_ids=segments_ids,
                     attention_mask=input_masks_after.to(DEVICE),
                     labels=None)[0]
        p_after = logit2prob(pred[0].cpu().data.numpy())
        changes_pred = p_after - p_prior

        pred_change[n - 1] = np.abs(changes_pred[0])
    return pred_change
Example #11
def gradient_times_input(model, x, pred_label, step_size=0.02):
    gradient, importance_score, x_after, changes_pred = vanilla_gradient(
        model, x.detach(), pred_label, step_size=step_size)
    grad_times_input = np.multiply(gradient, x.detach().cpu().data.numpy())
    scale = np.sum(grad_times_input, axis=-1, keepdims=True)
    intp = np.multiply(gradient, scale)
    grad_l2 = np.sum(intp[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size

    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    intp /= np.sqrt(np.sum(intp[:, 0, :]**2))  # normalize to unit length
    x_after = np.copy(x.cpu().data.numpy())
    x_after = perturb_embedding(x_after, intp * step_size)
    x_after = torch.from_numpy(x_after)
    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior

    return intp, importance_score, x_after, changes_pred
Example #12
def smooth_gradient(model, row, pred_label, DEVICE, step_size, n_iters=20):
    x0, segments_ids, input_masks = row
    noise_range = 0.4 * step_size
    smooth_grad = None
    for n in range(n_iters):
        noise = torch.randn(x0.shape)
        noise = noise / torch.sqrt(torch.sum(
            noise[0, :, :]**2)) * noise_range  # rescale noise to norm noise_range
        x0_ = x0 + noise.to(DEVICE)
        gradient, _, _, _ = vanilla_gradient(model,
                                             [x0_, segments_ids, input_masks],
                                             pred_label, DEVICE)
        if n == 0:
            smooth_grad = gradient
        else:
            smooth_grad += gradient
    smooth_grad /= n_iters

    grad_l2 = np.sum(smooth_grad[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2) * step_size
    pred = model(inputs_embeds=x0,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())
    smooth_grad /= np.sqrt(np.sum(
        smooth_grad[0, :, :]**2))  # normalize to unit length
    x_after = np.copy(x0.cpu().data.numpy())
    x_after = perturb_embedding(x_after, smooth_grad * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)
    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior

    return smooth_grad, importance_score, x_after, changes_pred
Example #13
def iterative_gradient(model,
                       row,
                       pred_label,
                       DEVICE,
                       step_size,
                       epsilon,
                       max_iters=40):
    x0, segments_ids, input_masks = row
    x0_np = x0.cpu().numpy()
    x_after_np = np.copy(x0_np)
    # iterative perturbation
    x_after = x0.detach()
    cnt = 0
    while np.linalg.norm(x_after_np - x0_np) <= epsilon and cnt <= max_iters:
        _, _, x_after, _ = vanilla_gradient(
            model, [x_after, segments_ids, input_masks], pred_label, DEVICE,
            step_size)
        x_after = x_after.clone().detach()
        x_after_np = x_after.cpu().numpy()
        cnt += 1
    x_delta = x_after - x0
    grad_l2 = np.sum(x_delta.cpu().numpy()[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2)

    pred = model(inputs_embeds=x0,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())
    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    # print(changes_pred)

    return x_delta.cpu().numpy(), importance_score, x_after, changes_pred
Example #14
def evaluate_attention_removal(model,
                               x0,
                               pred_label,
                               inner1,
                               inner2,
                               MAX_RM=4):
    # for attention methods, inner2 should be an all-ones vector
    mask = torch.ones([x0.shape[0], 1, 1])
    mask.requires_grad = False
    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu(), mask)
    p_prior = logit2prob(pred[0].data.numpy())

    pred_change = np.zeros(MAX_RM)
    for n in range(1, MAX_RM + 1):
        mask_new = remasking(mask, inner1, inner2, n)
        model.hidden = model.init_hidden()
        pred, _ = model(x0.cpu(), mask_new)
        p_after = logit2prob(pred[0].data.numpy())
        changes_pred = p_after - p_prior

        pred_change[n - 1] = -changes_pred[pred_label[0]]

    return pred_change