def iterative_gradient(model, x0, pred_label, step_size, epsilon, max_iters=80):
    """Iteratively perturb the input embedding along the vanilla gradient.

    Repeats single-step gradient perturbations (via ``vanilla_gradient``)
    until the accumulated perturbation leaves the L2 ball of radius
    ``epsilon`` around ``x0`` or the step budget is exhausted.

    Args:
        model: recurrent classifier exposing ``hidden`` / ``init_hidden()``.
        x0: input embedding tensor; per-word scores are taken over axis 0
            with ``[:, 0, :]`` indexing (assumes shape (seq, 1, dim) —
            TODO confirm against the model).
        pred_label: sequence whose first element is the predicted class index.
        step_size: per-iteration perturbation magnitude passed through to
            ``vanilla_gradient``.
        epsilon: L2 budget for the total perturbation.
        max_iters: maximum number of perturbation steps.

    Returns:
        Tuple ``(x_delta, importance_score, x_after, changes_pred)``:
        total perturbation as a numpy array, normalized per-word L2 of
        that perturbation, the perturbed tensor, and the change in class
        probabilities.
    """
    x0_np = x0.cpu().numpy()
    x_after_np = np.copy(x0_np)
    # iterative perturbation
    x_after = x0.detach()
    cnt = 0
    # `cnt < max_iters` (not `<=`): the original bound performed one extra
    # step beyond the stated maximum (off-by-one).
    while np.linalg.norm(x_after_np - x0_np) <= epsilon and cnt < max_iters:
        _, _, x_after, _ = vanilla_gradient(model, x_after, pred_label,
                                            step_size)
        x_after = x_after.detach()
        x_after_np = x_after.cpu().numpy()
        cnt += 1
    x_delta = x_after - x0.cpu()
    grad_l2 = np.sum(x_delta.numpy()[:, 0, :]**2, axis=1)
    importance_score = normalize_score(grad_l2)
    # Class probabilities before the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    # Class probabilities after the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return x_delta.numpy(), importance_score, x_after, changes_pred
def integrated_gradient(model, x, pred_label, step_size=0.02, n_iters=4):
    """Integrated-gradient attribution for the recurrent model.

    Averages vanilla gradients along the straight path from the zero
    embedding to ``x`` (``n_iters`` interpolation points), multiplies by
    the input, and takes one perturbation step along the resulting
    direction.

    Returns ``(inte_grad, importance_score, x_after, changes_pred,
    avg_grad)``.
    """
    grads = []
    for step in range(1, n_iters + 1):
        x_interp = (float(step) / n_iters * x).detach()
        g, _, _, _ = vanilla_gradient(model, x_interp, pred_label, step_size)
        grads.append(g)
    avg_grad = sum(grads) / n_iters
    inte_grad = np.multiply(avg_grad, x.detach().cpu().data.numpy())
    # Per-embedding scale: sum of gradient*input over the feature axis.
    scale = np.sum(inte_grad, axis=-1, keepdims=True)
    intp = np.multiply(avg_grad, scale)
    importance_score = normalize_score(np.sum(intp[:, 0, :]**2,
                                              axis=1)) * step_size
    # Probabilities before the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    # normalize to unit length
    intp /= np.sqrt(np.sum(intp[:, 0, :]**2))
    x_after = perturb_embedding(np.copy(x.cpu().data.numpy()),
                                intp * step_size)
    x_after = torch.from_numpy(x_after)
    # Probabilities after the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return inte_grad, importance_score, x_after, changes_pred, avg_grad
def smooth_gradient(model, x0, pred_label, DEVICE, step_size, noise_range=0.02, n_iters=20):
    """SmoothGrad attribution for the recurrent model.

    Averages vanilla gradients over ``n_iters`` Gaussian-noised copies of
    ``x0`` (noise scaled by ``noise_range``), then perturbs the embedding
    one step along the averaged, unit-normalized gradient.

    Returns ``(smooth_grad, importance_score, x_after, changes_pred)``.
    """
    total = None
    for _ in range(n_iters):
        noisy = x0 + torch.randn(x0.shape).to(DEVICE) * noise_range
        g, _, _, _ = vanilla_gradient(model, noisy, pred_label)
        total = g if total is None else total + g
    smooth_grad = total / n_iters
    importance_score = normalize_score(np.sum(smooth_grad[:, 0, :]**2,
                                              axis=1)) * step_size
    # Probabilities before the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    # normalize to unit length
    smooth_grad /= np.sqrt(np.sum(smooth_grad[:, 0, :]**2))
    x_after = perturb_embedding(np.copy(x0.cpu().data.numpy()),
                                smooth_grad * step_size)
    x_after = torch.from_numpy(x_after)
    # Probabilities after the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x_after)
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return smooth_grad, importance_score, x_after, changes_pred
def evaluate_word_2zero(model, row, pred_label, inner1, inner2, DEVICE, MAX_RM=4):
    """Measure the prediction drop when top-ranked words are zeroed (BERT).

    For n = 1..MAX_RM, zeroes n word embeddings via ``word2zero_bert`` and
    records the drop in the predicted class's probability.

    Returns a numpy array of length ``MAX_RM``.
    """
    x0, segments_ids, input_masks = row
    logits = model(inputs_embeds=x0,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_prior = logit2prob(logits[0].cpu().data.numpy())
    pred_change = np.zeros(MAX_RM)
    for k in range(MAX_RM):
        zeroed = word2zero_bert(x0.cpu().data.numpy(), input_masks, inner1,
                                inner2, k + 1)
        zeroed = torch.from_numpy(zeroed).to(DEVICE)
        logits = model(inputs_embeds=zeroed,
                       token_type_ids=segments_ids,
                       attention_mask=input_masks,
                       labels=None)[0]
        p_after = logit2prob(logits[0].cpu().data.numpy())
        # Negated so a larger value means a bigger confidence drop.
        pred_change[k] = -(p_after - p_prior)[pred_label[0]]
    return pred_change
def vanilla_gradient(model, x, pred_label, step_size=0.02):
    """Single-step gradient saliency for the recurrent model.

    Backpropagates the logit of the predicted class to the input
    embedding, derives a normalized per-word importance score, and takes
    one perturbation step of size ``step_size`` along the unit gradient.

    Returns ``(gradient, importance_score, x_after, changes_pred)``.
    NOTE: the returned gradient has been scaled in place to unit L2
    length (unlike the BERT variant, which returns the raw gradient).
    """
    model.batch_size = 1
    model.hidden = model.init_hidden()
    x = x.cpu()
    x.requires_grad = True
    pred, _ = model(x)
    x_prior = x.data.numpy()
    p_prior = logit2prob(pred[0].data.numpy())
    # One-hot mask selecting the predicted class's logit.
    sel = np.zeros((1, 2), dtype=np.float32)
    sel[0][pred_label[0]] = 1
    sel = torch.from_numpy(sel)
    sel.requires_grad = True
    objective = torch.sum(sel * pred[0])
    gradient = grad(objective, x)[0].numpy()
    importance_score = normalize_score(np.sum(gradient[:, 0, :]**2,
                                              axis=1)) * step_size
    # normalize to unit length
    gradient /= np.sqrt(np.sum(gradient[:, 0, :]**2))
    x_after = perturb_embedding(np.copy(x_prior), gradient * step_size)
    x_after = torch.from_numpy(x_after)
    model.hidden = model.init_hidden()
    pred, _ = model(x_after)
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return gradient, importance_score, x_after, changes_pred
def gradient_times_input(model, row, pred_label, DEVICE, step_size=0.02):
    """Gradient-times-input attribution for the BERT model.

    Multiplies the vanilla gradient by the input embedding, scales the
    gradient by the per-token sum of that product, and perturbs the
    embedding one step along the unit-normalized result.

    Returns ``(grad_times_input, importance_score, x_after,
    changes_pred)``.
    """
    gradient, _, _, _ = vanilla_gradient(model, row, pred_label, DEVICE,
                                         step_size=step_size)
    x0, segments_ids, input_masks = row
    grad_times_input = np.multiply(gradient, x0.detach().cpu().data.numpy())
    scale = np.sum(grad_times_input, axis=-1, keepdims=True)
    intp = np.multiply(gradient, scale)
    importance_score = normalize_score(np.sum(intp[0, :, :]**2,
                                              axis=1)) * step_size
    logits = model(inputs_embeds=x0,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_prior = logit2prob(logits[0].cpu().data.numpy())
    # normalize to unit length
    intp /= np.sqrt(np.sum(intp[0, :, :]**2))
    x_after = perturb_embedding(np.copy(x0.cpu().data.numpy()),
                                intp * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)
    logits = model(inputs_embeds=x_after,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_after = logit2prob(logits[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return grad_times_input, importance_score, x_after, changes_pred
def vanilla_gradient(model, row, pred_label, DEVICE, step_size=0.02):
    """Single-step gradient saliency for the BERT model.

    Backpropagates the logit of the predicted class to the input
    embeddings, derives a normalized per-token importance score, and
    takes one perturbation step of size ``step_size`` along the
    unit-length gradient direction.

    Returns ``(gradient, importance_score, x_after, changes_pred)``;
    the returned gradient is the raw (un-normalized) one.
    """
    x, segments_ids, input_masks = row
    x.requires_grad = True
    logits = model(inputs_embeds=x,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    x_prior = x.cpu().data.numpy()
    p_prior = logit2prob(logits[0].cpu().data.numpy())
    # One-hot mask selecting the predicted class's logit.
    sel = np.zeros((1, 2), dtype=np.float32)
    sel[0][pred_label[0]] = 1
    sel = torch.from_numpy(sel).to(DEVICE)
    sel.requires_grad = True
    objective = torch.sum(sel * logits[0])
    gradient = grad(objective, x)[0].cpu().numpy()
    importance_score = normalize_score(np.sum(gradient[0, :, :]**2,
                                              axis=1)) * step_size
    # normalize to unit length (kept separate so the raw gradient is returned)
    gradient_unit = gradient / np.sqrt(np.sum(gradient[0, :, :]**2))
    x_after = perturb_embedding(np.copy(x_prior), gradient_unit * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)
    logits = model(inputs_embeds=x_after,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_after = logit2prob(logits[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return gradient, importance_score, x_after, changes_pred
def evaluate_word_removal(model, x0, pred_label, inner1, inner2, MAX_RM=4):
    """Measure the prediction drop when top-ranked words are removed (LSTM).

    For n = 1..MAX_RM, removes n words via ``remove_word`` and records the
    drop in the predicted class's probability.

    Returns a numpy array of length ``MAX_RM``.
    """
    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    pred_change = np.zeros(MAX_RM)
    for k in range(MAX_RM):
        x_after = remove_word(x0.cpu().data.numpy(), inner1, inner2, k + 1)
        x_after = torch.from_numpy(x_after)
        model.hidden = model.init_hidden()
        pred, _ = model(x_after.cpu())
        p_after = logit2prob(pred[0].data.numpy())
        # Negated so a larger value means a bigger confidence drop.
        pred_change[k] = -(p_after - p_prior)[pred_label[0]]
    return pred_change
def integrated_gradient(model, row, pred_label, DEVICE, step_size=0.02, n_iters=7):
    """Integrated-gradient attribution for the BERT model.

    Averages vanilla gradients along the straight path from the zero
    embedding to ``x`` (``n_iters`` interpolation points), multiplies by
    the input, and perturbs the embedding one step along the resulting
    unit-normalized direction.

    Returns ``(inte_grad, importance_score, x_after, changes_pred)``.
    """
    x, segments_ids, input_masks = row
    grads = []
    for step in range(1, n_iters + 1):
        x_interp = (float(step) / n_iters * x).detach()
        g, _, _, _ = vanilla_gradient(model,
                                      [x_interp, segments_ids, input_masks],
                                      pred_label, DEVICE)
        grads.append(g)
    avg_grad = sum(grads) / n_iters
    inte_grad = np.multiply(avg_grad, x.detach().cpu().data.numpy())
    scale = np.sum(inte_grad, axis=-1, keepdims=True)
    intp = np.multiply(avg_grad, scale)
    importance_score = normalize_score(np.sum(intp[0, :, :]**2,
                                              axis=1)) * step_size
    logits = model(inputs_embeds=x,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_prior = logit2prob(logits[0].cpu().data.numpy())
    # normalize to unit length
    intp /= np.sqrt(np.sum(intp[0, :, :]**2))
    x_after = perturb_embedding(np.copy(x.cpu().data.numpy()),
                                intp * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)
    logits = model(inputs_embeds=x_after,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_after = logit2prob(logits[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return inte_grad, importance_score, x_after, changes_pred
def evaluate_word_removal(model, row, inner1, inner2, DEVICE, MAX_RM=4):
    """Measure the prediction change when top-ranked words are masked (BERT).

    For n = 1..MAX_RM, removes n words from the attention mask via
    ``remove_word_bert`` and records the absolute change of the first
    class probability.

    Returns a numpy array of length ``MAX_RM``.
    """
    x0, segments_ids, input_masks = row
    logits = model(inputs_embeds=x0,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_prior = logit2prob(logits[0].cpu().data.numpy())
    pred_change = np.zeros(MAX_RM)
    for k in range(MAX_RM):
        masks_after = remove_word_bert(input_masks.cpu(), inner1, inner2,
                                       k + 1)
        logits = model(inputs_embeds=x0,
                       token_type_ids=segments_ids,
                       attention_mask=masks_after.to(DEVICE),
                       labels=None)[0]
        p_after = logit2prob(logits[0].cpu().data.numpy())
        pred_change[k] = np.abs((p_after - p_prior)[0])
    return pred_change
def gradient_times_input(model, x, pred_label, step_size=0.02):
    """Gradient-times-input attribution for the recurrent model.

    Multiplies the vanilla gradient by the input embedding, scales the
    gradient by the per-word sum of that product, and perturbs the
    embedding one step along the unit-normalized result.

    Returns ``(intp, importance_score, x_after, changes_pred)`` where
    ``intp`` is the normalized, step-scaled interpretation direction.
    """
    gradient, _, _, _ = vanilla_gradient(model, x.detach(), pred_label,
                                         step_size=step_size)
    weighted = np.multiply(gradient, x.detach().cpu().data.numpy())
    scale = np.sum(weighted, axis=-1, keepdims=True)
    intp = np.multiply(gradient, scale)
    importance_score = normalize_score(np.sum(intp[:, 0, :]**2,
                                              axis=1)) * step_size
    # Probabilities before the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x.cpu())
    p_prior = logit2prob(pred[0].data.numpy())
    # normalize to unit length
    intp /= np.sqrt(np.sum(intp[:, 0, :]**2))
    x_after = perturb_embedding(np.copy(x.cpu().data.numpy()),
                                intp * step_size)
    x_after = torch.from_numpy(x_after)
    # Probabilities after the perturbation.
    model.hidden = model.init_hidden()
    pred, _ = model(x_after.cpu())
    p_after = logit2prob(pred[0].data.numpy())
    changes_pred = p_after - p_prior
    return intp, importance_score, x_after, changes_pred
def smooth_gradient(model, row, pred_label, DEVICE, step_size, n_iters=20):
    """SmoothGrad attribution for the BERT model.

    Averages vanilla gradients over ``n_iters`` noisy copies of ``x0``
    (noise L2-normalized to length ``0.4 * step_size``), then perturbs
    the embedding one step along the averaged, unit-normalized gradient.

    Returns ``(smooth_grad, importance_score, x_after, changes_pred)``.
    """
    x0, segments_ids, input_masks = row
    noise_range = 0.4 * step_size
    total = None
    for _ in range(n_iters):
        noise = torch.randn(x0.shape)
        # normalize noise to unit length, then scale to noise_range
        noise = noise / torch.sqrt(torch.sum(noise[0, :, :]**2)) * noise_range
        g, _, _, _ = vanilla_gradient(
            model, [x0 + noise.to(DEVICE), segments_ids, input_masks],
            pred_label, DEVICE)
        total = g if total is None else total + g
    smooth_grad = total / n_iters
    importance_score = normalize_score(np.sum(smooth_grad[0, :, :]**2,
                                              axis=1)) * step_size
    logits = model(inputs_embeds=x0,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_prior = logit2prob(logits[0].cpu().data.numpy())
    # normalize to unit length
    smooth_grad /= np.sqrt(np.sum(smooth_grad[0, :, :]**2))
    x_after = perturb_embedding(np.copy(x0.cpu().data.numpy()),
                                smooth_grad * step_size)
    x_after = torch.from_numpy(x_after).to(DEVICE)
    logits = model(inputs_embeds=x_after,
                   token_type_ids=segments_ids,
                   attention_mask=input_masks,
                   labels=None)[0]
    p_after = logit2prob(logits[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return smooth_grad, importance_score, x_after, changes_pred
def iterative_gradient(model, row, pred_label, DEVICE, step_size, epsilon, max_iters=40):
    """Iteratively perturb the BERT input embedding along the gradient.

    Repeats single-step gradient perturbations (via ``vanilla_gradient``)
    until the accumulated perturbation leaves the L2 ball of radius
    ``epsilon`` around ``x0`` or the step budget is exhausted.

    Args:
        model: BERT-style classifier accepting ``inputs_embeds``.
        row: tuple ``(x0, segments_ids, input_masks)``; per-token scores
            are taken over axis 1 with ``[0, :, :]`` indexing (assumes
            shape (1, seq, dim) — TODO confirm).
        pred_label: sequence whose first element is the predicted class index.
        DEVICE: torch device the perturbed tensors are moved to.
        step_size: per-iteration perturbation magnitude.
        epsilon: L2 budget for the total perturbation.
        max_iters: maximum number of perturbation steps.

    Returns:
        Tuple ``(x_delta, importance_score, x_after, changes_pred)``:
        total perturbation as a numpy array, normalized per-token L2 of
        that perturbation, the perturbed tensor, and the change in class
        probabilities.
    """
    x0, segments_ids, input_masks = row
    x0_np = x0.cpu().numpy()
    x_after_np = np.copy(x0_np)
    # iterative perturbation
    x_after = x0.detach()
    cnt = 0
    # `cnt < max_iters` (not `<=`): the original bound performed one extra
    # step beyond the stated maximum (off-by-one).
    while np.linalg.norm(x_after_np - x0_np) <= epsilon and cnt < max_iters:
        _, _, x_after, _ = vanilla_gradient(
            model, [x_after, segments_ids, input_masks], pred_label, DEVICE,
            step_size)
        x_after = x_after.clone().detach()
        x_after_np = x_after.cpu().numpy()
        cnt += 1
    x_delta = x_after - x0
    grad_l2 = np.sum(x_delta.cpu().numpy()[0, :, :]**2, axis=1)
    importance_score = normalize_score(grad_l2)
    # Class probabilities before the perturbation.
    pred = model(inputs_embeds=x0,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_prior = logit2prob(pred[0].cpu().data.numpy())
    # Class probabilities after the perturbation.
    pred = model(inputs_embeds=x_after,
                 token_type_ids=segments_ids,
                 attention_mask=input_masks,
                 labels=None)[0]
    p_after = logit2prob(pred[0].cpu().data.numpy())
    changes_pred = p_after - p_prior
    return x_delta.cpu().numpy(), importance_score, x_after, changes_pred
def evaluate_attention_removal(model, x0, pred_label, inner1, inner2, MAX_RM=4):
    """Measure the prediction drop when top-attended words are masked (LSTM).

    For n = 1..MAX_RM, masks n positions via ``remasking`` and records
    the drop in the predicted class's probability.

    Returns a numpy array of length ``MAX_RM``.
    """
    # for attention methods, inner2 should be an all'1 vector
    mask = torch.ones([x0.shape[0], 1, 1])
    mask.requires_grad = False
    model.hidden = model.init_hidden()
    pred, _ = model(x0.cpu(), mask)
    p_prior = logit2prob(pred[0].data.numpy())
    pred_change = np.zeros(MAX_RM)
    for k in range(MAX_RM):
        mask_new = remasking(mask, inner1, inner2, k + 1)
        model.hidden = model.init_hidden()
        pred, _ = model(x0.cpu(), mask_new)
        p_after = logit2prob(pred[0].data.numpy())
        # Negated so a larger value means a bigger confidence drop.
        pred_change[k] = -(p_after - p_prior)[pred_label[0]]
    return pred_change