def kkt_attack(two_class_kkt, target_grad, target_theta, total_epsilon, epsilon_pos, epsilon_neg, X_train, Y_train, class_map, centroids, centroid_vec, sphere_radii, slab_radii, target_bias, target_bias_grad, max_losses, sv_centroids=None, sv_sphere_radii=None):
    """Run the two-class KKT attack and append the resulting poison points.

    Delegates the optimization to ``two_class_kkt.solve`` to obtain one
    positive and one negative attack point, reports how closely their
    weighted contribution matches ``target_grad``, and appends the
    appropriate number of copies of each to the training set.

    Returns ``(X_modified, Y_modified, obj, x_pos, x_neg, num_pos, num_neg)``.
    """
    # The solver may rescale the per-class epsilons, so take them back
    # from its return value rather than reusing the inputs.
    x_pos, x_neg, epsilon_pos, epsilon_neg = two_class_kkt.solve(
        target_grad,
        target_theta,
        epsilon_pos,
        epsilon_neg,
        class_map,
        centroids,
        centroid_vec,
        sphere_radii,
        slab_radii,
        target_bias=target_bias,
        target_bias_grad=target_bias_grad,
        max_losses=max_losses,
        verbose=False)

    # Residual between the target gradient and the poison points'
    # (class-signed) contribution; its norm is the attack objective.
    residual = (target_grad
                - epsilon_pos * x_pos.reshape(-1)
                + epsilon_neg * x_neg.reshape(-1))
    obj = np.linalg.norm(residual)
    print("** Actual objective value: %.4f" % obj)

    num_train = X_train.shape[0]
    # Split the total poison budget between the two classes.
    n_total = int(np.round(total_epsilon * num_train))
    num_pos = int(np.round(epsilon_pos * num_train))
    num_neg = n_total - num_pos
    assert num_neg >= 0

    X_modified, Y_modified = data.add_points(
        x_pos, 1, X_train, Y_train, num_copies=num_pos)
    X_modified, Y_modified = data.add_points(
        x_neg, -1, X_modified, Y_modified, num_copies=num_neg)
    return X_modified, Y_modified, obj, x_pos, x_neg, num_pos, num_neg
def init_gradient_attack_from_mask(X_train, Y_train, epsilon, feasible_flipped_mask, general_train_idx, sensitive_file, attack_method, use_copy=True):
    """Build the initial poisoned training set for the gradient attack,
    using sensitive-group labels to pick the seed points.

    Loads per-example group labels from ``sensitive_file`` under ``./data``
    and determines the advantaged group by comparing each group's
    positive-label rate on the training split.  Then either

    * ``use_copy=False``: appends ``round(epsilon * n)`` label-flipped
      copies of randomly chosen feasible points, all marked poisonable; or
    * ``use_copy=True``: picks one feasible positive and one feasible
      negative seed point from opposite groups (direction chosen by which
      group is advantaged) and appends inverse-class-balanced numbers of
      copies; only the two seed rows are marked poisonable.

    Returns ``(X_modified, Y_modified, indices_to_poison, copy_array,
    advantaged)`` where ``copy_array`` is ``[num_pos_copies,
    num_neg_copies]`` (``None`` when ``use_copy=False``) and ``advantaged``
    is +1 or -1.
    """
    # NOTE(review): `attack_method` is accepted but unused here; kept for
    # interface compatibility with callers.
    DATA_FOLDER = './data'
    dataset_path = os.path.join(DATA_FOLDER)
    f = np.load(os.path.join(dataset_path, sensitive_file))
    group_label = f['group_label']
    advantaged = 1

    # Group 0 is treated as "male", group 1 as "female" throughout.
    male_train_index = np.where(
        group_label[0:general_train_idx] == 0)[0].astype(np.int32)
    female_train_index = np.where(
        group_label[0:general_train_idx] == 1)[0].astype(np.int32)

    # Training points with positive label, split by group.
    index_male_true_train = np.where(
        np.logical_and(group_label[0:general_train_idx] == 0,
                       Y_train == 1))[0].astype(np.int32)
    index_female_true_train = np.where(
        np.logical_and(group_label[0:general_train_idx] == 1,
                       Y_train == 1))[0].astype(np.int32)

    # Empirical P(y = 1 | group) on the training split; decides which group
    # is advantaged.  (Equivalent to the original arr[idx].shape[0] form.)
    train_data_one_female_prob = (
        index_female_true_train.shape[0] / female_train_index.shape[0])
    train_data_one_male_prob = (
        index_male_true_train.shape[0] / male_train_index.shape[0])

    # +1 for male, -1 for female, 0 for neither.  Vectorized: the original
    # looped over every index and used `in` on a NumPy array, which made
    # this O(n^2); fancy-index assignment is O(n) and, since the two group
    # index sets are disjoint (group == 0 vs group == 1), gives identical
    # results.
    gender_labels = np.zeros(general_train_idx)
    gender_labels[male_train_index] = 1
    gender_labels[female_train_index] = -1

    if not use_copy:
        num_copies = int(np.round(epsilon * X_train.shape[0]))
        idx_to_copy = np.random.choice(
            np.where(feasible_flipped_mask)[0],
            size=num_copies,
            replace=True)
        X_modified = data.vstack(X_train, X_train[idx_to_copy, :])
        Y_modified = np.append(Y_train, -Y_train[idx_to_copy])
        copy_array = None
        indices_to_poison = np.arange(X_train.shape[0], X_modified.shape[0])
    else:
        num_copies = int(np.round(epsilon * X_train.shape[0]))
        # Choose this in inverse class balance
        num_pos_copies = int(np.round(np.mean(Y_train == -1) * num_copies))
        num_neg_copies = num_copies - num_pos_copies
        np.random.seed(0)
        if (train_data_one_female_prob > train_data_one_male_prob):
            # Females are advantaged: seed with a positive female point and
            # a negative male point.
            advantaged = -1
            pos_idx_to_copy = np.random.choice(
                np.where(feasible_flipped_mask & (Y_train == 1)
                         & (gender_labels == -1))[0])
            neg_idx_to_copy = np.random.choice(
                np.where(feasible_flipped_mask & (Y_train == -1)
                         & (gender_labels == 1))[0])
        else:
            advantaged = 1
            pos_idx_to_copy = np.random.choice(
                np.where(feasible_flipped_mask & (Y_train == 1)
                         & (gender_labels == 1))[0])
            neg_idx_to_copy = np.random.choice(
                np.where(feasible_flipped_mask & (Y_train == -1)
                         & (gender_labels == -1))[0])

        # Debug output: report which group each seed point belongs to.
        if (neg_idx_to_copy in female_train_index):
            print("female")
        else:
            print("male")
        if (pos_idx_to_copy in female_train_index):
            print("female")
        else:
            print("male")
        print(neg_idx_to_copy)
        print(pos_idx_to_copy)

        # One copy of each seed is added first so the two poison prototypes
        # sit at fixed rows n and n + 1; the remaining copies follow.
        num_pos_copies -= 1
        num_neg_copies -= 1
        X_modified, Y_modified = data.add_points(
            X_train[pos_idx_to_copy, :], 1, X_train, Y_train, num_copies=1)
        X_modified, Y_modified = data.add_points(
            X_train[neg_idx_to_copy, :], -1, X_modified, Y_modified,
            num_copies=1)
        X_modified, Y_modified = data.add_points(
            X_train[pos_idx_to_copy, :], 1, X_modified, Y_modified,
            num_copies=num_pos_copies)
        X_modified, Y_modified = data.add_points(
            X_train[neg_idx_to_copy, :], -1, X_modified, Y_modified,
            num_copies=num_neg_copies)
        copy_array = [num_pos_copies, num_neg_copies]
        indices_to_poison = np.arange(X_train.shape[0], X_train.shape[0] + 2)
    return X_modified, Y_modified, indices_to_poison, copy_array, advantaged
# NOTE(review): fragment — the enclosing function's header (and the call whose
# argument list ends at "X_flipped, Y_flipped)") is not visible in this chunk,
# so the code is left byte-identical.  It appears to select the flipped point
# chosen by q_finder, append num_points_to_add copies of it to the training
# set, and save the poison slice (rows n onward) to an .npz file — TODO
# confirm against the full function upstream.
X_flipped, Y_flipped) loss_diffs = poisoned_losses - orig_losses q = q_finder.solve(loss_diffs, verbose=True) print("At iteration %s, q is:" % iter_idx) print(q) if np.all(old_q == q): print('Done, terminating') break q_idx = np.where(q)[0][0] assert q[q_idx] == num_points_to_add if sparse.issparse(X_flipped): x = X_flipped[q_idx, :].toarray() else: x = X_flipped[q_idx, :] X_modified, Y_modified = data.add_points(x, Y_flipped[q_idx], X_train, Y_train, num_copies=num_points_to_add) attack_save_path = datasets.get_target_attack_npz_path( dataset_name, epsilon, weight_decay, percentile, attack_label) if sparse.issparse(X_modified): X_poison = X_modified[n:, :].asfptype() else: X_poison = X_modified[n:, :] np.savez(attack_save_path, X_poison=X_poison, Y_poison=Y_modified[n:])
def init_gradient_attack_from_mask(
        X_train, Y_train, epsilon, feasible_flipped_mask, use_copy=True):
    """Create the starting point for the gradient poisoning attack.

    With ``use_copy=False``, appends ``round(epsilon * n)`` label-flipped
    copies of feasible points chosen uniformly at random and marks every
    appended row as poisonable.  With ``use_copy=True``, seeds the attack
    with one label-flipped positive and one label-flipped negative
    prototype (class balance inverted relative to the clean data) plus the
    remaining copies of each; only the two prototype rows are poisonable.

    Returns ``(X_modified, Y_modified, indices_to_poison, copy_array)``
    where ``copy_array`` is ``[num_pos_copies, num_neg_copies]`` or ``None``.
    """
    n_train = X_train.shape[0]
    n_copies = int(np.round(epsilon * n_train))

    if use_copy:
        # Inverse class balance: the clean fraction of negatives becomes
        # the fraction of poisoned positives.
        n_pos = int(np.round(np.mean(Y_train == -1) * n_copies))
        n_neg = n_copies - n_pos

        np.random.seed(0)
        # Seeds are label-flipped: a clean negative becomes the positive
        # prototype, and vice versa.
        pos_seed = np.random.choice(
            np.where(feasible_flipped_mask & (Y_train == -1))[0])
        neg_seed = np.random.choice(
            np.where(feasible_flipped_mask & (Y_train == 1))[0])

        # One copy of each prototype goes in first so they occupy rows
        # n_train and n_train + 1; the remaining copies follow.
        n_pos -= 1
        n_neg -= 1
        X_modified, Y_modified = data.add_points(
            X_train[pos_seed, :], 1, X_train, Y_train, num_copies=1)
        X_modified, Y_modified = data.add_points(
            X_train[neg_seed, :], -1, X_modified, Y_modified, num_copies=1)
        X_modified, Y_modified = data.add_points(
            X_train[pos_seed, :], 1, X_modified, Y_modified, num_copies=n_pos)
        X_modified, Y_modified = data.add_points(
            X_train[neg_seed, :], -1, X_modified, Y_modified, num_copies=n_neg)
        copy_array = [n_pos, n_neg]
        indices_to_poison = np.arange(n_train, n_train + 2)
    else:
        flip_idx = np.random.choice(
            np.where(feasible_flipped_mask)[0], size=n_copies, replace=True)
        X_modified = data.vstack(X_train, X_train[flip_idx, :])
        Y_modified = np.append(Y_train, -Y_train[flip_idx])
        copy_array = None
        indices_to_poison = np.arange(n_train, X_modified.shape[0])

    return X_modified, Y_modified, indices_to_poison, copy_array
def kkt_for_lr(d, args, target_grad, theta_p, bias_p, total_eps, eps_pos, eps_neg, X_train, Y_train, x_pos_tuple=None, x_neg_tuple=None, lr=1e-5, num_steps=3000, trials=10, optimizer='adam'):
    """KKT attack against logistic regression via first-order optimization.

    Searches for one positive and one negative attack point (``x_pos``,
    ``x_neg``) in dimension ``d`` that minimize the squared norm of the KKT
    stationarity residual
    ``target_grad + eps_pos*(1 - sigmoid(s_pos))*x_pos
    + eps_neg*(-sigmoid(s_neg))*x_neg``
    (negative class encoded as 0, not -1).  Runs ``trials`` random restarts
    of ``num_steps`` update steps using plain GD, Adagrad, or Adam, then
    appends ``num_pos`` / ``num_neg`` copies of the best points found to
    the training set.

    NOTE(review): the update treats the sigmoid factors as constants when
    forming the gradient (see "constant values" comment below) and the
    post-step objective reuses the pre-step predictions, so the reported
    objective is an approximation — presumably intentional; confirm.

    Assumes ``x_pos_tuple`` / ``x_neg_tuple`` are (min, max) bound pairs —
    per-coordinate arrays for the 'dogfish' dataset, scalars otherwise.

    Returns ``(X_modified, Y_modified, best_obj, best_x_pos, best_x_neg,
    num_pos, num_neg)``.
    """
    # we did not implement defenses for KKT for logistic regression
    x_min_pos, x_max_pos = x_pos_tuple
    x_min_neg, x_max_neg = x_neg_tuple
    best_obj = 1e10  # best (lowest) objective across all trials
    for trial in range(trials):
        # print("------ trial {}------".format(trial))
        # optimization variables: random restart within the feasible box
        if args.dataset == 'dogfish':
            # per-coordinate bounds
            x_pos = np.array([
                upper_bounds.random_sample(x_min_pos[i], x_max_pos[i])
                for i in range(len(x_min_pos))
            ])
            x_neg = np.array([
                upper_bounds.random_sample(x_min_neg[i], x_max_neg[i])
                for i in range(len(x_min_neg))
            ])
        else:
            # scalar bounds shared by all d coordinates
            x_pos = np.array([
                upper_bounds.random_sample(x_min_pos, x_max_pos)
                for i in range(d)
            ])
            x_neg = np.array([
                upper_bounds.random_sample(x_min_neg, x_max_neg)
                for i in range(d)
            ])
        # per-trial optimizer state
        if optimizer == 'adagrad':
            # store the square of gradients
            grads_squared_pos = np.zeros(d)
            grads_squared_neg = np.zeros(d)
            initial_accumulator_value = 0.001
            grads_squared_pos.fill(initial_accumulator_value)
            grads_squared_neg.fill(initial_accumulator_value)
            epsilon = 1e-7  # numerical-stability term in the denominator
        elif optimizer == 'adam':
            grads_first_moment_pos = np.zeros(d)
            grads_second_moment_pos = np.zeros(d)
            grads_first_moment_neg = np.zeros(d)
            grads_second_moment_neg = np.zeros(d)
            beta1 = 0.9
            beta2 = 0.999
            epsilon = 1e-8
        prev_obj = 1e10  # previous step's objective, for convergence check
        for step in range(num_steps):
            # logistic-regression scores of the two candidate points
            score_pos = np.dot(theta_p, x_pos) + bias_p
            score_neg = np.dot(theta_p, x_neg) + bias_p
            # sigmoid prediction confidence
            prediction_pos = upper_bounds.sigmoid(score_pos)
            prediction_neg = upper_bounds.sigmoid(score_neg)
            # output_error_signal_pos = 1 - prediction_pos # this is also the gradient of b for positive x part
            # output_error_signal_neg = -1 - prediction_neg # this is also the gradient of b for negative x part
            # the objective value of KKT attack is the norm of following vector
            kkt_obj_grad = target_grad + eps_pos * (
                1 - prediction_pos) * x_pos + eps_neg * (
                    -prediction_neg
                ) * x_neg  # note that, we use negative label as 0, not -1
            kkt_obj = np.linalg.norm(kkt_obj_grad)**2
            if step == 0:
                print("(random) initial obj value:", kkt_obj)
            # constant values for x_pos and x_neg
            # (the sigmoid factors are NOT differentiated through)
            grad_pos = 2 * eps_pos * (1 - prediction_pos) * kkt_obj_grad
            grad_neg = 2 * eps_neg * (
                -prediction_neg
            ) * kkt_obj_grad  # note that, we use negative label as 0, not -1
            if optimizer == 'gd':
                # plain gradient descent
                x_pos -= lr * grad_pos
                x_neg -= lr * grad_neg
            elif optimizer == 'adagrad':
                """Weights update using adagrad.

                grads2 = grads2 + grads**2
                w' = w - lr * grads / (sqrt(grads2) + epsilon)
                """
                # update x_pos
                grads_squared_pos = grads_squared_pos + grad_pos**2
                x_pos = x_pos - lr * grad_pos / (np.sqrt(grads_squared_pos) +
                                                 epsilon)
                # update x_neg
                grads_squared_neg = grads_squared_neg + grad_neg**2
                x_neg = x_neg - lr * grad_neg / (np.sqrt(grads_squared_neg) +
                                                 epsilon)
            elif optimizer == 'adam':
                """Weights update using Adam.

                g1 = beta1 * g1 + (1 - beta1) * grads
                g2 = beta2 * g2 + (1 - beta2) * g2
                g1_unbiased = g1 / (1 - beta1**time)
                g2_unbiased = g2 / (1 - beta2**time)
                w = w - lr * g1_unbiased / (sqrt(g2_unbiased) + epsilon)
                """
                time = step + 1  # 1-based step count for bias correction
                # update x_pos
                grads_first_moment_pos = beta1 * grads_first_moment_pos + \
                    (1. - beta1) * grad_pos
                grads_second_moment_pos = beta2 * grads_second_moment_pos + \
                    (1. - beta2) * grad_pos**2
                grads_first_moment_unbiased_pos = grads_first_moment_pos / (
                    1. - beta1**time)
                grads_second_moment_unbiased_pos = grads_second_moment_pos / (
                    1. - beta2**time)
                x_pos = x_pos - lr * grads_first_moment_unbiased_pos / (
                    np.sqrt(grads_second_moment_unbiased_pos) + epsilon)
                # update x_neg
                grads_first_moment_neg = beta1 * grads_first_moment_neg + \
                    (1. - beta1) * grad_neg
                grads_second_moment_neg = beta2 * grads_second_moment_neg + \
                    (1. - beta2) * grad_neg**2
                grads_first_moment_unbiased_neg = grads_first_moment_neg / (
                    1. - beta1**time)
                grads_second_moment_unbiased_neg = grads_second_moment_neg / (
                    1. - beta2**time)
                x_neg = x_neg - lr * grads_first_moment_unbiased_neg / (
                    np.sqrt(grads_second_moment_unbiased_neg) + epsilon)
            # print(y_tmp,output_error_signal_c, output_error_signal_p)
            # projection step to ensure it is within bounded norm
            x_pos = np.clip(x_pos, x_min_pos, x_max_pos)
            x_neg = np.clip(x_neg, x_min_neg, x_max_neg)
            # print("added: min max",np.amin(lr * (gradient_c - gradient_p)),np.amax(lr * (gradient_c - gradient_p)))
            # print("before: min max",np.amin(x),np.amax(x))
            # objective function value found so far (minimization)
            # NOTE(review): reuses the pre-update predictions (see docstring)
            kkt_obj_grad = target_grad + eps_pos * (
                1 - prediction_pos) * x_pos + eps_neg * (
                    -prediction_neg
                ) * x_neg  # again, negative label is 0, not -1
            kkt_obj = np.linalg.norm(kkt_obj_grad)**2
            # track the best pair of attack points seen so far
            if best_obj > kkt_obj:
                best_obj = kkt_obj
                best_x_pos = x_pos
                best_x_neg = x_neg
            # stop this trial once the objective stops changing
            if np.abs(prev_obj - kkt_obj) < 1e-7:
                print("Enough convergence")
                print(
                    "steps: {} current norm (objective): {:.4f} minimum norm: {:.4f}"
                    .format(step + 1, kkt_obj, best_obj))
                break
            prev_obj = kkt_obj
            # # Print log-likelihood every so often
            # if (step+1) % 2000 == 0:
            #     print("current obj:",kkt_obj)
    print("** Actual objective value: %.4f" % best_obj)
    # num_train = X_train.shape[0]
    # Split the total poison budget between the two classes.
    total_points_to_add = int(np.round(total_eps * X_train.shape[0]))
    num_pos = int(np.round(eps_pos * X_train.shape[0]))
    num_neg = total_points_to_add - num_pos
    assert num_neg >= 0
    X_modified, Y_modified = data.add_points(best_x_pos, 1, X_train, Y_train,
                                             num_copies=num_pos)
    X_modified, Y_modified = data.add_points(best_x_neg, -1, X_modified,
                                             Y_modified, num_copies=num_neg)
    return X_modified, Y_modified, best_obj, best_x_pos, best_x_neg, num_pos, num_neg