def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
                 data_ind):
    """Computes the (data-dependent) RDP curve for Confident GNMax.

    For every query, the RDP cost of the thresholding step (step 1) plus the
    cost of the noisy-max step (step 2), weighted by the probability that the
    query is answered, is added to a running total. A progress line is printed
    after every 1000 queries and after the last query.

    Args:
      votes: Per-query vote vectors; iterated row by row, and `votes.shape[0]`
        is read to detect the last query.
      baseline: Indexed as `baseline[i, ]` and subtracted from the i-th vote
        vector before the threshold check. Only read when `threshold` is not
        None.
      threshold: Threshold passed to the step-1 helpers, or None to skip
        step 1 (every query is then answered at zero step-1 cost).
      sigma1: Noise parameter passed to the step-1 helpers.
      sigma2: Noise parameter passed to the step-2 helpers.
      delta: Target delta used when converting RDP to epsilon.
      orders: RDP orders; every RDP array here has `len(orders)` entries.
      data_ind: If true, use the data-independent helpers for both steps.

    Returns:
      The order reported as optimal by `pate.compute_eps_from_delta` for the
      RDP curve accumulated over all queries.
    """
    num_orders = len(orders)
    rdp_cum = np.zeros(num_orders)
    rdp_sqrd_cum = np.zeros(num_orders)
    answered = 0

    for i, v in enumerate(votes):
        if threshold is None:
            logq_step1 = 0  # No thresholding, always proceed to step 2.
            rdp_step1 = np.zeros(num_orders)
        else:
            logq_step1 = pate.compute_logpr_answered(
                threshold, sigma1, v - baseline[i, ])
            if data_ind:
                rdp_step1 = pate.compute_rdp_data_independent_threshold(
                    sigma1, orders)
            else:
                rdp_step1 = pate.compute_rdp_threshold(
                    logq_step1, sigma1, orders)

        if data_ind:
            rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
        else:
            logq_step2 = pate.compute_logq_gaussian(v, sigma2)
            rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)

        q_step1 = np.exp(logq_step1)
        rdp = rdp_step1 + rdp_step2 * q_step1
        # The expression below evaluates
        #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
        rdp_sqrd = (rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
                    q_step1 * rdp_step2**2)
        rdp_sqrd_cum += rdp_sqrd
        rdp_cum += rdp
        answered += q_step1

        num_done = i + 1  # Number of queries accumulated so far.
        if (num_done % 1000 == 0) or (i == votes.shape[0] - 1):
            # Per-query variance E[X^2] - E[X]^2 over the `num_done` queries
            # seen so far (no Bessel's correction). Dividing by `i` instead
            # would be 0/0 for a single query and goes negative when the
            # per-query cost is constant. The clamp absorbs round-off.
            rdp_var = np.maximum(
                rdp_sqrd_cum / num_done - (rdp_cum / num_done)**2, 0.)
            eps_total, order_opt = pate.compute_eps_from_delta(
                orders, rdp_cum, delta)
            order_opt_idx = np.searchsorted(orders, order_opt)
            # Std of the sum.
            eps_std = (num_done * rdp_var[order_opt_idx])**.5
            print(
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
                'at order = {:.2f} (contribution from delta = {:.3f})'.format(
                    num_done, answered, eps_total, eps_std, order_opt,
                    -math.log(delta) / (order_opt - 1)))
            sys.stdout.flush()

    _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
    return order_opt
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
                 data_ind):
    """Computes the (data-dependent) RDP curve for Confident GNMax.

    Each query contributes the RDP cost of the thresholding step (step 1)
    plus the cost of the noisy-max step (step 2) scaled by the probability
    that the query is answered. Progress is printed after every 1000 queries
    and after the final query.

    Args:
      votes: Per-query vote vectors; iterated row by row, and `votes.shape[0]`
        is read to detect the last query.
      baseline: Indexed as `baseline[i, ]` and subtracted from the i-th vote
        vector before the threshold check. Only read when `threshold` is not
        None.
      threshold: Threshold passed to the step-1 helpers, or None to skip
        step 1 (every query is then answered at zero step-1 cost).
      sigma1: Noise parameter passed to the step-1 helpers.
      sigma2: Noise parameter passed to the step-2 helpers.
      delta: Target delta used when converting RDP to epsilon.
      orders: RDP orders; every RDP array here has `len(orders)` entries.
      data_ind: If true, use the data-independent helpers for both steps.

    Returns:
      The order reported as optimal by `pate.compute_eps_from_delta` for the
      RDP curve accumulated over all queries.
    """
    num_orders = len(orders)
    rdp_cum = np.zeros(num_orders)
    rdp_sqrd_cum = np.zeros(num_orders)
    answered = 0

    for i, v in enumerate(votes):
        if threshold is None:
            logq_step1 = 0  # No thresholding, always proceed to step 2.
            rdp_step1 = np.zeros(num_orders)
        else:
            logq_step1 = pate.compute_logpr_answered(
                threshold, sigma1, v - baseline[i, ])
            if data_ind:
                rdp_step1 = pate.compute_rdp_data_independent_threshold(
                    sigma1, orders)
            else:
                rdp_step1 = pate.compute_rdp_threshold(
                    logq_step1, sigma1, orders)

        if data_ind:
            rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
        else:
            logq_step2 = pate.compute_logq_gaussian(v, sigma2)
            rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)

        q_step1 = np.exp(logq_step1)
        rdp = rdp_step1 + rdp_step2 * q_step1
        # The expression below evaluates
        #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
        rdp_sqrd = (rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
                    q_step1 * rdp_step2**2)
        rdp_sqrd_cum += rdp_sqrd
        rdp_cum += rdp
        answered += q_step1

        num_done = i + 1  # Number of queries accumulated so far.
        if (num_done % 1000 == 0) or (i == votes.shape[0] - 1):
            # Per-query variance E[X^2] - E[X]^2 over the `num_done` queries
            # seen so far (no Bessel's correction). Using `i` as the count
            # divides by zero on a single query and yields a negative
            # variance for constant per-query cost; clamp guards round-off.
            rdp_var = np.maximum(
                rdp_sqrd_cum / num_done - (rdp_cum / num_done)**2, 0.)
            eps_total, order_opt = pate.compute_eps_from_delta(
                orders, rdp_cum, delta)
            order_opt_idx = np.searchsorted(orders, order_opt)
            # Std of the sum.
            eps_std = (num_done * rdp_var[order_opt_idx])**.5
            print(
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
                'at order = {:.2f} (contribution from delta = {:.3f})'.format(
                    num_done, answered, eps_total, eps_std, order_opt,
                    -math.log(delta) / (order_opt - 1)))
            sys.stdout.flush()

    _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
    return order_opt
def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta):
    """Tracks the cumulative epsilon of Confident GNMax, data-independent RDP.

    Args:
      votes: 2-D array indexed as `votes[i, ]`; one row per query.
      threshold: Threshold for the step-1 check. If this or `sigma1` is None,
        step 1 is skipped and every query counts as answered.
      sigma1: Noise parameter of step 1 (see `threshold`).
      sigma2: Noise parameter of step 2.
      delta: Target delta used when converting RDP to epsilon.

    Returns:
      A pair `(eps_cum, answered)` of length-n float arrays: the epsilon after
      each query, and the running sum of answer probabilities.
    """
    orders = np.logspace(np.log10(1.5), np.log10(500), num=100)
    n = votes.shape[0]

    rdp_total = np.zeros(len(orders))
    answered_total = 0
    answered = np.zeros(n)
    eps_cum = np.full(n, np.nan)  # NaN until the query has been processed.

    if n == 0:
        # Nothing to analyze; return before touching any pate helper.
        return eps_cum, answered

    use_threshold = threshold is not None and sigma1 is not None

    # The data-independent costs depend only on (sigma, orders), which do not
    # change inside the loop, so compute them once.
    # NOTE(review): assumes pate.rdp_data_independent_gaussian is a pure
    # function of its arguments - confirm.
    rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
    if use_threshold:
        rdp_step1 = pate.rdp_data_independent_gaussian(sigma1, orders)

    for i in range(n):
        if use_threshold:
            v = votes[i, ]
            q_step1 = np.exp(
                pate.compute_logpr_answered(threshold, sigma1, v))
            rdp_total += rdp_step1
        else:
            q_step1 = 1.  # always answer

        answered_total += q_step1
        answered[i] = answered_total

        # Step 2 is only paid when the query is answered.
        rdp_total += q_step1 * rdp_step2

        eps_cum[i], order_opt = pate.compute_eps_from_delta(
            orders, rdp_total, delta)

        if (i + 1) % 1000 == 0:
            print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} '
                  'at order = {:.2f}.'.format(i + 1, answered[i], eps_cum[i],
                                              order_opt))
            sys.stdout.flush()

    return eps_cum, answered
def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta):
    """Tracks the cumulative epsilon of Confident GNMax, data-independent RDP.

    Args:
      votes: 2-D array indexed as `votes[i, ]`; one row per query.
      threshold: Threshold for the step-1 check. If this or `sigma1` is None,
        step 1 is skipped and every query counts as answered.
      sigma1: Noise parameter of step 1 (see `threshold`).
      sigma2: Noise parameter of step 2.
      delta: Target delta used when converting RDP to epsilon.

    Returns:
      A pair `(eps_cum, answered)` of length-n float arrays: the epsilon after
      each query, and the running sum of answer probabilities.
    """
    orders = np.logspace(np.log10(1.5), np.log10(500), num=100)
    n = votes.shape[0]

    rdp_total = np.zeros(len(orders))
    answered_total = 0
    answered = np.zeros(n)
    eps_cum = np.full(n, np.nan)  # NaN until the query has been processed.

    if n == 0:
        # Nothing to analyze; return before touching any pate helper.
        return eps_cum, answered

    use_threshold = threshold is not None and sigma1 is not None

    # The data-independent costs depend only on (sigma, orders), which do not
    # change inside the loop, so compute them once.
    # NOTE(review): assumes pate.rdp_data_independent_gaussian is a pure
    # function of its arguments - confirm.
    rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
    if use_threshold:
        rdp_step1 = pate.rdp_data_independent_gaussian(sigma1, orders)

    for i in range(n):
        if use_threshold:
            v = votes[i, ]
            q_step1 = np.exp(
                pate.compute_logpr_answered(threshold, sigma1, v))
            rdp_total += rdp_step1
        else:
            q_step1 = 1.  # always answer

        answered_total += q_step1
        answered[i] = answered_total

        # Step 2 is only paid when the query is answered.
        rdp_total += q_step1 * rdp_step2

        eps_cum[i], order_opt = pate.compute_eps_from_delta(
            orders, rdp_total, delta)

        if (i + 1) % 1000 == 0:
            print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} '
                  'at order = {:.2f}.'.format(i + 1, answered[i], eps_cum[i],
                                              order_opt))
            sys.stdout.flush()

    return eps_cum, answered
def _test_compute_eps_from_delta_monotonicity(self):
    """Checks that epsilon is non-increasing as delta grows."""
    orders = [1.1, 2.5, 250.0]
    sigmas = [1e-3, 1.0, 1e5]
    deltas = [1e-60, 1e-6, 0.1, 0.999]  # Listed in increasing order.

    for sigma in sigmas:
        rdps_for_gaussian = np.array(orders) / (2 * sigma**2)
        eps_by_delta = [
            pate.compute_eps_from_delta(orders, rdps_for_gaussian, delta)[0]
            for delta in deltas
        ]
        # The epsilons must already be in descending order.
        self.assertEqual(eps_by_delta, sorted(eps_by_delta, reverse=True))
def run_analysis(votes, mechanism, noise_scale, params):
    """Computes data-dependent privacy.

    Args:
      votes: A matrix of votes, where each row contains votes in one instance.
      mechanism: A name of the mechanism ('lnmax', 'gnmax', or 'gnmax_conf')
      noise_scale: A mechanism privacy parameter.
      params: Other privacy parameters. For 'gnmax_conf' the keys 't' and
        'sigma1' are read; the other mechanisms do not read it.

    Returns:
      Four lists: cumulative privacy cost epsilon, how privacy budget is split,
      how many queries were answered, optimal order.

    Raises:
      ValueError: If `mechanism` is not a supported name. The check happens
        while processing a query, so it is not raised for an empty `votes`.
    """

    def compute_partition(order_opt, eps):
        """Splits `eps` into fractional contributions at the optimal order."""
        order_opt_idx = np.searchsorted(orders, order_opt)
        if mechanism == 'gnmax_conf':
            p = (rdp_select_cum[order_opt_idx],
                 rdp_cum[order_opt_idx] - rdp_select_cum[order_opt_idx],
                 -math.log(delta) / (order_opt - 1))
        else:
            p = (rdp_cum[order_opt_idx], -math.log(delta) / (order_opt - 1))
        return [x / eps for x in p]  # Ensures that sum(x) == 1

    # Short list of orders.
    # orders = np.round(np.concatenate((np.arange(2, 50 + 1, 1),
    #     np.logspace(np.log10(50), np.log10(1000), num=20))))

    # Long list of orders.
    # NOTE(review): this grid is not ascending - the arange part ends at 100.5
    # and the logspace part restarts at 100.0 - while np.searchsorted below
    # expects a sorted array. Lookups near order 100 may hit a neighbouring
    # index; confirm before relying on the partition there.
    orders = np.concatenate((np.arange(2, 100 + 1, .5),
                             np.logspace(np.log10(100), np.log10(500),
                                         num=100)))
    delta = 1e-8

    n = votes.shape[0]
    eps_total = np.zeros(n)
    partition = [None] * n
    order_opt = np.full(n, np.nan, dtype=float)
    answered = np.zeros(n, dtype=float)

    rdp_cum = np.zeros(len(orders))
    rdp_sqrd_cum = np.zeros(len(orders))
    rdp_select_cum = np.zeros(len(orders))
    answered_sum = 0

    for i in range(n):
        v = votes[i, ]
        if mechanism == 'lnmax':
            logq_lnmax = pate.compute_logq_laplace(v, noise_scale)
            rdp_query = pate.rdp_pure_eps(logq_lnmax, 2. / noise_scale, orders)
            rdp_sqrd = rdp_query ** 2
            pr_answered = 1
        elif mechanism == 'gnmax':
            logq_gmax = pate.compute_logq_gaussian(v, noise_scale)
            rdp_query = pate.rdp_gaussian(logq_gmax, noise_scale, orders)
            rdp_sqrd = rdp_query ** 2
            pr_answered = 1
        elif mechanism == 'gnmax_conf':
            logq_step1 = pate.compute_logpr_answered(params['t'],
                                                     params['sigma1'], v)
            logq_step2 = pate.compute_logq_gaussian(v, noise_scale)
            q_step1 = np.exp(logq_step1)
            # Use the less likely of the two step-1 outcomes
            # (answered / not answered).
            logq_step1_min = min(logq_step1, math.log1p(-q_step1))
            rdp_gnmax_step1 = pate.rdp_gaussian(logq_step1_min,
                                                2 ** .5 * params['sigma1'],
                                                orders)
            rdp_gnmax_step2 = pate.rdp_gaussian(logq_step2, noise_scale,
                                                orders)
            rdp_query = rdp_gnmax_step1 + q_step1 * rdp_gnmax_step2
            # The expression below evaluates
            #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
            rdp_sqrd = (
                rdp_gnmax_step1 ** 2
                + 2 * rdp_gnmax_step1 * q_step1 * rdp_gnmax_step2
                + q_step1 * rdp_gnmax_step2 ** 2)
            rdp_select_cum += rdp_gnmax_step1
            pr_answered = q_step1
        else:
            raise ValueError(
                'Mechanism must be one of ["lnmax", "gnmax", "gnmax_conf"]')

        rdp_cum += rdp_query
        rdp_sqrd_cum += rdp_sqrd
        answered_sum += pr_answered

        answered[i] = answered_sum
        eps_total[i], order_opt[i] = pate.compute_eps_from_delta(
            orders, rdp_cum, delta)
        partition[i] = compute_partition(order_opt[i], eps_total[i])

        if i > 0 and (i + 1) % 1000 == 0:
            num_done = i + 1  # Number of queries accumulated so far.
            # Per-query variance E[X^2] - E[X]^2 over `num_done` queries (no
            # Bessel's correction). Using `i` as the count can yield a
            # negative variance and a NaN std; the clamp absorbs round-off.
            rdp_var = np.maximum(
                rdp_sqrd_cum / num_done - (rdp_cum / num_done) ** 2, 0.)
            order_opt_idx = np.searchsorted(orders, order_opt[i])
            # Std of the sum.
            eps_std = (num_done * rdp_var[order_opt_idx]) ** .5
            print(
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
                'at order = {:.2f} (contribution from delta = {:.3f})'.format(
                    num_done, answered_sum, eps_total[i], eps_std,
                    order_opt[i], -math.log(delta) / (order_opt[i] - 1)))
            sys.stdout.flush()

    return eps_total, partition, answered, order_opt
def analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, delta):
    """Analyzes Confident GNMax per query, adding a smooth-sensitivity term.

    For each query the RDP costs of step 1 (thresholding) and step 2 (noisy
    max, weighted by the probability of answering) are accumulated, and a
    smooth-sensitivity RDP term is recomputed for every order still under
    consideration. A progress line is printed for every query but the first.

    Args:
      votes: 2-D array; `votes.shape[0]` queries by `votes.shape[1]` classes.
        The sum of the first row is taken as the number of teachers.
      threshold: Step-1 threshold. If this or `sigma1` is None, step 1 is
        skipped and every query is answered with probability 1.
      sigma1: Noise parameter passed to the step-1 helpers.
      sigma2: Noise parameter passed to the step-2 helpers.
      delta: Target delta used when converting RDP to epsilon.

    Returns:
      A tuple `(eps_partitioned, answered, ss_std_opt, order_opt)` of length-n
      arrays: the per-query `Partition` of the cost (step1, step2, ss, delta),
      the running sum of answer probabilities, the `ss * sigma_ss` value at
      the optimal order, and the optimal order itself.
    """
    # Short list of orders.
    # orders = np.round(np.logspace(np.log10(20), np.log10(200), num=20))

    # Long list of orders.
    orders = np.concatenate((np.arange(20, 40, .2),
                             np.arange(40, 75, .5),
                             np.logspace(np.log10(75), np.log10(200), num=20)))

    n = votes.shape[0]
    num_classes = votes.shape[1]
    # NOTE(review): reads row 0, so an empty `votes` raises here.
    num_teachers = int(sum(votes[0, ]))

    # Per-order flags; where a flag is True the matching local-sensitivity
    # bound below is replaced by zeros.
    if threshold is not None and sigma1 is not None:
        is_data_ind_step1 = pate.is_data_independent_always_opt_gaussian(
            num_teachers, num_classes, sigma1, orders)
    else:
        is_data_ind_step1 = [True] * len(orders)

    is_data_ind_step2 = pate.is_data_independent_always_opt_gaussian(
        num_teachers, num_classes, sigma2, orders)

    # Per-query outputs.
    eps_partitioned = np.full(n, None, dtype=Partition)
    order_opt = np.full(n, None, dtype=float)
    ss_std_opt = np.full(n, None, dtype=float)
    answered = np.zeros(n)

    # Running totals, one entry per order (these get truncated in the loop).
    rdp_step1_total = np.zeros(len(orders))
    rdp_step2_total = np.zeros(len(orders))

    # One row of local-sensitivity totals per order. This array is never
    # truncated; rows past the current len(orders) are simply not visited.
    ls_total = np.zeros((len(orders), num_teachers))
    answered_total = 0

    for i in range(n):
        v = votes[i, ]

        if threshold is not None and sigma1 is not None:
            logq_step1 = pate.compute_logpr_answered(threshold, sigma1, v)
            rdp_step1_total += pate.compute_rdp_threshold(
                logq_step1, sigma1, orders)
        else:
            logq_step1 = 0.  # always answer

        pr_answered = np.exp(logq_step1)
        logq_step2 = pate.compute_logq_gaussian(v, sigma2)
        # Step 2 is only paid when the query is answered.
        rdp_step2_total += pr_answered * pate.rdp_gaussian(
            logq_step2, sigma2, orders)

        answered_total += pr_answered

        # Sized to the current (possibly already truncated) list of orders.
        rdp_ss = np.zeros(len(orders))
        ss_std = np.zeros(len(orders))

        for j, order in enumerate(orders):
            if not is_data_ind_step1[j]:
                ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
                    v, num_teachers, threshold, sigma1, order)
            else:
                ls_step1 = np.full(num_teachers, 0, dtype=float)

            if not is_data_ind_step2[j]:
                ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
                    v, num_teachers, sigma2, order)
            else:
                ls_step2 = np.full(num_teachers, 0, dtype=float)

            ls_total[j, ] += ls_step1 + pr_answered * ls_step2

            beta_ss = .49 / order
            ss = pate_ss.compute_discounted_max(beta_ss, ls_total[j, ])
            # NOTE(review): divides by `ss`; confirm compute_discounted_max
            # cannot return 0 (e.g. when the row of ls_total is all zeros).
            sigma_ss = ((order * math.exp(2 * beta_ss)) / ss)**(1 / 3)
            rdp_ss[j] = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                beta_ss, sigma_ss, order)
            ss_std[j] = ss * sigma_ss

        rdp_total = rdp_step1_total + rdp_step2_total + rdp_ss
        answered[i] = answered_total
        _, order_opt[i] = pate.compute_eps_from_delta(orders, rdp_total, delta)
        order_idx = np.searchsorted(orders, order_opt[i])

        # Since optimal orders are always non-increasing, shrink orders array
        # and all cumulative arrays to speed up computation.
        if order_idx < len(orders):
            orders = orders[:order_idx + 1]
            rdp_step1_total = rdp_step1_total[:order_idx + 1]
            rdp_step2_total = rdp_step2_total[:order_idx + 1]

        # rdp_ss / ss_std still have this iteration's pre-shrink length, so
        # order_idx is a valid index into them as well.
        eps_partitioned[i] = Partition(step1=rdp_step1_total[order_idx],
                                       step2=rdp_step2_total[order_idx],
                                       ss=rdp_ss[order_idx],
                                       delta=-math.log(delta) / (order_opt[i] - 1))
        ss_std_opt[i] = ss_std[order_idx]
        # `(i + 1) % 1 == 0` is always true, so this reports every query
        # except the first.
        # NOTE(review): looks like a leftover reporting interval - confirm.
        if i > 0 and (i + 1) % 1 == 0:
            print(
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} +/- {:.3f} '
                'at order = {:.2f}. Contributions: delta = {:.3f}, step1 = {:.3f}, '
                'step2 = {:.3f}, ss = {:.3f}'.format(
                    i + 1, answered[i], sum(eps_partitioned[i]), ss_std_opt[i],
                    order_opt[i], eps_partitioned[i].delta,
                    eps_partitioned[i].step1, eps_partitioned[i].step2,
                    eps_partitioned[i].ss))
            sys.stdout.flush()

    return eps_partitioned, answered, ss_std_opt, order_opt
def analyze_gnmax_conf_data_dep(votes, threshold, sigma1, sigma2, delta):
    """Analyzes Confident GNMax per query, adding a smooth-sensitivity term.

    For each query the RDP costs of step 1 (thresholding) and step 2 (noisy
    max, weighted by the probability of answering) are accumulated, and a
    smooth-sensitivity RDP term is recomputed for every order still under
    consideration. A progress line is printed for every query but the first.

    Args:
      votes: 2-D array; `votes.shape[0]` queries by `votes.shape[1]` classes.
        The sum of the first row is taken as the number of teachers.
      threshold: Step-1 threshold. If this or `sigma1` is None, step 1 is
        skipped and every query is answered with probability 1.
      sigma1: Noise parameter passed to the step-1 helpers.
      sigma2: Noise parameter passed to the step-2 helpers.
      delta: Target delta used when converting RDP to epsilon.

    Returns:
      A tuple `(eps_partitioned, answered, ss_std_opt, order_opt)` of length-n
      arrays: the per-query `Partition` of the cost (step1, step2, ss, delta),
      the running sum of answer probabilities, the `ss * sigma_ss` value at
      the optimal order, and the optimal order itself.
    """
    # Short list of orders.
    # orders = np.round(np.logspace(np.log10(20), np.log10(200), num=20))

    # Long list of orders.
    orders = np.concatenate((np.arange(20, 40, .2),
                             np.arange(40, 75, .5),
                             np.logspace(np.log10(75), np.log10(200), num=20)))

    n = votes.shape[0]
    num_classes = votes.shape[1]
    # NOTE(review): reads row 0, so an empty `votes` raises here.
    num_teachers = int(sum(votes[0,]))

    # Per-order flags; where a flag is True the matching local-sensitivity
    # bound below is replaced by zeros.
    if threshold is not None and sigma1 is not None:
        is_data_ind_step1 = pate.is_data_independent_always_opt_gaussian(
            num_teachers, num_classes, sigma1, orders)
    else:
        is_data_ind_step1 = [True] * len(orders)

    is_data_ind_step2 = pate.is_data_independent_always_opt_gaussian(
        num_teachers, num_classes, sigma2, orders)

    # Per-query outputs.
    eps_partitioned = np.full(n, None, dtype=Partition)
    order_opt = np.full(n, None, dtype=float)
    ss_std_opt = np.full(n, None, dtype=float)
    answered = np.zeros(n)

    # Running totals, one entry per order (these get truncated in the loop).
    rdp_step1_total = np.zeros(len(orders))
    rdp_step2_total = np.zeros(len(orders))

    # One row of local-sensitivity totals per order. This array is never
    # truncated; rows past the current len(orders) are simply not visited.
    ls_total = np.zeros((len(orders), num_teachers))
    answered_total = 0

    for i in range(n):
        v = votes[i,]

        if threshold is not None and sigma1 is not None:
            logq_step1 = pate.compute_logpr_answered(threshold, sigma1, v)
            rdp_step1_total += pate.compute_rdp_threshold(logq_step1, sigma1,
                                                          orders)
        else:
            logq_step1 = 0.  # always answer

        pr_answered = np.exp(logq_step1)
        logq_step2 = pate.compute_logq_gaussian(v, sigma2)
        # Step 2 is only paid when the query is answered.
        rdp_step2_total += pr_answered * pate.rdp_gaussian(logq_step2, sigma2,
                                                           orders)

        answered_total += pr_answered

        # Sized to the current (possibly already truncated) list of orders.
        rdp_ss = np.zeros(len(orders))
        ss_std = np.zeros(len(orders))

        for j, order in enumerate(orders):
            if not is_data_ind_step1[j]:
                ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
                    v, num_teachers, threshold, sigma1, order)
            else:
                ls_step1 = np.full(num_teachers, 0, dtype=float)

            if not is_data_ind_step2[j]:
                ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
                    v, num_teachers, sigma2, order)
            else:
                ls_step2 = np.full(num_teachers, 0, dtype=float)

            ls_total[j,] += ls_step1 + pr_answered * ls_step2

            beta_ss = .49 / order
            ss = pate_ss.compute_discounted_max(beta_ss, ls_total[j,])
            # NOTE(review): divides by `ss`; confirm compute_discounted_max
            # cannot return 0 (e.g. when the row of ls_total is all zeros).
            sigma_ss = ((order * math.exp(2 * beta_ss)) / ss) ** (1 / 3)
            rdp_ss[j] = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                beta_ss, sigma_ss, order)
            ss_std[j] = ss * sigma_ss

        rdp_total = rdp_step1_total + rdp_step2_total + rdp_ss
        answered[i] = answered_total
        _, order_opt[i] = pate.compute_eps_from_delta(orders, rdp_total, delta)
        order_idx = np.searchsorted(orders, order_opt[i])

        # Since optimal orders are always non-increasing, shrink orders array
        # and all cumulative arrays to speed up computation.
        if order_idx < len(orders):
            orders = orders[:order_idx + 1]
            rdp_step1_total = rdp_step1_total[:order_idx + 1]
            rdp_step2_total = rdp_step2_total[:order_idx + 1]

        # rdp_ss / ss_std still have this iteration's pre-shrink length, so
        # order_idx is a valid index into them as well.
        eps_partitioned[i] = Partition(step1=rdp_step1_total[order_idx],
                                       step2=rdp_step2_total[order_idx],
                                       ss=rdp_ss[order_idx],
                                       delta=-math.log(delta) / (order_opt[i] - 1))
        ss_std_opt[i] = ss_std[order_idx]
        # `(i + 1) % 1 == 0` is always true, so this reports every query
        # except the first.
        # NOTE(review): looks like a leftover reporting interval - confirm.
        if i > 0 and (i + 1) % 1 == 0:
            print('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} +/- {:.3f} '
                  'at order = {:.2f}. Contributions: delta = {:.3f}, step1 = {:.3f}, '
                  'step2 = {:.3f}, ss = {:.3f}'.format(
                      i + 1, answered[i], sum(eps_partitioned[i]),
                      ss_std_opt[i], order_opt[i], eps_partitioned[i].delta,
                      eps_partitioned[i].step1, eps_partitioned[i].step2,
                      eps_partitioned[i].ss))
            sys.stdout.flush()

    return eps_partitioned, answered, ss_std_opt, order_opt
def _test_compute_eps_from_delta_value_error(self):
    """Expects ValueError when given four orders but only three RDP values."""
    orders = [1.1, 2, 3, 4]
    rdp = [1, 2, 3]  # One entry fewer than `orders`.
    delta = 0.001
    with self.assertRaises(ValueError):
        pate.compute_eps_from_delta(orders, rdp, delta)