Пример #1
0
def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta):
  """Tracks the data-independent privacy cost of Confident GNMax per query.

  For each row of `votes` the running RDP curve is increased by the
  data-independent Gaussian cost for `sigma1` (only when both `threshold` and
  `sigma1` are given) and by the data-independent Gaussian cost for `sigma2`
  weighted by the probability that the query is answered. The running curve
  is then converted to an epsilon for the given `delta`.

  Args:
    votes: array indexed by query along its first axis.
    threshold: threshold handed to `pate.compute_logpr_answered`, or None.
    sigma1: noise parameter of the thresholding step, or None.
    sigma2: noise parameter of the answering step.
    delta: delta handed to `pate.compute_eps_from_delta`.

  Returns:
    A pair `(eps_cum, answered)` of arrays with one entry per query: the
    epsilon after that query and the cumulative expected number of answered
    queries.
  """
  orders = np.logspace(np.log10(1.5), np.log10(500), num=100)
  num_queries = votes.shape[0]

  rdp_running = np.zeros(len(orders))
  expected_answered = 0
  answered = np.zeros(num_queries)
  eps_cum = np.full(num_queries, np.nan, dtype=float)

  # The thresholding step is applied only when both of its parameters are set.
  has_threshold_step = threshold is not None and sigma1 is not None

  for idx in range(num_queries):
    row = votes[idx,]
    pr_answered = 1.  # Without thresholding every query is answered.
    if has_threshold_step:
      pr_answered = np.exp(pate.compute_logpr_answered(threshold, sigma1, row))
      rdp_running += pate.rdp_data_independent_gaussian(sigma1, orders)

    expected_answered += pr_answered
    answered[idx] = expected_answered

    step2_cost = pate.rdp_data_independent_gaussian(sigma2, orders)
    rdp_running += pr_answered * step2_cost

    eps_now, best_order = pate.compute_eps_from_delta(orders, rdp_running,
                                                      delta)
    eps_cum[idx] = eps_now

    # Progress report after every 1000th query.
    if (idx + 1) % 1000 == 0:
      report = ('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} '
                'at order = {:.2f}.')
      print(report.format(idx + 1, answered[idx], eps_cum[idx], best_order))
      sys.stdout.flush()

  return eps_cum, answered
Пример #2
0
def analyze_gnmax_conf_data_ind(votes, threshold, sigma1, sigma2, delta):
    """Tracks the data-independent privacy cost of Confident GNMax per query.

    For each row of `votes` the running RDP curve is increased by the
    data-independent Gaussian cost for `sigma1` (only when both `threshold`
    and `sigma1` are given) and by the data-independent Gaussian cost for
    `sigma2` weighted by the probability that the query is answered. The
    running curve is then converted to an epsilon for the given `delta`.

    Args:
        votes: array indexed by query along its first axis.
        threshold: threshold handed to `pate.compute_logpr_answered`, or None.
        sigma1: noise parameter of the thresholding step, or None.
        sigma2: noise parameter of the answering step.
        delta: delta handed to `pate.compute_eps_from_delta`.

    Returns:
        A pair `(eps_cum, answered)` of arrays with one entry per query: the
        epsilon after that query and the cumulative expected number of
        answered queries.
    """
    orders = np.logspace(np.log10(1.5), np.log10(500), num=100)
    num_queries = votes.shape[0]

    rdp_running = np.zeros(len(orders))
    expected_answered = 0
    answered = np.zeros(num_queries)
    eps_cum = np.full(num_queries, np.nan, dtype=float)

    # The thresholding step is applied only when both parameters are set.
    has_threshold_step = threshold is not None and sigma1 is not None

    for idx in range(num_queries):
        row = votes[idx, ]
        pr_answered = 1.  # Without thresholding every query is answered.
        if has_threshold_step:
            pr_answered = np.exp(
                pate.compute_logpr_answered(threshold, sigma1, row))
            rdp_running += pate.rdp_data_independent_gaussian(sigma1, orders)

        expected_answered += pr_answered
        answered[idx] = expected_answered

        step2_cost = pate.rdp_data_independent_gaussian(sigma2, orders)
        rdp_running += pr_answered * step2_cost

        eps_now, best_order = pate.compute_eps_from_delta(
            orders, rdp_running, delta)
        eps_cum[idx] = eps_now

        # Progress report after every 1000th query.
        if (idx + 1) % 1000 == 0:
            report = ('queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} '
                      'at order = {:.2f}.')
            print(report.format(idx + 1, answered[idx], eps_cum[idx],
                                best_order))
            sys.stdout.flush()

    return eps_cum, answered
Пример #3
0
def plot_rdp_curve_per_example(votes, sigmas):
  """Plots data-dependent RDP curves per example next to the data-independent ones.

  For every row of `votes` and every value in `sigmas` a data-dependent curve
  is drawn; afterwards one data-independent curve per sigma is drawn. The
  figure is shown interactively.

  Args:
    votes: array indexed by example along its first axis.
    sigmas: iterable of noise parameters; each is cast to int for the legend.
  """
  alphas = np.linspace(1., 100., endpoint=True, num=1000)
  alphas[0] = 1.001  # Replace the first grid point (exactly 1) by 1.001.
  _, ax = setup_plot()

  num_examples = votes.shape[0]
  for example in range(num_examples):
    for sigma in sigmas:
      logq = pate.compute_logq_gaussian(votes[example,], sigma)
      dep_curve = pate.rdp_gaussian(logq, sigma, alphas)
      dep_label = r'Data-dependent bound, $\sigma$={}'.format(int(sigma))
      ax.plot(alphas, dep_curve, alpha=1., label=dep_label, linewidth=5)

  for sigma in sigmas:
    ind_curve = pate.rdp_data_independent_gaussian(sigma, alphas)
    ind_label = r'Data-independent bound, $\sigma$={}'.format(int(sigma))
    ax.plot(alphas, ind_curve, alpha=.3, label=ind_label, linewidth=10)

  # Axis ranges, ticks and labels.
  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0025, .005, .0075, .01])
  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  ax.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
Пример #4
0
def plot_rdp_curve_per_example(votes, sigmas):
  """Plots data-dependent RDP curves per example next to the data-independent ones.

  For every row of `votes` and every value in `sigmas` a data-dependent curve
  is drawn; afterwards one data-independent curve per sigma is drawn. The
  figure is shown interactively.

  Args:
    votes: array indexed by example along its first axis.
    sigmas: iterable of noise parameters; each is cast to int for the legend.
  """
  alphas = np.linspace(1., 100., endpoint=True, num=1000)
  alphas[0] = 1.001  # Replace the first grid point (exactly 1) by 1.001.
  _, ax = setup_plot()

  num_examples = votes.shape[0]
  for example in range(num_examples):
    for sigma in sigmas:
      logq = pate.compute_logq_gaussian(votes[example,], sigma)
      dep_curve = pate.rdp_gaussian(logq, sigma, alphas)
      dep_label = r'Data-dependent bound, $\sigma$={}'.format(int(sigma))
      ax.plot(alphas, dep_curve, alpha=1., label=dep_label, linewidth=5)

  for sigma in sigmas:
    ind_curve = pate.rdp_data_independent_gaussian(sigma, alphas)
    ind_label = r'Data-independent bound, $\sigma$={}'.format(int(sigma))
    ax.plot(alphas, ind_curve, alpha=.3, label=ind_label, linewidth=10)

  # Axis ranges, ticks and labels.
  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0025, .005, .0075, .01])
  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  ax.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
Пример #5
0
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
                 data_ind):
    """Computes the (data-dependent) RDP curve for Confident GNMax.

    Accumulates, query by query, the RDP cost of the thresholding step plus
    the cost of the answering step weighted by the probability of answering.
    A progress report is printed every 1000 queries and after the last one.

    Args:
        votes: array of votes with one row per query.
        baseline: array indexed like `votes`; row i is subtracted from the
            votes of query i before thresholding.
        threshold: threshold of step 1, or None to skip thresholding.
        sigma1: noise parameter of step 1 (thresholding).
        sigma2: noise parameter of step 2 (GNMax).
        delta: delta used to convert the RDP curve into an epsilon.
        orders: sequence of RDP orders. NOTE(review): `np.searchsorted`
            below assumes it is sorted ascending -- confirm with callers.
        data_ind: if true, data-independent bounds are used for both steps.

    Returns:
        The order reported by `pate.compute_eps_from_delta` for the RDP
        accumulated over all queries.
    """
    rdp_cum = np.zeros(len(orders))
    rdp_sqrd_cum = np.zeros(len(orders))
    answered = 0
    num_queries = votes.shape[0]

    for i, v in enumerate(votes):
        if threshold is None:
            logq_step1 = 0  # No thresholding, always proceed to step 2.
            rdp_step1 = np.zeros(len(orders))
        else:
            logq_step1 = pate.compute_logpr_answered(threshold, sigma1,
                                                     v - baseline[i, ])
            if data_ind:
                rdp_step1 = pate.compute_rdp_data_independent_threshold(
                    sigma1, orders)
            else:
                rdp_step1 = pate.compute_rdp_threshold(logq_step1, sigma1,
                                                       orders)

        if data_ind:
            rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
        else:
            logq_step2 = pate.compute_logq_gaussian(v, sigma2)
            rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)

        q_step1 = np.exp(logq_step1)
        rdp = rdp_step1 + rdp_step2 * q_step1
        # The expression below evaluates
        #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
        rdp_sqrd = (rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
                    q_step1 * rdp_step2**2)
        rdp_sqrd_cum += rdp_sqrd

        rdp_cum += rdp
        answered += q_step1

        seen = i + 1  # Number of queries processed so far.
        if seen % 1000 == 0 or seen == num_queries:
            # Per-query variance over the `seen` queries (ignoring Bessel's
            # correction). Dividing by `seen` rather than the zero-based
            # index avoids an off-by-one and a division by zero when only a
            # single query has been processed. Clipped at zero to absorb
            # floating-point round-off.
            rdp_var = np.maximum(
                rdp_sqrd_cum / seen - (rdp_cum / seen)**2, 0.)
            eps_total, order_opt = pate.compute_eps_from_delta(
                orders, rdp_cum, delta)
            order_opt_idx = np.searchsorted(orders, order_opt)
            eps_std = (seen * rdp_var[order_opt_idx])**.5  # Std of the sum.
            print(
                'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
                'at order = {:.2f} (contribution from delta = {:.3f})'.format(
                    seen, answered, eps_total, eps_std, order_opt,
                    -math.log(delta) / (order_opt - 1)))
            sys.stdout.flush()

    # Only the order for the fully accumulated curve is returned, so it is
    # computed once here instead of after every query.
    _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)

    return order_opt
Пример #6
0
def _compute_rdp(votes, baseline, threshold, sigma1, sigma2, delta, orders,
                 data_ind):
  """Computes the (data-dependent) RDP curve for Confident GNMax.

  Accumulates, query by query, the RDP cost of the thresholding step plus the
  cost of the answering step weighted by the probability of answering. A
  progress report is printed every 1000 queries and after the last one.

  Args:
    votes: array of votes with one row per query.
    baseline: array indexed like `votes`; row i is subtracted from the votes
      of query i before thresholding.
    threshold: threshold of step 1, or None to skip thresholding.
    sigma1: noise parameter of step 1 (thresholding).
    sigma2: noise parameter of step 2 (GNMax).
    delta: delta used to convert the RDP curve into an epsilon.
    orders: sequence of RDP orders. NOTE(review): `np.searchsorted` below
      assumes it is sorted ascending -- confirm with callers.
    data_ind: if true, data-independent bounds are used for both steps.

  Returns:
    The order reported by `pate.compute_eps_from_delta` for the RDP
    accumulated over all queries.
  """
  rdp_cum = np.zeros(len(orders))
  rdp_sqrd_cum = np.zeros(len(orders))
  answered = 0
  num_queries = votes.shape[0]

  for i, v in enumerate(votes):
    if threshold is None:
      logq_step1 = 0  # No thresholding, always proceed to step 2.
      rdp_step1 = np.zeros(len(orders))
    else:
      logq_step1 = pate.compute_logpr_answered(threshold, sigma1,
                                               v - baseline[i,])
      if data_ind:
        rdp_step1 = pate.compute_rdp_data_independent_threshold(sigma1, orders)
      else:
        rdp_step1 = pate.compute_rdp_threshold(logq_step1, sigma1, orders)

    if data_ind:
      rdp_step2 = pate.rdp_data_independent_gaussian(sigma2, orders)
    else:
      logq_step2 = pate.compute_logq_gaussian(v, sigma2)
      rdp_step2 = pate.rdp_gaussian(logq_step2, sigma2, orders)

    q_step1 = np.exp(logq_step1)
    rdp = rdp_step1 + rdp_step2 * q_step1
    # The expression below evaluates
    #     E[(cost_of_step_1 + Bernoulli(pr_of_step_2) * cost_of_step_2)^2]
    rdp_sqrd = (
        rdp_step1**2 + 2 * rdp_step1 * q_step1 * rdp_step2 +
        q_step1 * rdp_step2**2)
    rdp_sqrd_cum += rdp_sqrd

    rdp_cum += rdp
    answered += q_step1

    seen = i + 1  # Number of queries processed so far.
    if seen % 1000 == 0 or seen == num_queries:
      # Per-query variance over the `seen` queries (ignoring Bessel's
      # correction). Dividing by `seen` rather than the zero-based index
      # avoids an off-by-one and a division by zero when only a single query
      # has been processed. Clipped at zero to absorb floating-point round-off.
      rdp_var = np.maximum(rdp_sqrd_cum / seen - (rdp_cum / seen)**2, 0.)
      eps_total, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)
      order_opt_idx = np.searchsorted(orders, order_opt)
      eps_std = (seen * rdp_var[order_opt_idx])**.5  # Std of the sum.
      print(
          'queries = {}, E[answered] = {:.2f}, E[eps] = {:.3f} (std = {:.5f}) '
          'at order = {:.2f} (contribution from delta = {:.3f})'.format(
              seen, answered, eps_total, eps_std, order_opt,
              -math.log(delta) / (order_opt - 1)))
      sys.stdout.flush()

  # Only the order for the fully accumulated curve is returned, so it is
  # computed once here instead of after every query.
  _, order_opt = pate.compute_eps_from_delta(orders, rdp_cum, delta)

  return order_opt
Пример #7
0
def plot_data_ind_curve():
  """Shows the data-independent Gaussian RDP curve for sigma = 1.

  The curve is evaluated on 1000 evenly spaced orders between 1 and 10, with
  the first grid point moved from 1 to 1.01, and drawn as a wide gray line.
  """
  _, axes = setup_plot()

  alphas = np.linspace(1., 10., endpoint=True, num=1000)
  alphas[0] = 1.01

  curve = pate.rdp_data_independent_gaussian(1., alphas)
  axes.plot(alphas, curve, alpha=.5, color='gray', linewidth=10)

  # Axis ranges and ticks; the y ticks are left at their defaults.
  plt.xlim(xmin=1, xmax=10)
  plt.ylim(ymin=0)
  plt.xticks([1, 3, 5, 7, 9])
  axes.tick_params(labelsize=14)
  plt.show()
Пример #8
0
def plot_data_ind_curve():
  """Shows the data-independent Gaussian RDP curve for sigma = 1.

  The curve is evaluated on 1000 evenly spaced orders between 1 and 10, with
  the first grid point moved from 1 to 1.01, and drawn as a wide gray line.
  """
  _, axes = setup_plot()

  alphas = np.linspace(1., 10., endpoint=True, num=1000)
  alphas[0] = 1.01

  curve = pate.rdp_data_independent_gaussian(1., alphas)
  axes.plot(alphas, curve, alpha=.5, color='gray', linewidth=10)

  # Axis ranges and ticks; the y ticks are left at their defaults.
  plt.xlim(xmin=1, xmax=10)
  plt.ylim(ymin=0)
  plt.xticks([1, 3, 5, 7, 9])
  axes.tick_params(labelsize=14)
  plt.show()
Пример #9
0
def plot_rdp_curve_per_example(votes, sigmas):
    """Plots and saves per-example RDP curves with data-independent bounds.

    For every row of `votes` the ten largest vote counts are printed and one
    data-dependent RDP curve per sigma is drawn. One data-independent curve
    per sigma is then added. The figure is written to `rdp_flow1.pdf` inside
    `FLAGS.figures_dir` and shown interactively.

    Args:
        votes: array indexed by example along its first axis.
        sigmas: iterable of noise parameters; each is cast to int for the
            legend labels.
    """
    orders = np.linspace(1., 100., endpoint=True, num=1000)
    orders[0] = 1.5

    fig, ax = plt.subplots()
    fig.set_figheight(4.5)
    fig.set_figwidth(4.7)

    # Line styles are cycled so that more than two examples do not raise an
    # IndexError; the first two examples keep their original styles.
    styles = [':', '-']

    # `range` instead of the Python-2-only `xrange`, which is a NameError on
    # Python 3.
    for i in range(votes.shape[0]):
        print(sorted(votes[i, ], reverse=True)[:10])
        example_label = 'ex{}'.format(i + 1)  # 'ex1', 'ex2', ...
        example_style = styles[i % len(styles)]
        for sigma in sigmas:
            logq = pate.compute_logq_gaussian(votes[i, ], sigma)
            rdp = pate.rdp_gaussian(logq, sigma, orders)
            ax.plot(orders,
                    rdp,
                    label=r'{} $\sigma$={}'.format(example_label, int(sigma)),
                    linestyle=example_style,
                    linewidth=5)

    for sigma in sigmas:
        ax.plot(orders,
                pate.rdp_data_independent_gaussian(sigma, orders),
                alpha=.3,
                label=r'Data-ind bound $\sigma$={}'.format(int(sigma)),
                linewidth=10)

    plt.yticks([0, .01])
    plt.xlabel(r'Order $\lambda$', fontsize=16)
    plt.ylabel(r'RDP value $\varepsilon$ at $\lambda$', fontsize=16)
    ax.tick_params(labelsize=14)

    fout_name = os.path.join(FLAGS.figures_dir, 'rdp_flow1.pdf')
    print('Saving the graph to ' + fout_name)
    # NOTE(review): the figure is saved before the legend is added, so the
    # PDF has no legend while the interactive window does. Kept as is in case
    # this is intentional -- confirm.
    fig.savefig(fout_name, bbox_inches='tight')
    plt.legend(loc=0, fontsize=13)
    plt.show()
Пример #10
0
def plot_two_data_ind_curves():
  """Shows the data-independent Gaussian RDP curves for sigma = 100 and 150.

  Both curves are evaluated on 1000 evenly spaced orders between 1 and 100,
  with the first grid point moved from 1 to 1.001.
  """
  alphas = np.linspace(1., 100., endpoint=True, num=1000)
  alphas[0] = 1.001

  _, axes = setup_plot()

  for sigma in (100, 150):
    curve = pate.rdp_data_independent_gaussian(sigma, alphas)
    legend_entry = r'Data-independent bound, $\sigma$={}'.format(int(sigma))
    axes.plot(alphas, curve, alpha=.3, label=legend_entry, linewidth=10)

  # Axis ranges, ticks and labels.
  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0025, .005, .0075, .01])
  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  axes.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
Пример #11
0
def plot_two_data_ind_curves():
  """Shows the data-independent Gaussian RDP curves for sigma = 100 and 150.

  Both curves are evaluated on 1000 evenly spaced orders between 1 and 100,
  with the first grid point moved from 1 to 1.001.
  """
  alphas = np.linspace(1., 100., endpoint=True, num=1000)
  alphas[0] = 1.001

  _, axes = setup_plot()

  for sigma in (100, 150):
    curve = pate.rdp_data_independent_gaussian(sigma, alphas)
    legend_entry = r'Data-independent bound, $\sigma$={}'.format(int(sigma))
    axes.plot(alphas, curve, alpha=.3, label=legend_entry, linewidth=10)

  # Axis ranges, ticks and labels.
  plt.xlim(xmin=1, xmax=100)
  plt.ylim(ymin=0)
  plt.xticks([1, 20, 40, 60, 80, 100])
  plt.yticks([0, .0025, .005, .0075, .01])
  plt.xlabel(r'Order $\alpha$', fontsize=16)
  plt.ylabel(r'RDP value $\varepsilon$ at $\alpha$', fontsize=16)
  axes.tick_params(labelsize=14)

  plt.legend(loc=0, fontsize=13)
  plt.show()
Пример #12
0
def _find_optimal_smooth_sensitivity_parameters(
    votes, baseline, num_teachers, threshold, sigma1, sigma2, delta, ind_step1,
    ind_step2, order):
  """Optimizes smooth sensitivity parameters by minimizing a cost function.

  The cost function is
        exact_eps + cost of GNSS + two stds of noise,
  which captures that upper bound of the confidence interval of the sanitized
  privacy budget.

  Since optimization is done with full view of sensitive data, the results
  cannot be released.

  A progress report is printed every 100 queries and after the last one. The
  optimal parameters depend only on the running totals, so they are searched
  for only on those reporting iterations.

  Args:
    votes: array of votes with one row per query.
    baseline: array indexed like `votes`; row i is subtracted from the votes
      of query i before thresholding.
    num_teachers: length of the local-sensitivity vectors.
    threshold: threshold of step 1, or None to skip thresholding.
    sigma1: noise parameter of step 1 (thresholding).
    sigma2: noise parameter of step 2 (GNMax).
    delta: delta used for the reported epsilon.
    ind_step1: if true, use the data-independent bound for step 1.
    ind_step2: if true, use the data-independent bound for step 2.
    order: the single RDP order at which all costs are evaluated.

  Returns:
    A tuple `(beta_opt, ss_opt, sigma_ss_opt)` for the last query. When both
    steps use data-independent bounds this is `(0., 0., np.inf)`.
  """
  rdp_cum = 0
  answered_cum = 0
  ls_cum = 0
  num_queries = votes.shape[0]

  # Define a plausible range for the beta values.
  betas = np.arange(.3 / order, .495 / order, .01 / order)
  cost_delta = math.log(1 / delta) / (order - 1)

  for i, v in enumerate(votes):
    if threshold is None:
      log_pr_answered = 0
      rdp1 = 0
      ls_step1 = np.zeros(num_teachers)
    else:
      log_pr_answered = pate.compute_logpr_answered(threshold, sigma1,
                                                    v - baseline[i,])
      if ind_step1:  # apply data-independent bound for step 1 (thresholding).
        rdp1 = pate.compute_rdp_data_independent_threshold(sigma1, order)
        ls_step1 = np.zeros(num_teachers)
      else:
        rdp1 = pate.compute_rdp_threshold(log_pr_answered, sigma1, order)
        ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
            v - baseline[i,], num_teachers, threshold, sigma1, order)

    pr_answered = math.exp(log_pr_answered)
    answered_cum += pr_answered

    if ind_step2:  # apply data-independent bound for step 2 (GNMax).
      rdp2 = pate.rdp_data_independent_gaussian(sigma2, order)
      ls_step2 = np.zeros(num_teachers)
    else:
      logq_step2 = pate.compute_logq_gaussian(v, sigma2)
      rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order)
      # Compute smooth sensitivity.
      ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
          v, num_teachers, sigma2, order)

    rdp_cum += rdp1 + pr_answered * rdp2
    ls_cum += ls_step1 + pr_answered * ls_step2  # Expected local sensitivity.

    # The parameters below are recomputed from scratch from the running
    # totals, and they are only consumed by the progress report and by the
    # return value (taken from the last query, which always reports). Skip
    # the search on all other iterations.
    if (i + 1) % 100 != 0 and i != num_queries - 1:
      continue

    if ind_step1 and ind_step2:
      # Data-independent bounds.
      beta_opt, ss_opt, sigma_ss_opt = 0., 0., np.inf
    else:
      # Data-dependent bounds.
      cost_opt, beta_opt, ss_opt, sigma_ss_opt = np.inf, None, None, None

      for beta in betas:
        ss = pate_ss.compute_discounted_max(beta, ls_cum)

        # Solution to the minimization problem:
        #   min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma}
        # The float exponent keeps the cube root correct even under Python 2
        # integer division.
        sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1. / 3)
        cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
            beta, sigma_ss, order)

        # Cost captures exact_eps + cost of releasing SS + two stds of noise.
        cost = rdp_cum + cost_ss + 2 * ss * sigma_ss
        if cost < cost_opt:
          cost_opt, beta_opt, ss_opt, sigma_ss_opt = cost, beta, ss, sigma_ss

    eps_before_ss = rdp_cum + cost_delta
    eps_with_ss = (
        eps_before_ss + pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
            beta_opt, sigma_ss_opt, order))
    print('{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to '
          '{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'.
          format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss,
                 ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt))
    sys.stdout.flush()

  # Return optimal parameters for the last iteration.
  return beta_opt, ss_opt, sigma_ss_opt
def _compute_rdp_gnmax(sigma, logq, order):
    """Returns the GNMax RDP cost at `order`, picking one of two bounds.

    The data-independent Gaussian bound is returned when `logq` is at or above
    the cutoff given by `_compute_logq0(sigma, order)`; otherwise the
    data-dependent bound is returned.
    """
    cutoff = _compute_logq0(sigma, order)
    if logq >= cutoff:
        return pate.rdp_data_independent_gaussian(sigma, order)
    return _compute_data_dep_bound_gnmax(sigma, logq, order)
 def _compare_dep_vs_ind(logq):
     """Returns the data-dependent bound minus the data-independent bound.

     `sigma` and `order` are not parameters; they are read from the
     enclosing scope, which is not visible in this excerpt.
     """
     dep_bound = _compute_data_dep_bound_gnmax(sigma, logq, order)
     ind_bound = pate.rdp_data_independent_gaussian(sigma, order)
     return dep_bound - ind_bound
Пример #15
0
def _compute_rdp_gnmax(sigma, logq, order):
  """Returns the GNMax RDP cost at `order`, picking one of two bounds.

  The data-independent Gaussian bound is returned when `logq` is at or above
  the cutoff given by `_compute_logq0(sigma, order)`; otherwise the
  data-dependent bound is returned.
  """
  cutoff = _compute_logq0(sigma, order)
  if logq >= cutoff:
    return pate.rdp_data_independent_gaussian(sigma, order)
  return _compute_data_dep_bound_gnmax(sigma, logq, order)
Пример #16
0
 def _compare_dep_vs_ind(logq):
   """Returns the data-dependent bound minus the data-independent bound.

   `sigma` and `order` are not parameters; they are read from the enclosing
   scope, which is not visible in this excerpt.
   """
   dep_bound = _compute_data_dep_bound_gnmax(sigma, logq, order)
   ind_bound = pate.rdp_data_independent_gaussian(sigma, order)
   return dep_bound - ind_bound
Пример #17
0
def _find_optimal_smooth_sensitivity_parameters(votes, baseline, num_teachers,
                                                threshold, sigma1, sigma2,
                                                delta, ind_step1, ind_step2,
                                                order):
    """Optimizes smooth sensitivity parameters by minimizing a cost function.

    The cost function is
          exact_eps + cost of GNSS + two stds of noise,
    which captures that upper bound of the confidence interval of the
    sanitized privacy budget.

    Since optimization is done with full view of sensitive data, the results
    cannot be released.

    A progress report is printed every 100 queries and after the last one.
    The optimal parameters depend only on the running totals, so they are
    searched for only on those reporting iterations.

    Args:
        votes: array of votes with one row per query.
        baseline: array indexed like `votes`; row i is subtracted from the
            votes of query i before thresholding.
        num_teachers: length of the local-sensitivity vectors.
        threshold: threshold of step 1, or None to skip thresholding.
        sigma1: noise parameter of step 1 (thresholding).
        sigma2: noise parameter of step 2 (GNMax).
        delta: delta used for the reported epsilon.
        ind_step1: if true, use the data-independent bound for step 1.
        ind_step2: if true, use the data-independent bound for step 2.
        order: the single RDP order at which all costs are evaluated.

    Returns:
        A tuple `(beta_opt, ss_opt, sigma_ss_opt)` for the last query. When
        both steps use data-independent bounds this is `(0., 0., np.inf)`.
    """
    rdp_cum = 0
    answered_cum = 0
    ls_cum = 0
    num_queries = votes.shape[0]

    # Define a plausible range for the beta values.
    betas = np.arange(.3 / order, .495 / order, .01 / order)
    cost_delta = math.log(1 / delta) / (order - 1)

    for i, v in enumerate(votes):
        if threshold is None:
            log_pr_answered = 0
            rdp1 = 0
            ls_step1 = np.zeros(num_teachers)
        else:
            log_pr_answered = pate.compute_logpr_answered(
                threshold, sigma1, v - baseline[i, ])
            if ind_step1:  # apply data-independent bound for step 1 (thresholding).
                rdp1 = pate.compute_rdp_data_independent_threshold(
                    sigma1, order)
                ls_step1 = np.zeros(num_teachers)
            else:
                rdp1 = pate.compute_rdp_threshold(log_pr_answered, sigma1,
                                                  order)
                ls_step1 = pate_ss.compute_local_sensitivity_bounds_threshold(
                    v - baseline[i, ], num_teachers, threshold, sigma1, order)

        pr_answered = math.exp(log_pr_answered)
        answered_cum += pr_answered

        if ind_step2:  # apply data-independent bound for step 2 (GNMax).
            rdp2 = pate.rdp_data_independent_gaussian(sigma2, order)
            ls_step2 = np.zeros(num_teachers)
        else:
            logq_step2 = pate.compute_logq_gaussian(v, sigma2)
            rdp2 = pate.rdp_gaussian(logq_step2, sigma2, order)
            # Compute smooth sensitivity.
            ls_step2 = pate_ss.compute_local_sensitivity_bounds_gnmax(
                v, num_teachers, sigma2, order)

        rdp_cum += rdp1 + pr_answered * rdp2
        ls_cum += ls_step1 + pr_answered * ls_step2  # Expected local sensitivity.

        # The parameters below are recomputed from scratch from the running
        # totals, and they are only consumed by the progress report and by
        # the return value (taken from the last query, which always
        # reports). Skip the search on all other iterations.
        if (i + 1) % 100 != 0 and i != num_queries - 1:
            continue

        if ind_step1 and ind_step2:
            # Data-independent bounds.
            beta_opt, ss_opt, sigma_ss_opt = 0., 0., np.inf
        else:
            # Data-dependent bounds.
            cost_opt, beta_opt, ss_opt, sigma_ss_opt = np.inf, None, None, None

            for beta in betas:
                ss = pate_ss.compute_discounted_max(beta, ls_cum)

                # Solution to the minimization problem:
                #   min_sigma {order * exp(2 * beta)/ sigma^2 + 2 * ss * sigma}
                # The float exponent keeps the cube root correct even under
                # Python 2 integer division.
                sigma_ss = ((order * math.exp(2 * beta)) / ss)**(1. / 3)
                cost_ss = pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                    beta, sigma_ss, order)

                # Cost captures exact_eps + cost of releasing SS + two stds of noise.
                cost = rdp_cum + cost_ss + 2 * ss * sigma_ss
                if cost < cost_opt:
                    cost_opt, beta_opt, ss_opt, sigma_ss_opt = cost, beta, ss, sigma_ss

        eps_before_ss = rdp_cum + cost_delta
        eps_with_ss = (eps_before_ss +
                       pate_ss.compute_rdp_of_smooth_sensitivity_gaussian(
                           beta_opt, sigma_ss_opt, order))
        print(
            '{}: E[answered queries] = {:.1f}, RDP at {} goes from {:.3f} to '
            '{:.3f} +/- {:.3f} (ss = {:.4}, beta = {:.4f}, sigma_ss = {:.3f})'
            .format(i + 1, answered_cum, order, eps_before_ss, eps_with_ss,
                    ss_opt * sigma_ss_opt, ss_opt, beta_opt, sigma_ss_opt))
        sys.stdout.flush()

    # Return optimal parameters for the last iteration.
    return beta_opt, ss_opt, sigma_ss_opt