Пример #1
0
def evaluate(data,
             range_max_value,
             quality_function,
             quality_promise,
             approximation,
             eps,
             delta,
             intervals_bounding,
             max_in_interval,
             use_exponential=True):
    """
    RecConcave algorithm for the specific case of N=2.

    :param data: the main data-set
    :param range_max_value: maximum possible output (the minimum output is 0)
    :param quality_function: function called as quality_function(data, j) that returns
    the quality (float) of the domain element j
    :param quality_promise: float, a quality value that at least one domain element is
    guaranteed to reach
    :param approximation: 0 < float < 1. the approximation level of the result
    :param eps: float > 0. privacy parameter
    :param delta: 1 > float > 0. privacy parameter (only used when use_exponential is False)
    :param intervals_bounding: function L, called here as L(data, padded_range_max, j)
    :param max_in_interval: function u(data, interval) that returns the maximum of
    quality_function(data, j) for j in the interval; it is called with the keyword
    argument interval_length already bound
    :param use_exponential: the original version uses the A_dist mechanism. for utility
    reasons the exponential mechanism is the default. set to False to use A_dist instead
    :return: an element of the domain with approximately maximum value of the quality function
    :raises ValueError: if both interval selections return a string marker instead of
    an interval start
    """
    # xrange only exists on Python 2; on Python 3 the built-in range is already lazy
    try:
        lazy_range = xrange
    except NameError:
        lazy_range = range

    # step 2: pad the output range up to the next power of two
    log_of_range = int(math.ceil(math.log(range_max_value, 2)))
    range_max_value_tag = 2 ** log_of_range

    def extended_quality_function(data_base, j):
        # elements in the padding (range_max_value, range_max_value_tag] are scored by
        # the quality of range_max_value, capped at 0
        if range_max_value < j <= range_max_value_tag:
            return min(0, quality_function(data_base, range_max_value))
        return quality_function(data_base, j)

    # step 4
    def recursive_quality_function(data_base, j):
        return min(
            intervals_bounding(data_base, range_max_value_tag, j) -
            (1 - approximation) * quality_promise,
            quality_promise -
            intervals_bounding(data_base, range_max_value_tag, j + 1))

    # step 6: choose the exponent of the interval length
    # (list(...) keeps the candidate set a real list on Python 3 as it was on Python 2)
    recursion_returned = basicdp.exponential_mechanism_big(
        data, list(range(log_of_range + 1)), recursive_quality_function, eps)

    good_interval = 8 * (2 ** recursion_returned)

    # step 7
    first_intervals = __build_intervals_set__(data, good_interval, 0,
                                              range_max_value_tag)
    second_intervals = __build_intervals_set__(data, good_interval, 0,
                                               range_max_value_tag, True)
    max_quality = partial(max_in_interval, interval_length=good_interval)

    # step 9 ( using 'dist' algorithm )
    # TODO should I add switch for sparse?
    # TODO make sure it is still generic!!!!!!!!!!!!!!
    if use_exponential:
        # NOTE(review): these domains stop at range_max_value while the interval sets
        # above are built up to range_max_value_tag - confirm this is intended
        first_full_domain = lazy_range(0, range_max_value, good_interval)
        # floor division keeps the start an int on Python 3 as well
        second_full_domain = lazy_range(good_interval // 2, range_max_value,
                                        good_interval)
        first_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, first_full_domain,
            first_intervals, max_quality, eps)
        second_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, second_full_domain,
            second_intervals, max_quality, eps)
    else:
        first_chosen_interval = basicdp.a_dist(data, first_intervals,
                                               max_quality, eps, delta)
        second_chosen_interval = basicdp.a_dist(data, second_intervals,
                                                max_quality, eps, delta)

    # a string result is treated as "no interval chosen"
    chosen_starts = (first_chosen_interval, second_chosen_interval)
    if all(isinstance(start, str) for start in chosen_starts):
        raise ValueError("stability problem, try taking more samples!")

    # step 10: final selection among the elements of the chosen interval(s)
    candidates = []
    for start in chosen_starts:
        if not isinstance(start, str):
            candidates.extend(range(start, start + good_interval))

    return basicdp.exponential_mechanism_big(
        data, candidates, extended_quality_function, eps)
Пример #2
0
def __rec_sanitize__(samples, domain_range, alpha, beta, eps, delta, dimension):
    """
    One recursive step of the sanitizer over the inclusive range ``domain_range``.

    Works through two module-level globals: ``calls`` (remaining call budget,
    decremented on every invocation) and ``san_data`` (the output list, extended in place).

    :param samples: the sampled data-set
    :param domain_range: pair (low, high), inclusive bounds of the sub-domain to sanitize
    :param alpha: accuracy parameter; scales the thresholds alpha*len(samples)/8 and /32
    and is passed on to choosing_mechanism as alpha/64
    :param beta: passed through to choosing_mechanism
    :param eps: privacy parameter; sets the Laplace noise scale 1/eps and is split into
    eps/3/log_star(dimension) for the exponential mechanism
    :param delta: passed through to choosing_mechanism
    :param dimension: only used through log_star(dimension)
    :return: the global san_data list, or None when the call budget is exhausted
    :raises ValueError: if choosing_mechanism returns a string ('bottom') instead of a choice
    """
    global calls
    global san_data
    # step 1
    if calls == 0:
        return
    calls -= 1

    noise_scale = 1.0 / eps
    sample_size = len(samples)

    # step 2: noisy count of the samples that fall inside the current range
    noisy_points_in_range = points_in_subset(samples, domain_range) + laplace(0, noise_scale, 1)

    # step 3: base case - too few points to keep splitting.
    # Add the (rounded) noisy count as copies of the range's upper endpoint, the same
    # way step 11 does; a non-positive count adds nothing.
    if noisy_points_in_range < alpha * sample_size / 8.0:
        san_data.extend([domain_range[1]] * int(noisy_points_in_range))
        return san_data

    # step 4
    domain_size = domain_range[1] - domain_range[0] + 1
    log_size = int(ceil(log(domain_size, 2)))

    # step 6
    def quality(data, j):
        return min(point_count_intervals_bounding(data, domain_range, j) - alpha * sample_size / 32.0,
                   3 * alpha * sample_size / 32.0 - point_count_intervals_bounding(data, domain_range, j - 1))

    # step 7 (the promise alpha * sample_size / 32) is not needed with exponential_mechanism

    # step 8
    # note the use of exponential_mechanism instead of rec_concave
    new_eps = eps / 3.0 / log_star(dimension)
    z_tag = exponential_mechanism(samples, range(log_size + 1), quality, new_eps)
    z = 2 ** z_tag

    if z_tag == 0:
        # step 9: choose a single point, scored by how often it occurs in the samples
        point_counter = Counter(samples)

        def special_quality(data, point):
            return point_counter[point]

        chosen = choosing_mechanism(samples, range(domain_range[0], domain_range[1] + 1), special_quality,
                                    1, alpha / 64., beta, eps, delta)
    else:
        # step 10: choose an interval of length 2*z; both interval sets must use the
        # same length as the (i, i + 2*z - 1) pairs built from them
        first_intervals = __build_intervals_set__(samples, 2 * z, domain_range[0], domain_range[1] + 1)
        second_intervals = __build_intervals_set__(samples, 2 * z, domain_range[0], domain_range[1] + 1, True)
        intervals = [(i, i + 2 * z - 1) for i in first_intervals + second_intervals]
        chosen = choosing_mechanism(samples, intervals, points_in_subset, 2, alpha / 64., beta, eps, delta)

    # check for the failure marker before unpacking, so both branches report it the same way
    if isinstance(chosen, str):
        raise ValueError("stability problem - choosing_mechanism returned 'bottom'")

    if z_tag == 0:
        a = b = chosen
    else:
        a, b = chosen

    # step 11
    # although not mentioned I assume the noisy value should be rounded
    noisy_count_ab = int(points_in_subset(samples, (a, b)) + laplace(0, noise_scale, 1))
    san_data.extend([b] * noisy_count_ab)

    # step 12: recurse on what is left on each side of [a, b]
    if a > domain_range[0]:
        rec_range = (domain_range[0], a - 1)
        __rec_sanitize__(samples, rec_range, alpha, beta, eps, delta, dimension)
    if b < domain_range[1]:
        rec_range = (b + 1, domain_range[1])
        __rec_sanitize__(samples, rec_range, alpha, beta, eps, delta, dimension)
    return san_data
Пример #3
0
def evaluate(
    data,
    range_max_value,
    quality_function,
    quality_promise,
    approximation,
    eps,
    delta,
    intervals_bounding,
    max_in_interval,
    use_exponential=True,
):
    """
    RecConcave algorithm for the specific case of N=2.

    :param data: the main data-set
    :param range_max_value: maximum possible output (the minimum output is 0)
    :param quality_function: function called as quality_function(data, j) that returns
    the quality (float) of the domain element j
    :param quality_promise: float, a quality value that at least one domain element is
    guaranteed to reach
    :param approximation: 0 < float < 1. the approximation level of the result
    :param eps: float > 0. privacy parameter
    :param delta: 1 > float > 0. privacy parameter (only used when use_exponential is False)
    :param intervals_bounding: function L, called here as L(data, padded_range_max, j)
    :param max_in_interval: function u(data, interval) that returns the maximum of
    quality_function(data, j) for j in the interval; it is called with the keyword
    argument interval_length already bound
    :param use_exponential: the original version uses the A_dist mechanism. for utility
    reasons the exponential mechanism is the default. set to False to use A_dist instead
    :return: an element of the domain with approximately maximum value of the quality function
    :raises ValueError: if both interval selections return a string marker instead of
    an interval start
    """
    # xrange only exists on Python 2; on Python 3 the built-in range is already lazy
    try:
        lazy_range = xrange
    except NameError:
        lazy_range = range

    # step 2: pad the output range up to the next power of two
    log_of_range = int(math.ceil(math.log(range_max_value, 2)))
    range_max_value_tag = 2 ** log_of_range

    def extended_quality_function(data_base, j):
        # elements in the padding (range_max_value, range_max_value_tag] are scored by
        # the quality of range_max_value, capped at 0
        if range_max_value < j <= range_max_value_tag:
            return min(0, quality_function(data_base, range_max_value))
        return quality_function(data_base, j)

    # step 4
    def recursive_quality_function(data_base, j):
        return min(
            intervals_bounding(data_base, range_max_value_tag, j) - (1 - approximation) * quality_promise,
            quality_promise - intervals_bounding(data_base, range_max_value_tag, j + 1),
        )

    # step 6: choose the exponent of the interval length
    # (list(...) keeps the candidate set a real list on Python 3 as it was on Python 2)
    recursion_returned = basicdp.exponential_mechanism_big(
        data, list(range(log_of_range + 1)), recursive_quality_function, eps
    )

    good_interval = 8 * (2 ** recursion_returned)

    # step 7
    first_intervals = __build_intervals_set__(data, good_interval, 0, range_max_value_tag)
    second_intervals = __build_intervals_set__(data, good_interval, 0, range_max_value_tag, True)
    max_quality = partial(max_in_interval, interval_length=good_interval)

    # step 9 ( using 'dist' algorithm )
    # TODO should I add switch for sparse?
    # TODO make sure it is still generic!!!!!!!!!!!!!!
    if use_exponential:
        # NOTE(review): these domains stop at range_max_value while the interval sets
        # above are built up to range_max_value_tag - confirm this is intended
        first_full_domain = lazy_range(0, range_max_value, good_interval)
        # floor division keeps the start an int on Python 3 as well
        second_full_domain = lazy_range(good_interval // 2, range_max_value, good_interval)
        first_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, first_full_domain, first_intervals, max_quality, eps
        )
        second_chosen_interval = basicdp.sparse_domain(
            basicdp.exponential_mechanism_big, data, second_full_domain, second_intervals, max_quality, eps
        )
    else:
        first_chosen_interval = basicdp.a_dist(data, first_intervals, max_quality, eps, delta)
        second_chosen_interval = basicdp.a_dist(data, second_intervals, max_quality, eps, delta)

    # a string result is treated as "no interval chosen"
    chosen_starts = (first_chosen_interval, second_chosen_interval)
    if all(isinstance(start, str) for start in chosen_starts):
        raise ValueError("stability problem, try taking more samples!")

    # step 10: final selection among the elements of the chosen interval(s)
    candidates = []
    for start in chosen_starts:
        if not isinstance(start, str):
            candidates.extend(range(start, start + good_interval))

    return basicdp.exponential_mechanism_big(data, candidates, extended_quality_function, eps)