示例#1
0
文件: core.py 项目: ArnabKar/statdp
def detect_counterexample(algorithm, test_epsilon, default_kwargs,
                          event_search_space=None, databases=None,
                          event_iterations=100000, detect_iterations=500000, cores=0,
                          loglevel=logging.INFO):
    """
    For every test epsilon, select an event and run the hypothesis test on it.

    :param algorithm: The algorithm to test for.
    :param test_epsilon: The privacy budget to test for, can either be a number or an iterable of numbers.
    :param default_kwargs: The default arguments the algorithm needs except the first Queries argument, 'epsilon' must be provided.
    :param event_search_space: The search space for event selector to reduce search time, optional.
    :param databases: The databases to run for detection as a (d1, d2) pair, optional.
    :param event_iterations: The iterations for event selector to run, default is 100000.
    :param detect_iterations: The iterations for detector to run, default is 500000.
    :param cores: The cores to utilize, 0 means auto-detection, 1 means no process pool is created.
    :param loglevel: The loglevel for logging package.
    :return: [(epsilon, p, d1, d2, kwargs, event)] The epsilon-p pairs along with databases/arguments/selected event.
    """
    logging.basicConfig(level=loglevel)
    logger.info('Starting to find counter example on algorithm {} with test epsilon {}\n'
                .format(algorithm.__name__, test_epsilon))
    logger.info('\nExtra arguments:\n'
                'default_kwargs: {}\n'
                'event_search_space: {}\n'
                'databases: {}\n'
                'cores:{}\n'.format(default_kwargs, event_search_space, databases, cores))

    if databases is not None:
        # caller supplied a fixed database pair, only the arguments need to be generated
        d1, d2 = databases
        kwargs = generate_arguments(algorithm, d1, d2, default_kwargs=default_kwargs)
        input_list = ((d1, d2, kwargs),)
    else:
        input_list = generate_databases(algorithm, 5, default_kwargs=default_kwargs)

    result = []

    # a single number becomes a 1-tuple; iterables without a length (e.g. generators)
    # are materialized because the progress report below needs len()
    if isinstance(test_epsilon, (int, float)):
        test_epsilon = (test_epsilon, )
    elif not hasattr(test_epsilon, '__len__'):
        test_epsilon = tuple(test_epsilon)

    # cores == 1 keeps pool as None, which is handed to the helpers as process_pool
    pool = None
    if cores == 0:
        pool = mp.Pool(mp.cpu_count())
    elif cores != 1:
        pool = mp.Pool(cores)
    try:
        for i, epsilon in enumerate(test_epsilon):
            d1, d2, kwargs, event = select_event(algorithm, input_list, epsilon, event_iterations,
                                                 search_space=event_search_space, process_pool=pool)

            # fix the database and arguments if selected for performance
            input_list = ((d1, d2, kwargs),) if len(input_list) > 1 else input_list

            p1, _ = hypothesis_test(algorithm, d1, d2, kwargs, event, epsilon, detect_iterations, process_pool=pool)
            result.append((epsilon, p1, d1, d2, kwargs, event))
            print('Epsilon: {} | p-value: {:5.3f} | Event: {} | {:5.1f}%'
                  .format(epsilon, p1, event, float(i + 1) / len(test_epsilon) * 100))
            logger.debug('D1: {} | D2: {} | kwargs: {}'.format(d1, d2, kwargs))
    finally:
        if pool is not None:
            # close() only stops new submissions; join() waits for the workers to exit
            pool.close()
            pool.join()

    return result
示例#2
0
def test_generate_databases():
    """Check the structure of the inputs generate_databases produces for noisy_max_v1a."""
    sequence_types = (list, tuple)
    generated = generate_databases(noisy_max_v1a, 5, {'epsilon': 0.5})

    # the result must be a non-empty list
    assert isinstance(generated, list)
    assert len(generated) >= 1

    for candidate in generated:
        # every entry is a (d1, d2, args) triple
        assert isinstance(candidate, sequence_types)
        assert len(candidate) == 3
        first_db, second_db, arguments = candidate
        pair = (first_db, second_db)
        # both databases are sequences holding the 5 requested queries
        assert all(isinstance(database, sequence_types) for database in pair)
        assert all(len(database) == 5 for database in pair)
        assert isinstance(arguments, (tuple, list, dict))
示例#3
0
def test_generate_databases():
    """Check the structure of generate_databases output, with default and ONE_DIFFER sensitivity."""
    sequence_types = (list, tuple)

    def unpack_checked(entry):
        # shared shape checks: a (d1, d2, extra) triple whose databases are 5-element sequences
        assert isinstance(entry, sequence_types)
        assert len(entry) == 3
        first_db, second_db, extra = entry
        pair = (first_db, second_db)
        assert all(isinstance(database, sequence_types) for database in pair)
        assert all(len(database) == 5 for database in pair)
        return first_db, second_db, extra

    generated = generate_databases(noisy_max_v1a, 5, {'epsilon': 0.5})
    assert isinstance(generated, sequence_types)
    assert len(generated) >= 1
    for entry in generated:
        _, _, arguments = unpack_checked(entry)
        assert isinstance(arguments, (tuple, list, dict))

    # test ONE_DIFFER
    generated = generate_databases(histogram,
                                   5, {'epsilon': 0.5},
                                   sensitivity=ONE_DIFFER)
    assert isinstance(generated, sequence_types)
    assert len(generated) >= 1
    for entry in generated:
        first_db, second_db, _ = unpack_checked(entry)
        # exactly one position may differ between the two databases
        differing = sum(left != right
                        for left, right in zip(first_db, second_db))
        assert differing == 1
示例#4
0
def detect_counterexample(algorithm,
                          test_epsilon,
                          default_kwargs=None,
                          databases=None,
                          num_input=(5, 10),
                          event_iterations=100000,
                          detect_iterations=500000,
                          cores=0,
                          sensitivity=ALL_DIFFER,
                          quiet=False,
                          loglevel=logging.INFO):
    """
    For every test epsilon, select an event and run the hypothesis test on it.

    :param algorithm: The algorithm to test for.
    :param test_epsilon: The privacy budget to test for, can either be a number or an iterable of numbers.
    :param default_kwargs: The default arguments the algorithm needs except the first Queries argument.
    :param databases: The databases to run for detection as a (d1, d2) pair, optional.
    :param num_input: The length of input to generate, not used if database param is specified.
    :param event_iterations: The iterations for event selector to run, default is 100000.
    :param detect_iterations: The iterations for detector to run, default is 500000.
    :param cores: The cores to utilize, 0 means auto-detection, 1 means no process pool is created.
    :param sensitivity: The sensitivity setting, all queries can differ by one or just one query can differ by one.
    :param quiet: Do not print progress bar or messages, logs are not affected, default is False.
    :param loglevel: The loglevel for logging package.
    :return: [(epsilon, p, d1, d2, kwargs, event)] The epsilon-p pairs along with databases/arguments/selected event.
    """
    # initialize an empty default kwargs if None is given
    default_kwargs = default_kwargs if default_kwargs else {}

    logging.basicConfig(level=loglevel)
    logger.info(
        'Start detection for counterexample on algorithm {} with test epsilon {}'
        .format(algorithm.__name__, test_epsilon))
    logger.info(
        'Options -> default_kwargs: {} | databases: {} | cores:{}'.format(
            default_kwargs, databases, cores))

    # log warnings about gsl installation
    if use_gsl:
        logger.info(
            'Found GSL installation, using GSL implementation of hypergeom.cdf for better performance.'
        )
    else:
        logger.warning(
            'Did not find Gnu Scientific Library (GSL) installation, falling back to scipy implementation of '
            'hypergeom.cdf. Note that GSL provides much faster implementation than scipy which can '
            'significantly increase detection performance.')

    input_list = []
    if databases is not None:
        # caller supplied a fixed database pair, only the arguments need to be generated
        d1, d2 = databases
        kwargs = generate_arguments(algorithm,
                                    d1,
                                    d2,
                                    default_kwargs=default_kwargs)
        input_list = ((d1, d2, kwargs), )
    else:
        # a single number is treated as one input length
        if isinstance(num_input, (int, float)):
            num_input = (int(num_input), )
        for num in num_input:
            input_list.extend(
                generate_databases(algorithm,
                                   num,
                                   default_kwargs=default_kwargs,
                                   sensitivity=sensitivity))

    result = []

    # convert int/float or length-less iterable (e.g. a generator) into a tuple,
    # the progress bar below needs len(test_epsilon)
    if isinstance(test_epsilon, (int, float)):
        test_epsilon = (test_epsilon, )
    elif not hasattr(test_epsilon, '__len__'):
        test_epsilon = tuple(test_epsilon)

    # cores == 1 keeps pool as None, which is handed to the helpers as process_pool
    if cores == 0:
        pool = mp.Pool(mp.cpu_count())
    elif cores != 1:
        pool = mp.Pool(cores)
    else:
        pool = None
    try:
        for epsilon in tqdm.tqdm(test_epsilon,
                                 total=len(test_epsilon),
                                 unit='test',
                                 desc='Detection',
                                 disable=quiet):
            d1, d2, kwargs, event = select_event(algorithm,
                                                 input_list,
                                                 epsilon,
                                                 event_iterations,
                                                 quiet=quiet,
                                                 process_pool=pool)
            p = hypothesis_test(algorithm,
                                d1,
                                d2,
                                kwargs,
                                event,
                                epsilon,
                                detect_iterations,
                                report_p2=False,
                                process_pool=pool)
            result.append((epsilon, float(p), d1, d2, kwargs, event))
            if not quiet:
                # tqdm.write keeps the message from clobbering the progress bar
                tqdm.tqdm.write(
                    'Epsilon: {} | p-value: {:5.3f} | Event: {}'.format(
                        epsilon, p, event))
            logger.debug('D1: {} | D2: {} | kwargs: {}'.format(d1, d2, kwargs))
    finally:
        if pool is not None:
            pool.close()
            pool.join()
    return result
示例#5
0
def detect_counterexample(algorithm,
                          test_epsilon,
                          default_kwargs=None,
                          databases=None,
                          num_input=(5, 10),
                          event_iterations=100000,
                          detect_iterations=500000,
                          cores=None,
                          sensitivity=ALL_DIFFER,
                          quiet=False,
                          loglevel=logging.INFO):
    """
    For every test epsilon, select an event and run the hypothesis test on it.

    :param algorithm: The algorithm to test for.
    :param test_epsilon: The privacy budget to test for, can either be a number or an iterable of numbers.
    :param default_kwargs: The default arguments the algorithm needs except the first Queries argument.
    :param databases: The databases to run for detection as a (d1, d2) pair, optional.
    :param num_input: The length of input to generate, not used if database param is specified.
    :param event_iterations: The iterations for event selector to run.
    :param detect_iterations: The iterations for detector to run.
    :param cores: The number of max processes to set for multiprocessing.Pool(), os.cpu_count() is used if None.
    :param sensitivity: The sensitivity setting, all queries can differ by one or just one query can differ by one.
    :param quiet: Do not print progress bar or messages, logs are not affected.
    :param loglevel: The loglevel for logging package.
    :return: [(epsilon, p, d1, d2, kwargs, event)] The epsilon-p pairs along with databases/arguments/selected event.
        The returned kwargs additionally carry the first database under the '_d1' key.
    """
    # initialize an empty default kwargs if None is given
    default_kwargs = default_kwargs if default_kwargs else {}

    logging.basicConfig(level=loglevel)
    logger.info(
        f'Start detection for counterexample on {algorithm.__name__} with test epsilon {test_epsilon}'
    )
    logger.info(
        f'Options -> default_kwargs: {default_kwargs} | databases: {databases} | cores:{cores}'
    )

    input_list = []
    if databases is not None:
        # caller supplied a fixed database pair, only the arguments need to be generated
        d1, d2 = databases
        kwargs = generate_arguments(algorithm,
                                    d1,
                                    d2,
                                    default_kwargs=default_kwargs)
        input_list = ((d1, d2, kwargs), )
    else:
        # a single number is treated as one input length
        if isinstance(num_input, (int, float)):
            num_input = (int(num_input), )
        for num in num_input:
            input_list.extend(
                generate_databases(algorithm,
                                   num,
                                   default_kwargs=default_kwargs,
                                   sensitivity=sensitivity))

    # attach d1 to a copy of each kwargs so the generated/shared kwargs objects stay untouched
    new_input_list = []
    for d1, d2, kwargs in input_list:
        new_kwargs = kwargs.copy()
        # remember the first input (for HammingDistance postprocessing)
        new_kwargs['_d1'] = d1
        new_input_list.append((d1, d2, new_kwargs))
    input_list = new_input_list

    result = []

    # convert int/float or length-less iterable (e.g. a generator) into a tuple,
    # the progress bar below needs len(test_epsilon)
    if isinstance(test_epsilon, (int, float)):
        test_epsilon = (test_epsilon, )
    elif not hasattr(test_epsilon, '__len__'):
        test_epsilon = tuple(test_epsilon)

    with mp.Pool(cores) as pool:
        for epsilon in tqdm.tqdm(test_epsilon,
                                 total=len(test_epsilon),
                                 unit='test',
                                 desc='Detection',
                                 disable=quiet):
            d1, d2, kwargs, event = select_event(algorithm,
                                                 input_list,
                                                 epsilon,
                                                 event_iterations,
                                                 quiet=quiet,
                                                 process_pool=pool)
            p = hypothesis_test(algorithm,
                                d1,
                                d2,
                                kwargs,
                                event,
                                epsilon,
                                detect_iterations,
                                report_p2=False,
                                process_pool=pool)
            result.append((epsilon, float(p), d1, d2, kwargs, event))
            if not quiet:
                # tqdm.write keeps the message from clobbering the progress bar
                tqdm.tqdm.write(
                    f'Epsilon: {epsilon} | p-value: {p:5.3f} | Event: {event}')
            logger.debug(f'D1: {d1} | D2: {d2} | kwargs: {kwargs}')

    return result
示例#6
0
def detect_counterexample(algorithm, test_epsilon, default_kwargs=None, databases=None, num_input=(5, 10),
                          event_iterations=100000, detect_iterations=500000, cores=0,
                          quiet=False, loglevel=logging.INFO):
    """
    For every test epsilon, select an event and run the hypothesis test on it.

    :param algorithm: The algorithm to test for.
    :param test_epsilon: The privacy budget to test for, can either be a number or an iterable of numbers.
    :param default_kwargs: The default arguments the algorithm needs except the first Queries argument.
    :param databases: The databases to run for detection as a (d1, d2) pair, optional.
    :param num_input: The length of input to generate, not used if database param is specified.
    :param event_iterations: The iterations for event selector to run, default is 100000.
    :param detect_iterations: The iterations for detector to run, default is 500000.
    :param cores: The cores to utilize, 0 means auto-detection, 1 means no process pool is created.
    :param quiet: Do not print progress bar or messages, logs are not affected, default is False.
    :param loglevel: The loglevel for logging package.
    :return: [(epsilon, p, d1, d2, kwargs, event)] The epsilon-p pairs along with databases/arguments/selected event.
    """
    # initialize an empty default kwargs if None is given
    default_kwargs = default_kwargs if default_kwargs else {}

    logging.basicConfig(level=loglevel)
    logger.info('Starting to find counterexample on algorithm {} with test epsilon {}'
                .format(algorithm.__name__, test_epsilon))
    logger.info('Options -> default_kwargs: {} | databases: {} | cores:{}'.format(default_kwargs, databases, cores))

    input_list = []
    if databases is not None:
        # caller supplied a fixed database pair, only the arguments need to be generated
        d1, d2 = databases
        kwargs = generate_arguments(algorithm, d1, d2, default_kwargs=default_kwargs)
        input_list = ((d1, d2, kwargs),)
    else:
        # a single number is treated as one input length
        if isinstance(num_input, (int, float)):
            num_input = (int(num_input), )
        for num in num_input:
            input_list.extend(generate_databases(algorithm, num, default_kwargs=default_kwargs))

    result = []

    # convert int/float or iterable into tuple (so that it has length information)
    if isinstance(test_epsilon, (int, float)):
        test_epsilon = (test_epsilon, )
    elif not isinstance(test_epsilon, (tuple, list)):
        test_epsilon = tuple(test_epsilon)

    # cores == 1 keeps pool as None, which is handed to the helpers as process_pool
    if cores == 0:
        pool = mp.Pool(mp.cpu_count())
    elif cores != 1:
        pool = mp.Pool(cores)
    else:
        pool = None
    try:
        # disable=quiet makes the progress bar honor the documented quiet flag
        for epsilon in tqdm.tqdm(test_epsilon, total=len(test_epsilon), unit='test', desc='Detection',
                                 disable=quiet):
            d1, d2, kwargs, event = select_event(algorithm, input_list, epsilon, event_iterations, quiet=quiet,
                                                 process_pool=pool)
            p = hypothesis_test(algorithm, d1, d2, kwargs, event, epsilon, detect_iterations,
                                report_p2=False, process_pool=pool)
            result.append((epsilon, p, d1, d2, kwargs, event))
            if not quiet:
                # tqdm.write keeps the message from clobbering the progress bar
                tqdm.tqdm.write('Epsilon: {} | p-value: {:5.3f} | Event: {}'
                                .format(epsilon, p, event))
            logger.debug('D1: {} | D2: {} | kwargs: {}'.format(d1, d2, kwargs))
    finally:
        if pool is not None:
            pool.close()
            pool.join()

    return result