Example #1
def black_box_rejector(options):
    """
    Uses a black box attack to evade the rejector defense.

    Adversarial samples are generated to fool the defended model,
    which only provides the labels when queried.
    Note: Models with rejectors also have a special label 'reject',
    which does not represent a valid misclassification (i.e. the attack
    does not consider being rejected a success).
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    rejector = options['rejector']
    results_path = options['results_path']

    # The defended_model returns [y1, y2 ... yN, -inf] if it believes
    # that the sample is valid, otherwise it returns [0, 0 ... 0, 1]
    # This means that if the top label is the last one, it was classified as adversarial.
    # On a genuine dataset, this should never happen (if the rejector is perfect).

    defended_model = rejectors.RejectorModel(foolbox_model, rejector)

    # rejectors.Unrejected() adds the condition that the top label must not be the last
    # (i.e. the 'reject' label).
    # Note: combining two foolbox criteria with the Python "and" operator does not
    # produce a combined criterion; the documentation recommends "&", so we build the
    # CombinedCriteria explicitly here.

    criterion = foolbox.criteria.CombinedCriteria(
        foolbox.criteria.Misclassification(), rejectors.Unrejected())

    # The attack will be against the defended model

    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        defended_model,
        loader,
        attack,
        attack_p,
        cuda,
        attack_workers,
        name='Black-Box Rejector Attack')

    info = utils.attack_statistics_info(samples_count, correct_count,
                                        successful_attack_count, distances)

    header = ['Distances']

    utils.save_results(results_path,
                       table=[distances],
                       command=command,
                       info=info,
                       header=header)
Example #2
def substitute_preprocessor(options):
    """
    Uses BPDA with a substitute model to evade the preprocessor defense.

    BPDA uses predictions from the defended model and gradients
    from the substitute model.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']
    substitute_foolbox_model = options['substitute_foolbox_model']

    defended_model = defenses.PreprocessorDefenseModel(foolbox_model,
                                                       preprocessor)

    if substitute_foolbox_model.num_classes() != defended_model.num_classes():
        raise click.BadArgumentUsage(
            'The substitute model ({} classes) must have the same '
            'number of classes as the defended model ({} classes)'.format(
                substitute_foolbox_model.num_classes(),
                defended_model.num_classes()))

    composite_model = foolbox.models.CompositeModel(defended_model,
                                                    substitute_foolbox_model)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the defended model with estimated gradients

    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        composite_model,
        loader,
        attack,
        attack_p,
        cuda,
        attack_workers,
        name='Substitute Preprocessor Attack')

    info = utils.attack_statistics_info(samples_count, correct_count,
                                        successful_attack_count, distances)

    header = ['Distances']

    utils.save_results(results_path,
                       table=[distances],
                       command=command,
                       info=info,
                       header=header)
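The BPDA idea above (predictions from the defended model, gradients from the substitute) can be sketched with a small custom autograd function. This is a hypothetical, self-contained PyTorch illustration, not the foolbox.models.CompositeModel implementation; defended_forward and substitute_model are placeholder callables that map a batch to logits with the same number of classes.

import torch

class BPDAFromSubstitute(torch.autograd.Function):
    """Hypothetical sketch of BPDA: forward through the (possibly non-differentiable)
    defended model, backward through a differentiable substitute model."""

    @staticmethod
    def forward(ctx, x, defended_forward, substitute_model):
        ctx.save_for_backward(x)
        ctx.substitute_model = substitute_model
        with torch.no_grad():
            return defended_forward(x)

    @staticmethod
    def backward(ctx, grad_output):
        x, = ctx.saved_tensors
        with torch.enable_grad():
            x = x.detach().requires_grad_(True)
            substitute_logits = ctx.substitute_model(x)
            grad_x, = torch.autograd.grad(substitute_logits, x, grad_output)
        # No gradients flow to the two (non-tensor) model arguments.
        return grad_x, None, None

# Usage sketch (names are placeholders):
# logits = BPDAFromSubstitute.apply(images, defended_forward, substitute_model)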
Example #3
def substitute_model(options):
    """
    Uses BPDA with a substitute model to attack the custom model.

    BPDA uses predictions from the defended model and gradients
    from the substitute model.
    Note: We could technically attack the custom model directly,
    since most models support gradient computation, but we are
    assuming that we do not have access to the gradients. 
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    custom_foolbox_model = options['custom_foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    substitute_foolbox_model = options['substitute_foolbox_model']

    if (substitute_foolbox_model.num_classes() !=
            custom_foolbox_model.num_classes()):
        raise click.BadArgumentUsage(
            'The substitute model ({} classes) must have the same '
            'number of classes as the custom model ({} classes)'.format(
                substitute_foolbox_model.num_classes(),
                custom_foolbox_model.num_classes()))

    composite_model = foolbox.models.CompositeModel(custom_foolbox_model,
                                                    substitute_foolbox_model)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the custom model, with gradients estimated by the substitute model

    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        composite_model,
        loader,
        attack,
        attack_p,
        cuda,
        attack_workers,
        name='Substitute Model Attack')

    info = utils.attack_statistics_info(samples_count, correct_count,
                                        successful_attack_count, distances)

    header = ['Distances']

    utils.save_results(results_path,
                       table=[distances],
                       command=command,
                       info=info,
                       header=header)
Example #4
def adversarial_perturbation(options):
    attack_name = options['attack_name']
    attack_p = options['attack_p']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()

    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    distance_tool = parsing.parse_distance_tool('counter-attack', options,
                                                np.inf)

    samples_count, correct_count, successful_count, \
        correct_estimate_count, boundary_distances, \
        adversarial_distances = tests.adversarial_perturbation_test(
            foolbox_model, loader, attack, distance_tool, cuda, attack_workers)

    correct_estimate_rate = correct_estimate_count / successful_count
    effective_correct_estimate_rate = correct_estimate_count / correct_count

    info = [
        ['Total Count', samples_count],
        ['Correctly Classified Count', correct_count],
        ['Successful Attack Count', successful_count],
        ['Correct Estimate Count', correct_estimate_count],
        [
            'Correct Estimate Rate (correct_estimate / successful_attack)',
            '{:2.2f}%'.format(correct_estimate_rate * 100.0)
        ],
        [
            'Effective Correct Estimate Rate (correct_estimate / correct_classification)',
            '{:2.2f}%'.format(effective_correct_estimate_rate * 100.0)
        ]
    ]

    header = ['Boundary Distances', 'Adversarial Distances']

    utils.save_results(results_path,
                       table=[boundary_distances, adversarial_distances],
                       header=header,
                       command=command,
                       info=info)
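As a quick sanity check on the two rates computed above, here is a toy numeric example with purely hypothetical counts:

# Hypothetical counts, chosen only to illustrate the formulas above.
correct_count, successful_count, correct_estimate_count = 800, 600, 450

correct_estimate_rate = correct_estimate_count / successful_count         # 450 / 600 = 0.75   -> 75.00%
effective_correct_estimate_rate = correct_estimate_count / correct_count  # 450 / 800 = 0.5625 -> 56.25%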
Example #5
def shallow_preprocessor(options):
    """
    Simply evaluates the effectiveness of the preprocessor defense, without additional
    attack strategies.
    
    Adversarial samples are generated to fool the undefended model.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the undefended model

    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    defended_model = defenses.PreprocessorDefenseModel(foolbox_model,
                                                       preprocessor)

    samples_count, correct_count, successful_attack_count, distances = tests.shallow_defense_test(
        foolbox_model,
        loader,
        attack,
        attack_p,
        defended_model,
        cuda,
        attack_workers,
        name='Shallow Preprocessor Attack')

    info = utils.attack_statistics_info(samples_count, correct_count,
                                        successful_attack_count, distances)

    header = ['Distances']

    utils.save_results(results_path,
                       table=[distances],
                       command=command,
                       info=info,
                       header=header)
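The "shallow" evaluation above boils down to a transfer check: the adversarial sample is crafted against the undefended model and only afterwards passed through the defense. A minimal sketch, where undefended_predict and defended_predict are hypothetical callables returning labels:

def shallow_defense_fooled(undefended_predict, defended_predict, x_adv, true_label):
    """Hypothetical helper: an adversarial crafted against the undefended model
    counts against the defense only if the defended model also misclassifies it."""
    fools_undefended = undefended_predict(x_adv) != true_label
    fools_defended = defended_predict(x_adv) != true_label
    return fools_undefended and fools_defended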
Example #6
def radius(options, sampling_count):
    command = options['command']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    distance_tool = parsing.parse_distance_tool('counter-attack', options,
                                                np.inf)

    total_count, consistent_count, failures = tests.radius_test(
        foolbox_model, loader, distance_tool, sampling_count)
    consistency_rate = consistent_count / total_count

    info = [['Total Samples', total_count], ['Failures', failures],
            ['Consistent Samples', consistent_count],
            ['Consistency Rate', '{:.2f}%'.format(consistency_rate * 100.0)]]

    utils.save_results(results_path, command=command, info=info)
Example #7
def black_box_model(options):
    """
    Uses a black box attack against the custom model.

    Adversarial samples are generated to fool the custom model,
    which only provides the labels when queried.

    Note: We could technically use the gradients,
    since most models support gradient computation, but we are
    assuming that we do not have access to them. 
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    custom_foolbox_model = options['custom_foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the custom model (label-only access)

    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        custom_foolbox_model,
        loader,
        attack,
        attack_p,
        cuda,
        attack_workers,
        name='Black-Box Model Attack')

    info = utils.attack_statistics_info(samples_count, correct_count,
                                        successful_attack_count, distances)
    header = ['Distances']

    utils.save_results(results_path,
                       table=[distances],
                       command=command,
                       info=info,
                       header=header)
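The black-box setting in the docstring can be made concrete with a label-only wrapper: the attacker may query predicted labels, but never logits or gradients. A minimal sketch (label_only_oracle and predict_logits are hypothetical names):

import numpy as np

def label_only_oracle(predict_logits):
    """Hypothetical sketch of label-only access: a decision-based black-box attack
    only observes the argmax label for one sample at a time."""
    def query(x):
        return int(np.argmax(predict_logits(x)))
    return query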
Example #8
def black_box_preprocessor(options):
    """
    Uses a black box attack to evade the preprocessor defense.

    Adversarial samples are generated to fool the defended model,
    which only provides the labels when queried.
    """
    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']
    preprocessor = options['preprocessor']

    defended_model = defenses.PreprocessorDefenseModel(foolbox_model,
                                                       preprocessor)

    criterion = foolbox.criteria.Misclassification()

    # The attack will be against the defended model
    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, successful_attack_count, distances, _, _ = tests.attack_test(
        defended_model,
        loader,
        attack,
        attack_p,
        cuda,
        attack_workers,
        name='Black-Box Preprocessor Attack')

    info = utils.attack_statistics_info(samples_count, correct_count,
                                        successful_attack_count, distances)

    header = ['Distances']

    utils.save_results(results_path,
                       table=[distances],
                       command=command,
                       info=info,
                       header=header)
Example #9
def boundary_distance(options, max_radius, generation_workers):
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    distance_tool = parsing.parse_distance_tool('counter-attack', options,
                                                np.inf)

    samples_count, consistent_count, failure_count, inconsistent_differences = tests.boundary_distance_test(
        foolbox_model,
        loader,
        distance_tool,
        max_radius,
        cuda,
        generation_workers,
        name='Boundary Distance Consistency Test')

    failure_rate = failure_count / samples_count
    consistency_rate = consistent_count / samples_count
    effective_consistency_rate = consistent_count / (samples_count -
                                                     failure_count)
    average_difference = np.average(inconsistent_differences)
    median_difference = np.median(inconsistent_differences)

    info = [['Total Samples', samples_count],
            ['Failed Measure Samples', failure_count],
            ['Consistent Samples', consistent_count],
            ['Failure Rate', '{:.2f}%'.format(failure_rate * 100.0)],
            ['Consistency Rate', '{:.2f}%'.format(consistency_rate * 100.0)],
            [
                'Effective Consistency Rate',
                '{:.2f}%'.format(effective_consistency_rate * 100.0)
            ], ['Average Difference', '{:2.2e}'.format(average_difference)],
            ['Median Difference', '{:2.2e}'.format(median_difference)]]

    header = ['Inconsistent Differences']
    utils.save_results(results_path,
                       table=[inconsistent_differences],
                       header=header,
                       command=command,
                       info=info)
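A toy numeric example with purely hypothetical counts for the three rates computed above:

# Hypothetical counts, chosen only to illustrate the formulas above.
samples_count, failure_count, consistent_count = 1000, 50, 900

failure_rate = failure_count / samples_count                                      # 50 / 1000  = 0.05  ->  5.00%
consistency_rate = consistent_count / samples_count                               # 900 / 1000 = 0.90  -> 90.00%
effective_consistency_rate = consistent_count / (samples_count - failure_count)   # 900 / 950  ~ 0.947 -> 94.74%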
Example #10
def accuracy(options, top_ks):
    """
    Computes the accuracy of the model.

    \b
    Stores the following results:
        Top-K Accuracies: The top-k accuracies, where the k values are configurable with --top-ks.
    """

    command = options['command']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    accuracies = tests.accuracy_test(foolbox_model, loader, top_ks)

    info = [[
        'Top-{} Accuracy:'.format(top_k), '{:2.2f}%'.format(accuracy * 100.0)
    ] for top_k, accuracy in zip(top_ks, accuracies)]
    utils.save_results(results_path, command=command, info=info)
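A minimal NumPy sketch of the top-k accuracy being reported (a hypothetical helper, not tests.accuracy_test): a sample counts as correct if its true label appears among its k highest-scoring classes.

import numpy as np

def top_k_accuracy(logits, labels, k):
    """Hypothetical sketch: fraction of samples whose true label is among the top-k predictions."""
    top_k_predictions = np.argsort(logits, axis=1)[:, -k:]
    hits = np.any(top_k_predictions == labels[:, None], axis=1)
    return hits.mean()

# Example: 3 samples, 4 classes.
logits = np.array([[0.10, 0.70, 0.15, 0.05],
                   [0.40, 0.30, 0.20, 0.10],
                   [0.30, 0.20, 0.40, 0.10]])
labels = np.array([1, 2, 0])
print(top_k_accuracy(logits, labels, k=1))  # 1/3: only the first sample's top-1 prediction is correct
print(top_k_accuracy(logits, labels, k=2))  # 2/3: the third sample's true label enters its top-2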
Example #11
def check_parallelization(options):
    """
    Compares parallelized attacks with standard ones.

    This is a sanity check to verify that attack parallelization does not seriously
    affect the results.
    """

    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    foolbox_model = options['foolbox_model']
    results_path = options['results_path']
    loader = options['loader']

    criterion = foolbox.criteria.Misclassification()

    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    samples_count, correct_count, standard_attack_count, parallel_attack_count, standard_distances, parallel_distances = tests.parallelization_test(
        foolbox_model, loader, attack, attack_p, cuda, attack_workers)

    standard_failure_count = correct_count - standard_attack_count
    parallel_failure_count = correct_count - parallel_attack_count

    standard_average_distance, standard_median_distance, standard_adjusted_median_distance = utils.distance_statistics(
        standard_distances, standard_failure_count)
    parallel_average_distance, parallel_median_distance, parallel_adjusted_median_distance = utils.distance_statistics(
        parallel_distances, parallel_failure_count)

    standard_success_rate = standard_attack_count / correct_count
    parallel_success_rate = parallel_attack_count / correct_count

    average_distance_difference = (
        parallel_average_distance -
        standard_average_distance) / standard_average_distance
    median_distance_difference = (
        parallel_median_distance -
        standard_median_distance) / standard_median_distance
    success_rate_difference = (parallel_success_rate -
                               standard_success_rate) / standard_success_rate
    adjusted_median_distance_difference = (
        parallel_adjusted_median_distance -
        standard_adjusted_median_distance) / standard_adjusted_median_distance

    info = [[
        'Average Distance Relative Difference', average_distance_difference
    ], ['Median Distance Relative Difference', median_distance_difference],
            ['Success Rate Difference', success_rate_difference],
            [
                'Adjusted Median Distance Difference',
                adjusted_median_distance_difference
            ], ['Samples Count', str(samples_count)],
            ['Correct Count', str(correct_count)],
            ['Standard Attack Count',
             str(standard_attack_count)],
            ['Parallel Attack Count',
             str(parallel_attack_count)]]

    header = ['Standard Distances', 'Parallel Distances']

    utils.save_results(results_path,
                       table=[standard_distances, parallel_distances],
                       command=command,
                       info=info,
                       header=header)
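A toy numeric example with purely hypothetical values for the relative differences reported above:

# Hypothetical values, chosen only to illustrate the relative-difference formula above.
standard_average_distance, parallel_average_distance = 0.10, 0.11

average_distance_difference = (parallel_average_distance -
                               standard_average_distance) / standard_average_distance
# (0.11 - 0.10) / 0.10 = 0.10, i.e. the parallel attack finds distances about 10% larger on average.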
Example #12
def detector_roc(options, score_dataset_path, no_test_warning):
    """
    Uses a detector to identify adversarial samples and computes the ROC curve.

    \b
    Stores the following results:
        ROC Area Under Curve (ROC-AUC)
        Best Threshold: The threshold with the best Youden Index (TPR - FPR)
        Best Threshold True Positive Rate: The TPR at the best threshold
        Best Threshold False Positive Rate: The FPR at the best threshold

        Genuine Scores: All the scores computed for the genuine samples
        Adversarial Scores: All the scores computed for the adversarial samples

    The last three columns contain the data to build the ROC curve. These are:
        Thresholds
        True Positive Rates
        False Positive Rates

    Each threshold has a corresponding TPR and FPR.
    """

    adversarial_loader = options['adversarial_loader']
    command = options['command']
    dataset_type = options['dataset_type']
    detector = options['detector']
    failure_value = options['failure_value']
    foolbox_model = options['foolbox_model']
    genuine_loader = options['loader']
    results_path = options['results_path']

    save_scores = score_dataset_path is not None

    if dataset_type == 'test' and not no_test_warning:
        logger.warning('Remember to use \'--dataset-type train\' if you plan to use the results '
                       'to pick a threshold for other tests. You can disable this warning by passing '
                       '\'--no-test-warning\'.')

    genuine_scores, adversarial_scores, genuine_samples, adversarial_samples = tests.roc_curve_test(
        foolbox_model, genuine_loader, adversarial_loader, detector, save_scores)

    false_positive_rates, true_positive_rates, thresholds = utils.roc_curve(
        genuine_scores, adversarial_scores)

    best_threshold, best_tpr, best_fpr = utils.get_best_threshold(
        true_positive_rates, false_positive_rates, thresholds)
    area_under_curve = sklearn.metrics.auc(
        false_positive_rates, true_positive_rates)

    info = [['ROC AUC', '{:2.2f}%'.format(area_under_curve * 100.0)],
            ['Best Threshold', '{:2.2e}'.format(best_threshold)],
            ['Best Threshold True Positive Rate', '{:2.2f}%'.format(best_tpr * 100.0)],
            ['Best Threshold False Positive Rate', '{:2.2f}%'.format(best_fpr * 100.0)]]

    header = ['Genuine Scores', 'Adversarial Scores',
              'Thresholds', 'True Positive Rates', 'False Positive Rates']

    true_positive_rates = ['{:2.2f}%'.format(
        true_positive_rate * 100.0) for true_positive_rate in true_positive_rates]
    false_positive_rates = ['{:2.2f}%'.format(
        false_positive_rate * 100.0) for false_positive_rate in false_positive_rates]

    columns = [genuine_scores, adversarial_scores,
               thresholds, true_positive_rates, false_positive_rates]

    utils.save_results(results_path, table=columns, command=command,
                       info=info, header=header)

    if save_scores:
        # Remove failures

        genuine_not_failed = np.not_equal(genuine_scores, failure_value)
        genuine_samples = genuine_samples[genuine_not_failed]
        genuine_scores = genuine_scores[genuine_not_failed]

        adversarial_not_failed = np.not_equal(
            adversarial_scores, failure_value)
        adversarial_samples = adversarial_samples[adversarial_not_failed]
        adversarial_scores = adversarial_scores[adversarial_not_failed]

        genuine_list = zip(genuine_samples, genuine_scores)
        adversarial_list = zip(adversarial_samples, adversarial_scores)

        dataset = (genuine_list, adversarial_list)

        utils.save_zip(dataset, score_dataset_path)
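The threshold selection described in the docstring (the best Youden index, TPR - FPR) can be sketched directly with scikit-learn. This is a hypothetical stand-in for utils.roc_curve and utils.get_best_threshold, assuming that higher scores mean "more likely adversarial":

import numpy as np
import sklearn.metrics

def best_youden_threshold(genuine_scores, adversarial_scores):
    """Hypothetical sketch: pick the ROC threshold that maximizes the Youden index (TPR - FPR)."""
    scores = np.concatenate([genuine_scores, adversarial_scores])
    labels = np.concatenate([np.zeros(len(genuine_scores)),
                             np.ones(len(adversarial_scores))])
    false_positive_rates, true_positive_rates, thresholds = sklearn.metrics.roc_curve(labels, scores)
    best = np.argmax(true_positive_rates - false_positive_rates)
    return thresholds[best], true_positive_rates[best], false_positive_rates[best]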
Example #13
def attack(options, adversarial_dataset_path, no_test_warning):
    """
    Runs an attack against the model.

    \b
    Stores the following results:
        Success Rate: The success rate of the attack.
        Average Distance: The average L_p distance of the successful adversarial samples from their original samples.
        Median Distance: The median L_p distance of the successful adversarial samples from their original samples.
        Adjusted Median Distance: The median L_p distance of the adversarial samples from their original samples, treating failed attacks as samples with distance Infinity.
    """

    attack_p = options['attack_p']
    attack_name = options['attack_name']
    attack_workers = options['attack_workers']
    command = options['command']
    cuda = options['cuda']
    dataset_type = options['dataset_type']
    foolbox_model = options['foolbox_model']
    loader = options['loader']
    results_path = options['results_path']

    criterion = foolbox.criteria.Misclassification()

    attack = parsing.parse_attack(attack_name, attack_p, criterion)

    save_adversarials = adversarial_dataset_path is not None

    if dataset_type == 'test' and save_adversarials and not no_test_warning:
        logger.warning(
            'Remember to use \'--dataset-type train\' if you plan to use the generated adversarials '
            'to train or calibrate an adversarial detector. You can disable this warning by passing '
            '\'--no-test-warning\'.')

    samples_count, correct_count, successful_attack_count, distances, adversarials, adversarial_ground_truths = tests.attack_test(
        foolbox_model,
        loader,
        attack,
        attack_p,
        cuda,
        attack_workers,
        save_adversarials=save_adversarials)

    accuracy = correct_count / samples_count
    success_rate = successful_attack_count / correct_count

    failure_count = correct_count - successful_attack_count
    average_distance, median_distance, adjusted_median_distance = utils.distance_statistics(
        distances, failure_count)

    info = [['Base Accuracy', '{:2.2f}%'.format(accuracy * 100.0)],
            ['Success Rate', '{:2.2f}%'.format(success_rate * 100.0)],
            ['Average Distance', '{:2.2e}'.format(average_distance)],
            ['Median Distance', '{:2.2e}'.format(median_distance)],
            [
                'Adjusted Median Distance',
                '{:2.2e}'.format(adjusted_median_distance)
            ], ['Samples Count', str(samples_count)],
            ['Correct Count', str(correct_count)],
            ['Successful Attack Count',
             str(successful_attack_count)]]

    header = ['Distances']

    utils.save_results(results_path,
                       table=[distances],
                       command=command,
                       info=info,
                       header=header)

    if save_adversarials:
        dataset = list(zip(adversarials,
                           adversarial_ground_truths)), success_rate
        utils.save_zip(dataset, adversarial_dataset_path)
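A minimal NumPy sketch of the three distance statistics described in the docstring (a hypothetical helper, not utils.distance_statistics): failed attacks are treated as samples with infinite distance when computing the adjusted median.

import numpy as np

def distance_statistics_sketch(distances, failure_count):
    """Hypothetical sketch: average and median over successful attacks only; the adjusted
    median also counts each failed attack as an infinite distance."""
    distances = np.asarray(distances, dtype=float)
    average_distance = np.mean(distances)
    median_distance = np.median(distances)
    adjusted = np.concatenate([distances, np.full(failure_count, np.inf)])
    adjusted_median_distance = np.median(adjusted)
    return average_distance, median_distance, adjusted_median_distance

# Example: 4 successful attacks with distances 1-4 and 2 failed attacks:
# average 2.5, median 2.5, adjusted median 3.5 (the two infinite entries push the median up).
print(distance_statistics_sketch([1.0, 2.0, 3.0, 4.0], failure_count=2))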