Example #1
    def __init__(self, classifier, x_train, y_train):
        """
        Create an :class:`.ActivationDefence` object with the provided classifier.

        :param classifier: model evaluated for poison
        :type classifier: :class:`.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        """
        super(ActivationDefence, self).__init__(classifier, x_train, y_train)
        kwargs = {'nb_clusters': 2, 'clustering_method': "KMeans", 'nb_dims': 10, 'reduce': 'PCA',
                  'cluster_analysis': "smaller"}
        self.set_params(**kwargs)
        self.activations_by_class = []
        self.clusters_by_class = []
        self.assigned_clean_by_class = []
        self.is_clean_by_class = []
        self.errors_by_class = []
        self.red_activations_by_class = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.confidence_level = []
        self.poisonous_clusters = []
Example #2
    def __init__(self, classifier, x_train, y_train, verbose=True):
        """
        Create an ActivationDefence object with the provided classifier.

        :param classifier: model evaluated for poison
        :type classifier: :class:`Classifier`
        :param x_train: dataset used to train `classifier`
        :type x_train: :class:`numpy.ndarray`
        :param y_train: labels used to train `classifier`
        :type y_train: :class:`numpy.ndarray`
        :param verbose: When True prints more information
        :type verbose: `bool`
        """
        super(ActivationDefence, self).__init__(classifier, x_train, y_train,
                                                verbose)
        kwargs = {
            'nb_clusters': 2,
            'clustering_method': "KMeans",
            'nb_dims': 10,
            'reduce': 'PCA',
            'cluster_analysis': "smaller"
        }
        self.set_params(**kwargs)
        self.activations_by_class = []
        self.clusters_by_class = []
        self.assigned_clean_by_class = []
        self.is_clean_by_class = []
        self.errors_by_class = []
        self.red_activations_by_class = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.confidence_level = []
Example #3
    def __init__(self, classifier, x_train, y_train, **kwargs):
        """
        Create a :class:`.SpectralSignatureDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :param x_train: dataset used to train the classifier.
        :param y_train: labels used to train the classifier.
        """
        super(SpectralSignatureDefense, self).__init__(classifier, x_train,
                                                       y_train)
        self.set_params(**kwargs)
        self.evaluator = GroundTruthEvaluator()
Example #4
    def __init__(self,
                 classifier,
                 x_train,
                 y_train,
                 x_val,
                 y_val,
                 perf_func='accuracy',
                 pp_cal=0.2,
                 pp_quiz=0.2,
                 calibrated=True,
                 eps=0.1,
                 **kwargs):
        """
        Create a :class:`.RONIDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :type classifier: :class:`art.classifiers.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        :param x_val: trusted data points
        :type x_val: `np.ndarray`
        :param y_val: trusted data labels
        :type y_val: `np.ndarray`
        :param perf_func: performance function to use
        :type perf_func: `str` or `callable`
        :param pp_cal: percent of training data used for calibration
        :type pp_cal: `float`
        :param pp_quiz: percent of training data used for quiz set
        :type pp_quiz: `float`
        :param calibrated: True if using the calibrated form of RONI
        :type calibrated: `bool`
        :param eps: performance threshold if using uncalibrated RONI
        :type eps: `float`
        """
        super(RONIDefense, self).__init__(classifier, x_train, y_train)
        n_points = len(x_train)
        quiz_idx = np.random.randint(n_points, size=int(pp_quiz * n_points))
        self.calibrated = calibrated
        self.x_quiz = np.copy(self.x_train[quiz_idx])
        self.y_quiz = np.copy(self.y_train[quiz_idx])
        if self.calibrated:
            _, self.x_cal, _, self.y_cal = train_test_split(self.x_train,
                                                            self.y_train,
                                                            test_size=pp_cal,
                                                            shuffle=True)
        self.eps = eps
        self.evaluator = GroundTruthEvaluator()
        self.x_val = x_val
        self.y_val = y_val
        self.perf_func = perf_func
        self.is_clean_lst = list()
        self.set_params(**kwargs)
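
RONIDefense implements Reject On Negative Impact: a point is suspicious if training on it degrades performance on trusted data. Below is a minimal, self-contained sketch of a leave-one-out variant of that idea, not the class's actual implementation; the scikit-learn model and the `roni_suspicious` helper are illustrative assumptions.

import numpy as np
from sklearn.linear_model import LogisticRegression

def roni_suspicious(x_train, y_train, x_quiz, y_quiz, eps=0.1):
    # Hypothetical helper, not part of the defence above: flag points whose
    # removal improves quiz-set accuracy by more than eps.
    def perf(x, y):
        return LogisticRegression(max_iter=1000).fit(x, y).score(x_quiz, y_quiz)

    base = perf(x_train, y_train)
    suspicious = np.zeros(len(x_train), dtype=bool)
    for i in range(len(x_train)):
        mask = np.arange(len(x_train)) != i
        suspicious[i] = perf(x_train[mask], y_train[mask]) - base > eps
    return suspicious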
Example #5
    def setUp(self):
        self.evaluator = GroundTruthEvaluator()
        self.n_classes = 3
        self.n_dp = 10
        self.n_dp_mix = 5

        self.is_clean_all_clean = [[] for i in range(self.n_classes)]
        self.is_clean_all_poison = [[] for i in range(self.n_classes)]
        self.is_clean_mixed = [[] for i in range(self.n_classes)]
        self.is_clean_comp_mix = [[] for i in range(self.n_classes)]

        for i in range(self.n_classes):
            self.is_clean_all_clean[i] = [1] * self.n_dp
            self.is_clean_all_poison[i] = [0] * self.n_dp
            self.is_clean_mixed[i] = [1, 0, 0, 1, 0, 1, 1, 1, 0, 0]
            self.is_clean_comp_mix[i] = [0, 1, 1, 0, 1, 0, 0, 0, 1, 1]
Example #6
    def __init__(self,
                 classifier,
                 x_train,
                 y_train,
                 p_train,
                 x_val=None,
                 y_val=None,
                 eps=0.2,
                 perf_func='accuracy',
                 pp_valid=0.2,
                 **kwargs):
        """
        Create a :class:`.ProvenanceDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :type classifier: :class:`art.classifiers.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        :param p_train: provenance features for each training data point as one hot vectors
        :type p_train: `np.ndarray`
        :param x_val: validation data for defense (optional)
        :type x_val: `np.ndarray`
        :param y_val: validation labels for defense (optional)
        :type y_val: `np.ndarray`
        :param eps: threshold for performance shift in suspicious data
        :type eps: `float`
        :param perf_func: performance function used to evaluate effectiveness of defense
        :type perf_func: `str` or `callable`
        :param pp_valid: The percent of training data to use as validation data (for defense without validation data)
        :type pp_valid: `float`
        """
        super(ProvenanceDefense, self).__init__(classifier, x_train, y_train)
        self.p_train = p_train
        self.num_devices = self.p_train.shape[1]
        self.x_val = x_val
        self.y_val = y_val
        self.eps = eps
        self.perf_func = perf_func
        self.pp_valid = pp_valid
        self.assigned_clean_by_device = []
        self.is_clean_by_device = []
        self.errors_by_device = []
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.set_params(**kwargs)
Example #7
    def setUpClass(cls):
        cls.evaluator = GroundTruthEvaluator()
        cls.n_classes = 3
        cls.n_dp = 10
        cls.n_dp_mix = 5

        cls.is_clean_all_clean = [[] for _ in range(cls.n_classes)]
        cls.is_clean_all_poison = [[] for _ in range(cls.n_classes)]
        cls.is_clean_mixed = [[] for _ in range(cls.n_classes)]
        cls.is_clean_comp_mix = [[] for _ in range(cls.n_classes)]

        for i in range(cls.n_classes):
            cls.is_clean_all_clean[i] = [1] * cls.n_dp
            cls.is_clean_all_poison[i] = [0] * cls.n_dp
            cls.is_clean_mixed[i] = [1, 0, 0, 1, 0, 1, 1, 1, 0, 0]
            cls.is_clean_comp_mix[i] = [0, 1, 1, 0, 1, 0, 0, 0, 1, 1]
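
ProvenanceDefense above groups training points by originating device using the one-hot provenance matrix `p_train` (with `num_devices = p_train.shape[1]`). A tiny sketch of that grouping; the toy data is made up for illustration:

import numpy as np

# Five points from three devices, one-hot encoded as described for p_train.
p_train = np.eye(3)[[0, 1, 1, 2, 0]]
device_ids = np.argmax(p_train, axis=1)
by_device = [np.where(device_ids == d)[0] for d in range(p_train.shape[1])]
print(by_device)  # [array([0, 4]), array([1, 2]), array([3])]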
Example #8
class ActivationDefence(PoisonFilteringDefence):
    """
    Method from [Chen et al., 2018] performing poisoning detection based on activations clustering.
    Paper link: https://arxiv.org/abs/1811.03728
    """
    defence_params = ['nb_clusters', 'clustering_method', 'nb_dims', 'reduce', 'cluster_analysis']
    valid_clustering = ['KMeans']
    valid_reduce = ['PCA', 'FastICA', 'TSNE']
    valid_analysis = ['smaller', 'distance', 'relative-size', 'silhouette-scores']

    TOO_SMALL_ACTIVATIONS = 32  # Threshold used to print a warning when there are too few activations

    def __init__(self, classifier, x_train, y_train):
        """
        Create an :class:`.ActivationDefence` object with the provided classifier.

        :param classifier: model evaluated for poison
        :type classifier: :class:`.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        """
        super(ActivationDefence, self).__init__(classifier, x_train, y_train)
        kwargs = {'nb_clusters': 2, 'clustering_method': "KMeans", 'nb_dims': 10, 'reduce': 'PCA',
                  'cluster_analysis': "smaller"}
        self.set_params(**kwargs)
        self.activations_by_class = []
        self.clusters_by_class = []
        self.assigned_clean_by_class = []
        self.is_clean_by_class = []
        self.errors_by_class = []
        self.red_activations_by_class = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.confidence_level = []
        self.poisonous_clusters = []

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous
        :type is_clean: `list`
        :param kwargs: a dictionary of defence-specific parameters
        :type kwargs: `dict`
        :return: JSON object with confusion matrix
        :rtype: `jsonObject`
        """
        if not is_clean:
            raise ValueError("is_clean was not provided while invoking evaluate_defence.")

        self.set_params(**kwargs)

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        _, self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth:
        self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)
        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(self.assigned_clean_by_class,
                                                                                    self.is_clean_by_class)
        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: a dictionary of detection-specific parameters
        :type kwargs: `dict`
        :return: (report, is_clean_lst):
                where report is a JSON object that contains information specified by the clustering analysis technique,
                and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i]
                is clean and is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)
        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        report, self.assigned_clean_by_class = self.analyze_clusters()
        # Here, assigned_clean_by_class[i][j] is 1 if the j-th datapoint in the i-th class was
        # determined to be clean by activation clustering

        # Build an array that matches the original indexes of x_train
        n_train = len(self.x_train)
        indices_by_class = self._segment_by_class(np.arange(n_train), self.y_train)
        self.is_clean_lst = [0] * n_train

        for assigned_clean, dp in zip(self.assigned_clean_by_class, indices_by_class):
            for assignment, index_dp in zip(assigned_clean, dp):
                if assignment == 1:
                    self.is_clean_lst[index_dp] = 1

        return report, self.is_clean_lst

    def cluster_activations(self, **kwargs):
        """
        Clusters activations and returns cluster_by_class and red_activations_by_class,
        where cluster_by_class[i][j] is the cluster to which the j-th datapoint in the
        i-th class belongs, and red_activations_by_class[i][j] holds the corresponding
        dimensionality-reduced activations.

        :param kwargs: a dictionary of cluster-specific parameters
        :type kwargs: `dict`
        :return: clusters per class and activations by class
        :rtype: `tuple`
        """
        self.set_params(**kwargs)
        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = cluster_activations(
            self.activations_by_class,
            nb_clusters=self.nb_clusters,
            nb_dims=self.nb_dims,
            reduce=self.reduce,
            clustering_method=self.clustering_method)

        return self.clusters_by_class, self.red_activations_by_class

    def analyze_clusters(self, **kwargs):
        """
        This function analyzes the clusters according to the provided method

        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: (report, assigned_clean_by_class), where the report is a json object and assigned_clean_by_class
                 is an array of arrays that contains which data points were classified as clean.
        :rtype: `tuple(json, np.ndarray)`
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        analyzer = ClusteringAnalyzer()

        if self.cluster_analysis == 'smaller':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_size(self.clusters_by_class)
        elif self.cluster_analysis == 'relative-size':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_relative_size(self.clusters_by_class)
        elif self.cluster_analysis == 'distance':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_distance(self.clusters_by_class,
                                               separated_activations=self.red_activations_by_class)
        elif self.cluster_analysis == 'silhouette-scores':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_silhouette_score(self.clusters_by_class,
                                                       reduced_activations_by_class=self.red_activations_by_class)
        else:
            raise ValueError(
                "Unsupported cluster analysis technique " + self.cluster_analysis)

        # Add to the report current parameters used to run the defence and the analysis summary
        report = dict(list(report.items()) + list(self.get_params().items()))
        import json
        jreport = json.dumps(report)

        return jreport, self.assigned_clean_by_class

    def visualize_clusters(self, x_raw, save=True, folder='.', **kwargs):
        """
        This function creates the sprite/mosaic visualization for clusters. When save=True,
        it also stores a sprite (mosaic) per cluster in DATA_PATH.

        :param x_raw: Images used to train the classifier (before pre-processing)
        :type x_raw: `np.ndarray`
        :param save: Boolean specifying if image should be saved
        :type  save: `bool`
        :param folder: Directory where the sprites will be saved inside DATA_PATH folder
        :type folder: `str`
        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: Array of sprite images sprites_by_class, where sprites_by_class[i][j] contains the
                 sprite of class i, cluster j.
        :rtype: `np.ndarray`
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        x_raw_by_class = self._segment_by_class(x_raw, self.y_train)
        x_raw_by_cluster = [[[] for x in range(self.nb_clusters)] for y in range(self.classifier.nb_classes)]

        # Get all data in x_raw in the right cluster
        for n_class, cluster in enumerate(self.clusters_by_class):
            for j, assigned_cluster in enumerate(cluster):
                x_raw_by_cluster[n_class][assigned_cluster].append(x_raw_by_class[n_class][j])

        # Now create sprites:
        sprites_by_class = [[[] for x in range(self.nb_clusters)] for y in range(self.classifier.nb_classes)]
        for i, class_i in enumerate(x_raw_by_cluster):
            for j, images_cluster in enumerate(class_i):
                title = 'Class_' + str(i) + '_cluster_' + str(j) + '_clusterSize_' + str(len(images_cluster))
                f_name = title + '.png'
                f_name = os.path.join(folder, f_name)
                sprite = create_sprite(images_cluster)
                if save:
                    save_image(sprite, f_name)
                sprites_by_class[i][j] = sprite

        return sprites_by_class

    def plot_clusters(self, save=True, folder='.', **kwargs):
        """
        Creates a 3D-plot to visualize each cluster; each cluster is assigned a different color in the plot.
        When save=True, it also stores the 3D-plot per cluster in DATA_PATH.

        :param save: Boolean specifying if image should be saved
        :type  save: `bool`
        :param folder: Directory where the sprites will be saved inside DATA_PATH folder
        :type folder: `str`
        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: None
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        # Get activations reduced to 3-components:
        separated_reduced_activations = []
        for ac in self.activations_by_class:
            reduced_activations = reduce_dimensionality(ac, nb_dims=3)
            separated_reduced_activations.append(reduced_activations)

        # For each class generate a plot:
        for class_id, (labels, coordinates) in enumerate(zip(self.clusters_by_class, separated_reduced_activations)):
            f_name = ''
            if save:
                f_name = os.path.join(folder, 'plot_class_' + str(class_id) + '.png')
            plot_3d(coordinates, labels, save=save, f_name=f_name)

    def set_params(self, **kwargs):
        """
        Takes in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.

        :param nb_clusters: Number of clusters to be produced. Should be at least 2.
        :type nb_clusters: `int`
        :param clustering_method: Clustering method to use
        :type clustering_method: `str`
        :param nb_dims: Number of dimensions to project on
        :type nb_dims: `int`
        :param reduce: Reduction technique
        :type reduce: `str`
        :param cluster_analysis: Method to analyze the clusters
        :type cluster_analysis: `str`
        """
        # Save defence-specific parameters
        super(ActivationDefence, self).set_params(**kwargs)

        if self.nb_clusters <= 1:
            raise ValueError(
                "Wrong number of clusters, should be greater than or equal to 2. Provided: " + str(self.nb_clusters))
        if self.nb_dims <= 0:
            raise ValueError("Wrong number of dimensions, should be greater than 0.")
        if self.clustering_method not in self.valid_clustering:
            raise ValueError("Unsupported clustering method: " + self.clustering_method)
        if self.reduce not in self.valid_reduce:
            raise ValueError("Unsupported reduction method: " + self.reduce)
        if self.cluster_analysis not in self.valid_analysis:
            raise ValueError("Unsupported method for cluster analysis method: " + self.cluster_analysis)

        return True

    def _get_activations(self):
        """
        Find activations from :class:`.Classifier`.
        """
        logger.info('Getting activations')

        nb_layers = len(self.classifier.layer_names)
        activations = self.classifier.get_activations(self.x_train, layer=nb_layers - 1)

        # Note: classifier.predict(self.x_train, logits=True) would be the wrong way to get the activations here
        nodes_last_layer = np.shape(activations)[1]

        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            logger.warning("Number of activations in last hidden layer is too small. Method may not work properly. "
                           "Size: %s", str(nodes_last_layer))
        return activations

    def _segment_by_class(self, data, features):
        """
        Returns segmented data according to specified features.

        :param data: to be segmented
        :type data: `np.ndarray`
        :param features: features used to segment data, e.g., segment according to predicted label or to `y_train`
        :type features: `np.ndarray`
        :return: segmented data according to specified features.
        :rtype: `list`
        """
        n_classes = self.classifier.nb_classes
        by_class = [[] for _ in range(n_classes)]
        for indx, feature in enumerate(features):
            if n_classes > 2:
                assigned = np.argmax(feature)
            else:
                assigned = int(feature)
            by_class[assigned].append(data[indx])

        return [np.asarray(i) for i in by_class]
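
A hedged usage sketch for the ActivationDefence class above; `classifier`, `x_train`, `y_train`, and `is_clean_ground_truth` are assumed to come from the surrounding ART setup, and the keyword values simply restate the defaults set in `__init__`:

# Hypothetical driver code for the defence above.
defence = ActivationDefence(classifier, x_train, y_train)
report, is_clean_lst = defence.detect_poison(nb_clusters=2, nb_dims=10,
                                             reduce='PCA',
                                             cluster_analysis='smaller')
# With ground truth available, evaluate_defence returns the per-class
# confusion matrix as a JSON string.
conf_matrix_json = defence.evaluate_defence(is_clean_ground_truth)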
Example #9
class TestGroundTruth(unittest.TestCase):
    def setUp(self):
        self.evaluator = GroundTruthEvaluator()
        self.n_classes = 3
        self.n_dp = 10
        self.n_dp_mix = 5

        self.is_clean_all_clean = [[] for i in range(self.n_classes)]
        self.is_clean_all_poison = [[] for i in range(self.n_classes)]
        self.is_clean_mixed = [[] for i in range(self.n_classes)]
        self.is_clean_comp_mix = [[] for i in range(self.n_classes)]

        for i in range(self.n_classes):
            self.is_clean_all_clean[i] = [1] * self.n_dp
            self.is_clean_all_poison[i] = [0] * self.n_dp
            self.is_clean_mixed[i] = [1, 0, 0, 1, 0, 1, 1, 1, 0, 0]
            self.is_clean_comp_mix[i] = [0, 1, 1, 0, 1, 0, 0, 0, 1, 1]

    def test_analyze_correct_all_clean(self):
        # perfect detection all data is actually clean:
        errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.is_clean_all_clean, self.is_clean_all_clean)

        json_object = json.loads(conf_matrix_json)
        self.assertEqual(len(json_object.keys()), self.n_classes)
        self.assertEqual(len(errors_by_class), self.n_classes)

        # print(json_object)
        for i in range(self.n_classes):
            res_class_i = json_object['class_' + str(i)]
            self.assertEqual(res_class_i['TruePositive']['rate'], 'N/A')
            self.assertEqual(res_class_i['TrueNegative']['rate'], 100)
            self.assertEqual(res_class_i['FalseNegative']['rate'], 'N/A')
            self.assertEqual(res_class_i['FalsePositive']['rate'], 0)

            self.assertEqual(res_class_i['TruePositive']['numerator'], 0)
            self.assertEqual(res_class_i['TruePositive']['denominator'], 0)

            self.assertEqual(res_class_i['TrueNegative']['numerator'],
                             self.n_dp)
            self.assertEqual(res_class_i['TrueNegative']['denominator'],
                             self.n_dp)

            self.assertEqual(res_class_i['FalseNegative']['numerator'], 0)
            self.assertEqual(res_class_i['FalseNegative']['denominator'], 0)

            self.assertEqual(res_class_i['FalsePositive']['numerator'], 0)
            self.assertEqual(res_class_i['FalsePositive']['denominator'],
                             self.n_dp)

            # all errors_by_class should be 1 (errors_by_class[i] = 1 if marked clean, is clean)
            for item in errors_by_class[i]:
                self.assertEqual(item, 1)

    def test_analyze_correct_all_poison(self):
        # perfect detection all data is actually poison
        errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.is_clean_all_poison, self.is_clean_all_poison)

        json_object = json.loads(conf_matrix_json)
        self.assertEqual(len(json_object.keys()), self.n_classes)
        self.assertEqual(len(errors_by_class), self.n_classes)

        # print(json_object)
        for i in range(self.n_classes):
            res_class_i = json_object['class_' + str(i)]
            self.assertEqual(res_class_i['TruePositive']['rate'], 100)
            self.assertEqual(res_class_i['TrueNegative']['rate'], 'N/A')
            self.assertEqual(res_class_i['FalseNegative']['rate'], 0)
            self.assertEqual(res_class_i['FalsePositive']['rate'], 'N/A')

            self.assertEqual(res_class_i['TruePositive']['numerator'],
                             self.n_dp)
            self.assertEqual(res_class_i['TruePositive']['denominator'],
                             self.n_dp)

            self.assertEqual(res_class_i['TrueNegative']['numerator'], 0)
            self.assertEqual(res_class_i['TrueNegative']['denominator'], 0)

            self.assertEqual(res_class_i['FalseNegative']['numerator'], 0)
            self.assertEqual(res_class_i['FalseNegative']['denominator'],
                             self.n_dp)

            self.assertEqual(res_class_i['FalsePositive']['numerator'], 0)
            self.assertEqual(res_class_i['FalsePositive']['denominator'], 0)

            # all errors_by_class should be 0 (all_errors_by_class[i] = 0 if marked poison, is poison)
            for item in errors_by_class[i]:
                self.assertEqual(item, 0)

    def test_analyze_correct_mixed(self):
        # perfect detection mixed
        errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.is_clean_mixed, self.is_clean_mixed)

        json_object = json.loads(conf_matrix_json)
        self.assertEqual(len(json_object.keys()), self.n_classes)
        self.assertEqual(len(errors_by_class), self.n_classes)

        # print(json_object)
        for i in range(self.n_classes):
            res_class_i = json_object['class_' + str(i)]
            self.assertEqual(res_class_i['TruePositive']['rate'], 100)
            self.assertEqual(res_class_i['TrueNegative']['rate'], 100)
            self.assertEqual(res_class_i['FalseNegative']['rate'], 0)
            self.assertEqual(res_class_i['FalsePositive']['rate'], 0)

            self.assertEqual(res_class_i['TruePositive']['numerator'],
                             self.n_dp_mix)
            self.assertEqual(res_class_i['TruePositive']['denominator'],
                             self.n_dp_mix)

            self.assertEqual(res_class_i['TrueNegative']['numerator'],
                             self.n_dp_mix)
            self.assertEqual(res_class_i['TrueNegative']['denominator'],
                             self.n_dp_mix)

            self.assertEqual(res_class_i['FalseNegative']['numerator'], 0)
            self.assertEqual(res_class_i['FalseNegative']['denominator'],
                             self.n_dp_mix)

            self.assertEqual(res_class_i['FalsePositive']['numerator'], 0)
            self.assertEqual(res_class_i['FalsePositive']['denominator'],
                             self.n_dp_mix)

            # errors_by_class[i][j] should mirror is_clean_mixed[i][j]: 1 if marked clean and clean, 0 if marked poison and poison
            for j, item in enumerate(errors_by_class[i]):
                self.assertEqual(item, self.is_clean_mixed[i][j])

    def test_analyze_fully_misclassified(self):
        # Completely wrong
        # order parameters: analyze_correctness(assigned_clean_by_class, is_clean_by_class)
        errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.is_clean_all_clean, self.is_clean_all_poison)

        json_object = json.loads(conf_matrix_json)
        self.assertEqual(len(json_object.keys()), self.n_classes)
        self.assertEqual(len(errors_by_class), self.n_classes)

        print(json_object)
        for i in range(self.n_classes):
            res_class_i = json_object['class_' + str(i)]
            self.assertEqual(res_class_i['TruePositive']['rate'], 0)
            self.assertEqual(res_class_i['TrueNegative']['rate'], 'N/A')
            self.assertEqual(res_class_i['FalseNegative']['rate'], 100)
            self.assertEqual(res_class_i['FalsePositive']['rate'], 'N/A')

            self.assertEqual(res_class_i['TruePositive']['numerator'], 0)
            self.assertEqual(res_class_i['TruePositive']['denominator'],
                             self.n_dp)

            self.assertEqual(res_class_i['TrueNegative']['numerator'], 0)
            self.assertEqual(res_class_i['TrueNegative']['denominator'], 0)

            self.assertEqual(res_class_i['FalseNegative']['numerator'],
                             self.n_dp)
            self.assertEqual(res_class_i['FalseNegative']['denominator'],
                             self.n_dp)

            self.assertEqual(res_class_i['FalsePositive']['numerator'], 0)
            self.assertEqual(res_class_i['FalsePositive']['denominator'], 0)

            # all errors_by_class should be 3 (errors_by_class[i] = 3 if marked clean, is poison)
            for item in errors_by_class[i]:
                self.assertEqual(item, 3)

    def test_analyze_fully_misclassified_rev(self):
        # Completely wrong
        # order parameters: analyze_correctness(assigned_clean_by_class, is_clean_by_class)
        errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.is_clean_all_poison, self.is_clean_all_clean)

        json_object = json.loads(conf_matrix_json)
        self.assertEqual(len(json_object.keys()), self.n_classes)
        self.assertEqual(len(errors_by_class), self.n_classes)

        pprint.pprint(json_object)
        for i in range(self.n_classes):
            res_class_i = json_object['class_' + str(i)]
            self.assertEqual(res_class_i['TruePositive']['rate'], 'N/A')
            self.assertEqual(res_class_i['TrueNegative']['rate'], 0)
            self.assertEqual(res_class_i['FalseNegative']['rate'], 'N/A')
            self.assertEqual(res_class_i['FalsePositive']['rate'], 100)

            self.assertEqual(res_class_i['TruePositive']['numerator'], 0)
            self.assertEqual(res_class_i['TruePositive']['denominator'], 0)

            self.assertEqual(res_class_i['TrueNegative']['numerator'], 0)
            self.assertEqual(res_class_i['TrueNegative']['denominator'],
                             self.n_dp)

            self.assertEqual(res_class_i['FalseNegative']['numerator'], 0)
            self.assertEqual(res_class_i['FalseNegative']['denominator'], 0)

            self.assertEqual(res_class_i['FalsePositive']['numerator'],
                             self.n_dp)
            self.assertEqual(res_class_i['FalsePositive']['denominator'],
                             self.n_dp)

            # all errors_by_class should be 2 (errors_by_class[i] = 2 if marked poison, is clean)
            for item in errors_by_class[i]:
                self.assertEqual(item, 2)
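
Taken together, the assertions above pin down a four-way per-point encoding in `errors_by_class`. The following sketch is my reading of that mapping, reconstructed from the test expectations rather than taken from the evaluator's source:

def error_code(assigned_clean, is_clean):
    # Presumed per-point encoding used by GroundTruthEvaluator; poison
    # counts as the "positive" class in the confusion matrix.
    if assigned_clean == 1 and is_clean == 1:
        return 1  # marked clean, is clean (true negative)
    if assigned_clean == 0 and is_clean == 0:
        return 0  # marked poison, is poison (true positive)
    if assigned_clean == 1 and is_clean == 0:
        return 3  # marked clean, is poison (false negative)
    return 2      # marked poison, is clean (false positive)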
Example #10
class SpectralSignatureDefense(PoisonFilteringDefence):
    """
    Method from [Tran et al., 2018] performing poisoning detection based on Spectral Signatures.
    Paper link: https://arxiv.org/abs/1811.00636
    """

    defence_params = PoisonFilteringDefence.defence_params + [
        "classifier",
        "x_train",
        "y_train",
        "batch_size",
        "eps_multiplier",
        "ub_pct_poison",
    ]

    def __init__(self, classifier, x_train, y_train, **kwargs):
        """
        Create a :class:`.SpectralSignatureDefense` object with the provided classifier.
        :param classifier: Model evaluated for poison.
        :param x_train: dataset used to train the classifier.
        :param y_train: labels used to train the classifier.
        """
        super(SpectralSignatureDefense, self).__init__(classifier, x_train,
                                                       y_train)
        self.set_params(**kwargs)
        self.evaluator = GroundTruthEvaluator()

    def evaluate_defence(self, is_clean, **kwargs):
        """
        If ground truth is known, this function returns a confusion matrix in the form of a JSON object.
        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :param kwargs: A dictionary of defence-specific parameters.
        :return: JSON object with confusion matrix.
        """

        n_classes = self.classifier.nb_classes()
        if is_clean is None or is_clean.size == 0:
            raise ValueError(
                "is_clean was not provided while invoking evaluate_defence.")
        is_clean_by_class = SpectralSignatureDefense.split_by_class(
            is_clean, self.y_train, n_classes)
        _, predicted_clean = self.detect_poison()
        predicted_clean_by_class = SpectralSignatureDefense.split_by_class(
            predicted_clean, self.y_train, n_classes)

        _, conf_matrix_json = self.evaluator.analyze_correctness(
            predicted_clean_by_class, is_clean_by_class)

        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.
        :return: (report, is_clean_lst):
                where report is None (kept for future ART compatibility),
                and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i]
                is clean and is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        """

        self.set_params(**kwargs)

        n_classes = self.classifier.nb_classes()
        nb_layers = len(self.classifier.layer_names)

        features_x_poisoned = self.classifier.get_activations(
            self.x_train, layer=nb_layers - 1, batch_size=self.batch_size)

        features_split = SpectralSignatureDefense.split_by_class(
            features_x_poisoned, self.y_train, n_classes)
        keep_by_class = []
        for idx, feature in enumerate(features_split):
            score = SpectralSignatureDefense.spectral_signature_scores(feature)
            score_cutoff = np.quantile(
                score, max(1 - self.eps_multiplier * self.ub_pct_poison, 0.0))
            keep_by_class.append(score < score_cutoff)

        base_indices_by_class = SpectralSignatureDefense.split_by_class(
            np.arange(self.y_train.shape[0]), self.y_train, n_classes)
        is_clean_lst = np.zeros_like(self.y_train, dtype=int)

        for keep_booleans, indices in zip(keep_by_class,
                                          base_indices_by_class):
            for keep_boolean, idx in zip(keep_booleans, indices):
                if keep_boolean:
                    is_clean_lst[idx] = 1

        return None, is_clean_lst

    @staticmethod
    def spectral_signature_scores(R):
        """
        :param R: Matrix of feature representations
        :return: Outlier scores for each observation based on spectral signature
        """
        M = R - np.mean(R, axis=0)
        # Following Algorithm #1, use SVD of centered features, not of covariance
        _, _, v = np.linalg.svd(M, full_matrices=False)
        eigs = v[:1]
        score = np.matmul(M, np.transpose(eigs))**2
        return score

    @staticmethod
    def split_by_class(data, labels, num_classes):
        """
        :param data: Iterable of features
        :param labels: Labels, not in one-hot representations
        :param num_classes: Number of classes of labels
        :return: List of numpy arrays of features split by labels
        """
        split = [[] for _ in range(num_classes)]
        for idx, label in enumerate(labels):
            split[int(label)].append(data[idx])
        return [np.asarray(dat) for dat in split]

    def set_params(self, **kwargs):
        """
        Takes in a dictionary of parameters and applies defense-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.
        """
        # Save defence-specific parameters
        super(SpectralSignatureDefense, self).set_params(**kwargs)

        return True
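
A quick synthetic sanity check for `spectral_signature_scores`: plant a small cluster of shifted feature vectors and confirm it receives the highest outlier scores. The data is made up purely for illustration:

import numpy as np

rng = np.random.default_rng(0)
clean = rng.normal(size=(95, 32))
poison = rng.normal(size=(5, 32)) + 4.0  # cluster shifted along all dimensions
features = np.vstack([clean, poison])

scores = SpectralSignatureDefense.spectral_signature_scores(features)
# The five planted points (indices 95-99) should dominate the top scores.
print(np.argsort(scores.ravel())[-5:])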
Example #11
class ActivationDefence(PoisonFilteringDefence):
    """
    Method from [Chen et al., 2018] performing poisoning detection based on activations clustering.
    Paper link: https://arxiv.org/abs/1811.03728
    """
    defence_params = [
        'nb_clusters', 'clustering_method', 'nb_dims', 'reduce',
        'cluster_analysis'
    ]
    valid_clustering = ['KMeans']
    valid_reduce = ['PCA', 'FastICA', 'TSNE']
    valid_analysis = [
        'smaller', 'distance', 'relative-size', 'silhouette-scores'
    ]

    TOO_SMALL_ACTIVATIONS = 32  # Threshold used to print a warning when there are too few activations

    def __init__(self, classifier, x_train, y_train):
        """
        Create an :class:`.ActivationDefence` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :type classifier: :class:`.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        """
        super(ActivationDefence, self).__init__(classifier, x_train, y_train)
        kwargs = {
            'nb_clusters': 2,
            'clustering_method': "KMeans",
            'nb_dims': 10,
            'reduce': 'PCA',
            'cluster_analysis': "smaller"
        }
        self.set_params(**kwargs)
        self.activations_by_class = []
        self.clusters_by_class = []
        self.assigned_clean_by_class = []
        self.is_clean_by_class = []
        self.errors_by_class = []
        self.red_activations_by_class = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.confidence_level = []
        self.poisonous_clusters = []

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :type is_clean: `np.ndarray`
        :param kwargs: A dictionary of defence-specific parameters.
        :type kwargs: `dict`
        :return: JSON object with confusion matrix.
        :rtype: `jsonObject`
        """
        if is_clean is None or len(is_clean) == 0:
            raise ValueError(
                "is_clean was not provided while invoking evaluate_defence.")

        self.set_params(**kwargs)

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(
                activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        _, self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth:
        self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)
        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_class, self.is_clean_by_class)
        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :type kwargs: `dict`
        :return: (report, is_clean_lst):
                where report is a dict object that contains information specified by the clustering analysis technique,
                and is_clean_lst is a list where is_clean_lst[i]=1 means that x_train[i]
                is clean and is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(
                activations, self.y_train)
        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations()
        report, self.assigned_clean_by_class = self.analyze_clusters()
        # Here, assigned_clean_by_class[i][j] is 1 if the j-th datapoint in the i-th class was
        # determined to be clean by activation clustering

        # Build an array that matches the original indexes of x_train
        n_train = len(self.x_train)
        indices_by_class = self._segment_by_class(np.arange(n_train),
                                                  self.y_train)
        self.is_clean_lst = [0] * n_train

        for assigned_clean, dp in zip(self.assigned_clean_by_class,
                                      indices_by_class):
            for assignment, index_dp in zip(assigned_clean, dp):
                if assignment == 1:
                    self.is_clean_lst[index_dp] = 1

        return report, self.is_clean_lst

    def cluster_activations(self, **kwargs):
        """
        Clusters activations and returns cluster_by_class and red_activations_by_class,
        where cluster_by_class[i][j] is the cluster to which the j-th datapoint in the
        i-th class belongs, and red_activations_by_class[i][j] holds the corresponding
        dimensionality-reduced activations.

        :param kwargs: A dictionary of cluster-specific parameters.
        :type kwargs: `dict`
        :return: Clusters per class and activations by class.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)
        if not self.activations_by_class:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(
                activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = cluster_activations(
            self.activations_by_class,
            nb_clusters=self.nb_clusters,
            nb_dims=self.nb_dims,
            reduce=self.reduce,
            clustering_method=self.clustering_method)

        return self.clusters_by_class, self.red_activations_by_class

    def analyze_clusters(self, **kwargs):
        """
        This function analyzes the clusters according to the provided method.

        :param kwargs: A dictionary of cluster-analysis-specific parameters.
        :type kwargs: `dict`
        :return: (report, assigned_clean_by_class), where the report is a dict object and assigned_clean_by_class
                 is an array of arrays that contains which data points were classified as clean.
        :rtype: `tuple(dict, np.ndarray)`
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        analyzer = ClusteringAnalyzer()

        if self.cluster_analysis == 'smaller':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_size(self.clusters_by_class)
        elif self.cluster_analysis == 'relative-size':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_relative_size(self.clusters_by_class)
        elif self.cluster_analysis == 'distance':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_distance(self.clusters_by_class,
                                               separated_activations=self.red_activations_by_class)
        elif self.cluster_analysis == 'silhouette-scores':
            self.assigned_clean_by_class, self.poisonous_clusters, report \
                = analyzer.analyze_by_silhouette_score(self.clusters_by_class,
                                                       reduced_activations_by_class=self.red_activations_by_class)
        else:
            raise ValueError("Unsupported cluster analysis technique " +
                             self.cluster_analysis)

        # Add to the report current parameters used to run the defence and the analysis summary
        report = dict(list(report.items()) + list(self.get_params().items()))

        return report, self.assigned_clean_by_class

    @staticmethod
    def relabel_poison_ground_truth(classifier,
                                    x,
                                    y_fix,
                                    test_set_split=0.7,
                                    tolerable_backdoor=0.01,
                                    max_epochs=50,
                                    batch_epochs=10):
        """
        Reverts a poison attack by continuing to train the current classifier on `x`, `y_fix`.
        `test_set_split` determines the fraction of `x` used as the training set, while `1-test_set_split`
        determines the fraction used as the test set.

        :param classifier: Classifier to be fixed
        :type classifier: :class:`.Classifier`
        :param x: Samples.
        :type x: `np.ndarray`
        :param y_fix: True labels of `x`.
        :type y_fix: `np.ndarray`
        :param test_set_split: determines how much data goes into the training set.
               Here `test_set_split*len(y_fix)` determines the number of data points in `x_train`
               and `(1-test_set_split) * len(y_fix)` the number of data points in `x_test`.
        :type test_set_split: `float`
        :param tolerable_backdoor: Threshold that determines what is the maximum tolerable backdoor success rate.
        :type tolerable_backdoor: `float`
        :param max_epochs: Maximum number of epochs that the model will be trained.
        :type max_epochs: `int`
        :param batch_epochs: Number of epochs to be trained before checking current state of model.
        :type batch_epochs: `int`
        :return: (improve_factor, classifier)
        :rtype: `float`, `.Classifier`
        """
        # Split data into testing and training:
        n_train = int(len(x) * test_set_split)
        x_train, x_test = x[:n_train], x[n_train:]
        y_train, y_test = y_fix[:n_train], y_fix[n_train:]

        import time
        filename = 'original_classifier' + str(time.time()) + '.p'
        ActivationDefence._pickle_classifier(classifier, filename)

        # Now train using y_fix:
        improve_factor, fixed_classifier = train_remove_backdoor(
            classifier,
            x_train,
            y_train,
            x_test,
            y_test,
            tolerable_backdoor=tolerable_backdoor,
            max_epochs=max_epochs,
            batch_epochs=batch_epochs)
        # Only update classifier if there was an improvement:
        if improve_factor < 0:
            classifier = ActivationDefence._unpickle_classifier(filename)
            return 0, classifier

        ActivationDefence._remove_pickle(filename)
        return improve_factor, classifier

    @staticmethod
    def relabel_poison_cross_validation(classifier,
                                        x,
                                        y_fix,
                                        n_splits=10,
                                        tolerable_backdoor=0.01,
                                        max_epochs=50,
                                        batch_epochs=10):
        """
        Reverts a poison attack by continuing to train the current classifier on `x`, `y_fix`.
        `n_splits` determines the number of cross-validation splits.

        :param classifier: Classifier to be fixed
        :type classifier: :class:`.Classifier`
        :param x: Samples that were mislabeled.
        :type x: `np.ndarray`
        :param y_fix: True label of `x`.
        :type y_fix: `np.ndarray`
        :param n_splits: Determines how many splits to use in cross validation.
        :type n_splits: `int`
        :param tolerable_backdoor: Threshold that determines what is the maximum tolerable backdoor success rate.
        :type tolerable_backdoor: `float`
        :param max_epochs: Maximum number of epochs that the model will be trained.
        :type max_epochs: `int`
        :param batch_epochs: Number of epochs to be trained before checking current state of model.
        :type batch_epochs: `int`
        :return: (improve_factor, classifier)
        :rtype: `float`, `.Classifier`
        """

        # Train using cross validation
        from sklearn.model_selection import KFold
        kf = KFold(n_splits=n_splits, shuffle=True)

        import time
        filename = 'original_classifier' + str(time.time()) + '.p'
        ActivationDefence._pickle_classifier(classifier, filename)
        curr_improvement = 0

        for i, (train_index, test_index) in enumerate(kf.split(x)):
            # Obtain partition:
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y_fix[train_index], y_fix[test_index]
            # Unpickle original model:
            curr_classifier = ActivationDefence._unpickle_classifier(filename)

            new_improvement, fixed_classifier = train_remove_backdoor(
                curr_classifier,
                x_train,
                y_train,
                x_test,
                y_test,
                tolerable_backdoor=tolerable_backdoor,
                max_epochs=max_epochs,
                batch_epochs=batch_epochs)
            if curr_improvement < new_improvement and new_improvement > 0:
                curr_improvement = new_improvement
                classifier = fixed_classifier
                logger.info('Selected as best model so far: ' +
                            str(curr_improvement))

        ActivationDefence._remove_pickle(filename)
        return curr_improvement, classifier

    @staticmethod
    def _pickle_classifier(classifier, file_name):
        """
        Pickles the provided classifier and stores it using the provided file_name in folder `art.DATA_PATH`.

        :param classifier: Classifier to be pickled.
        :type classifier: :class:`.Classifier`
        :param file_name: Name of the file where the classifier will be pickled
        :return: None
        """

        import pickle
        import os
        from art import DATA_PATH
        full_path = os.path.join(DATA_PATH, file_name)
        folder = os.path.split(full_path)[0]
        if not os.path.exists(folder):
            os.makedirs(folder)

        with open(full_path, 'wb') as f:
            pickle.dump(classifier, f)

    @staticmethod
    def _unpickle_classifier(file_name):
        """
        Unpickles classifier using the filename provided. Function assumes that the pickle is in `art.DATA_PATH`.
        
        :param file_name:
        :return:
        """
        import os
        from art import DATA_PATH
        import pickle

        full_path = os.path.join(DATA_PATH, file_name)
        logger.info('Loading classifier from ' + str(full_path))
        with open(full_path, 'rb') as f:
            loaded_classifier = pickle.load(f)
            return loaded_classifier

    @staticmethod
    def _remove_pickle(file_name):
        """
        Erases the pickle with the provided file name

        :param file_name: File name without directory
        :return: None
        """
        import os
        from art import DATA_PATH
        full_path = os.path.join(DATA_PATH, file_name)
        os.remove(full_path)

    def visualize_clusters(self, x_raw, save=True, folder='.', **kwargs):
        """
        This function creates the sprite/mosaic visualization for clusters. When save=True,
        it also stores a sprite (mosaic) per cluster in DATA_PATH.

        :param x_raw: Images used to train the classifier (before pre-processing)
        :type x_raw: `np.ndarray`
        :param save: Boolean specifying if image should be saved
        :type  save: `bool`
        :param folder: Directory where the sprites will be saved inside DATA_PATH folder
        :type folder: `str`
        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: Array of sprite images sprites_by_class, where sprites_by_class[i][j] contains the
                 sprite of class i, cluster j.
        :rtype: `np.ndarray`
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        x_raw_by_class = self._segment_by_class(x_raw, self.y_train)
        x_raw_by_cluster = [[[] for x in range(self.nb_clusters)]
                            for y in range(self.classifier.nb_classes)]

        # Get all data in x_raw in the right cluster
        for n_class, cluster in enumerate(self.clusters_by_class):
            for j, assigned_cluster in enumerate(cluster):
                x_raw_by_cluster[n_class][assigned_cluster].append(
                    x_raw_by_class[n_class][j])

        # Now create sprites:
        sprites_by_class = [[[] for x in range(self.nb_clusters)]
                            for y in range(self.classifier.nb_classes)]
        for i, class_i in enumerate(x_raw_by_cluster):
            for j, images_cluster in enumerate(class_i):
                title = 'Class_' + str(i) + '_cluster_' + str(
                    j) + '_clusterSize_' + str(len(images_cluster))
                f_name = title + '.png'
                f_name = os.path.join(folder, f_name)
                sprite = create_sprite(images_cluster)
                if save:
                    save_image(sprite, f_name)
                sprites_by_class[i][j] = sprite

        return sprites_by_class

    def plot_clusters(self, save=True, folder='.', **kwargs):
        """
        Creates a 3D-plot to visualize each cluster; each cluster is assigned a different color in the plot.
        When save=True, it also stores the 3D-plot per cluster in DATA_PATH.

        :param save: Boolean specifying if image should be saved
        :type  save: `bool`
        :param folder: Directory where the sprites will be saved inside DATA_PATH folder
        :type folder: `str`
        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: None
        """
        self.set_params(**kwargs)

        if not self.clusters_by_class:
            self.cluster_activations()

        # Get activations reduced to 3-components:
        separated_reduced_activations = []
        for ac in self.activations_by_class:
            reduced_activations = reduce_dimensionality(ac, nb_dims=3)
            separated_reduced_activations.append(reduced_activations)

        # For each class generate a plot:
        for class_id, (labels, coordinates) in enumerate(
                zip(self.clusters_by_class, separated_reduced_activations)):
            f_name = ''
            if save:
                f_name = os.path.join(folder,
                                      'plot_class_' + str(class_id) + '.png')
            plot_3d(coordinates, labels, save=save, f_name=f_name)

    def set_params(self, **kwargs):
        """
        Takes in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.

        :param nb_clusters: Number of clusters to be produced. Should be at least 2.
        :type nb_clusters: `int`
        :param clustering_method: Clustering method to use
        :type clustering_method: `str`
        :param nb_dims: Number of dimensions to project on
        :type nb_dims: `int`
        :param reduce: Reduction technique
        :type reduce: `str`
        :param cluster_analysis: Method to analyze the clusters
        :type cluster_analysis: `str`
        """
        # Save defence-specific parameters
        super(ActivationDefence, self).set_params(**kwargs)

        if self.nb_clusters <= 1:
            raise ValueError(
                "Wrong number of clusters, should be greater or equal to 2. Provided: "
                + str(self.nb_clusters))
        if self.nb_dims <= 0:
            raise ValueError("Wrong number of dimensions, should be greater than 0. Provided: "
                             + str(self.nb_dims))
        if self.clustering_method not in self.valid_clustering:
            raise ValueError("Unsupported clustering method: " +
                             self.clustering_method)
        if self.reduce not in self.valid_reduce:
            raise ValueError("Unsupported reduction method: " + self.reduce)
        if self.cluster_analysis not in self.valid_analysis:
            raise ValueError(
                "Unsupported method for cluster analysis method: " +
                self.cluster_analysis)

        return True

    def _get_activations(self):
        """
        Find activations from :class:`.Classifier`.
        """
        logger.info('Getting activations')

        nb_layers = len(self.classifier.layer_names)
        activations = self.classifier.get_activations(self.x_train,
                                                      layer=nb_layers - 1)

        # Note: self.classifier.predict(self.x_train, logits=True) would be the wrong way to get activations here.
        nodes_last_layer = np.shape(activations)[1]

        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            logger.warning(
                "Number of activations in last hidden layer is too small. Method may not work properly. "
                "Size: %s", str(nodes_last_layer))
        return activations

    def _segment_by_class(self, data, features):
        """
        Returns segmented data according to specified features.

        :param data: data to be segmented
        :type data: `np.ndarray`
        :param features: features used to segment data, e.g., segment according to predicted label or to `y_train`
        :type features: `np.ndarray`
        :return: segmented data according to specified features.
        :rtype: `list`
        """
        n_classes = self.classifier.nb_classes
        by_class = [[] for _ in range(n_classes)]
        for indx, feature in enumerate(features):
            if n_classes > 2:
                assigned = np.argmax(feature)
            else:
                assigned = int(feature)
            by_class[assigned].append(data[indx])

        return [np.asarray(i) for i in by_class]
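
# ---------------------------------------------------------------------------
# A minimal usage sketch for the visualization methods above (not part of the
# original example). It assumes `defence` is an already constructed
# ActivationDefence and `x_raw` holds the un-preprocessed training images;
# all names here are hypothetical.
def _demo_visualize_defence(defence, x_raw):
    # Cluster activations with the parameters set via set_params, then render
    # one sprite (mosaic) per (class, cluster) pair under DATA_PATH.
    sprites_by_class = defence.visualize_clusters(x_raw, save=True,
                                                  folder='demo_sprites')
    # Project activations to 3 dimensions and save one 3D scatter per class.
    defence.plot_clusters(save=True, folder='demo_plots')
    return sprites_by_class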
Example #12
class ProvenanceDefense(PoisonFilteringDefence):
    """
    Implements methods performing poisoning detection based on data provenance.

    | Paper link: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8473440
    """

    defence_params = [
        'classifier', 'x_train', 'y_train', 'p_train', 'x_val', 'y_val', 'eps',
        'perf_func', 'pp_valid'
    ]

    def __init__(self,
                 classifier,
                 x_train,
                 y_train,
                 p_train,
                 x_val=None,
                 y_val=None,
                 eps=0.2,
                 perf_func='accuracy',
                 pp_valid=0.2,
                 **kwargs):
        """
        Create an :class:`.ProvenanceDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :type classifier: :class:`art.classifiers.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        :param p_train: provenance features for each training data point, as one-hot vectors
        :type p_train: `np.ndarray`
        :param x_val: validation data for defense (optional)
        :type x_val: `np.ndarray`
        :param y_val: validation labels for defense (optional)
        :type y_val: `np.ndarray`
        :param eps: threshold for performance shift in suspicious data
        :type eps: `float`
        :param perf_func: performance function used to evaluate effectiveness of defense
        :type perf_func: `str` or `callable`
        :param pp_valid: The percent of training data to use as validation data (for defense without validation data)
        :type pp_valid: `float`
        """
        super(ProvenanceDefense, self).__init__(classifier, x_train, y_train)
        self.p_train = p_train
        self.num_devices = self.p_train.shape[1]
        self.x_val = x_val
        self.y_val = y_val
        self.eps = eps
        self.perf_func = perf_func
        self.pp_valid = pp_valid
        self.assigned_clean_by_device = []
        self.is_clean_by_device = []
        self.errors_by_device = []
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.set_params(**kwargs)

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :type is_clean: `np.ndarray`
        :param kwargs: A dictionary of defence-specific parameters.
        :type kwargs: `dict`
        :return: JSON object with confusion matrix.
        :rtype: `jsonObject`
        """
        if is_clean is None or is_clean.size == 0:
            raise ValueError(
                "is_clean was not provided while invoking evaluate_defence.")
        self.set_params(**kwargs)

        if not self.assigned_clean_by_device:
            self.detect_poison()

        self.is_clean_by_device = segment_by_class(is_clean, self.p_train,
                                                   self.num_devices)
        self.errors_by_device, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_device, self.is_clean_by_device)
        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :type kwargs: `dict`
        :return: (report, is_clean_lst):
                where report is a dict object that contains information specified by the provenance detection method
                and is_clean_lst is a list, where is_clean_lst[i]=1 means that x_train[i]
                is clean and is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        if self.x_val is None:
            report = self.detect_poison_untrusted()
        else:
            report = self.detect_poison_partially_trusted()

        n_train = len(self.x_train)
        indices_by_provenance = segment_by_class(np.arange(n_train),
                                                 self.p_train,
                                                 self.num_devices)
        self.is_clean_lst = np.array([1] * n_train)

        for device in report:
            self.is_clean_lst[indices_by_provenance[device]] = 0
        self.assigned_clean_by_device = segment_by_class(
            np.array(self.is_clean_lst), self.p_train, self.num_devices)

        return report, self.is_clean_lst

    def detect_poison_partially_trusted(self, **kwargs):
        """
        Detect poison given trusted validation data

        :return: dictionary where keys are suspected poisonous device indices and values are performance differences
        :rtype: `dict`
        """
        self.set_params(**kwargs)

        if self.x_val is None or self.y_val is None:
            raise ValueError("Trusted data unavailable")

        suspected = {}

        unfiltered_data = np.copy(self.x_train)
        unfiltered_labels = np.copy(self.y_train)

        segments = segment_by_class(self.x_train, self.p_train,
                                    self.num_devices)
        for device_idx, segment in enumerate(segments):
            filtered_data, filtered_labels = self.filter_input(
                unfiltered_data, unfiltered_labels, segment)

            unfiltered_model = deepcopy(self.classifier)
            filtered_model = deepcopy(self.classifier)

            unfiltered_model.fit(unfiltered_data, unfiltered_labels)
            filtered_model.fit(filtered_data, filtered_labels)

            var_w = performance_diff(filtered_model,
                                     unfiltered_model,
                                     self.x_val,
                                     self.y_val,
                                     perf_function=self.perf_func)
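            # If the model retrained without this device outperforms the one
            # trained with it by more than eps, flag the device and keep its
            # data filtered out for the remaining comparisons.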
            if self.eps < var_w:
                suspected[device_idx] = var_w
                unfiltered_data = filtered_data
                unfiltered_labels = filtered_labels

        return suspected

    def detect_poison_untrusted(self, **kwargs):
        """
        Detect poison given no trusted validation data

        :return: dictionary where keys are suspected poisonous device indices and values are performance differences
        :rtype: `dict`
        """
        self.set_params(**kwargs)

        suspected = {}

        train_data, valid_data, train_labels, valid_labels, train_prov, valid_prov = \
            train_test_split(self.x_train, self.y_train, self.p_train, test_size=self.pp_valid)

        train_segments = segment_by_class(train_data, train_prov,
                                          self.num_devices)
        valid_segments = segment_by_class(valid_data, valid_prov,
                                          self.num_devices)

        for device_idx, (train_segment, valid_segment) in enumerate(
                zip(train_segments, valid_segments)):
            filtered_data, filtered_labels = self.filter_input(
                train_data, train_labels, train_segment)

            unfiltered_model = deepcopy(self.classifier)
            filtered_model = deepcopy(self.classifier)

            unfiltered_model.fit(train_data, train_labels)
            filtered_model.fit(filtered_data, filtered_labels)

            valid_non_device_data, valid_non_device_labels = \
                self.filter_input(valid_data, valid_labels, valid_segment)
            var_w = performance_diff(filtered_model,
                                     unfiltered_model,
                                     valid_non_device_data,
                                     valid_non_device_labels,
                                     perf_function=self.perf_func)
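            # var_w compares the filtered and unfiltered models on validation
            # points drawn from all other devices, so a poisoned device
            # cannot vouch for its own data.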

            if self.eps < var_w:
                suspected[device_idx] = var_w
                train_data = filtered_data
                train_labels = filtered_labels
                valid_data = valid_non_device_data
                valid_labels = valid_non_device_labels

        return suspected

    @staticmethod
    def filter_input(data, labels, segment):
        """
        Return the data and labels that are not part of a specified segment

        :param data: The data to segment
        :type data: `np.ndarray`
        :param labels: The corresponding labels to segment
        :type labels: `np.ndarray`
        :param segment: the set of data points (e.g., those originating from one device) to filter out
        :type segment: `np.ndarray`
        :return: tuple of (filtered_data, filtered_labels)
        :rtype: (`np.ndarray`, `np.ndarray`)
        """
        filter_mask = np.array([
            np.isin(data[i, :], segment, invert=True).any()
            for i in range(data.shape[0])
        ])
        filtered_data = data[filter_mask]
        filtered_labels = labels[filter_mask]

        return filtered_data, filtered_labels

    def set_params(self, **kwargs):
        """
        Takes in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.
        """
        # Save defence-specific parameters
        super(ProvenanceDefense, self).set_params(**kwargs)

        if self.eps < 0:
            raise ValueError("Value of epsilon must be at least 0")

        if self.pp_valid < 0:
            raise ValueError("Value of pp_valid must be at least 0")

        if len(self.x_train) != len(self.y_train):
            raise ValueError("x_train and y_train do not match in shape")

        if len(self.x_train) != len(self.p_train):
            raise ValueError("Provenance features do not match data")

        return True
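
# ---------------------------------------------------------------------------
# A minimal usage sketch for ProvenanceDefense (not part of the original
# example), assuming `classifier` is an ART classifier, `p_train` holds
# one-hot provenance (device) features, and (x_val, y_val) is trusted
# validation data; all names here are hypothetical.
def _demo_provenance_defense(classifier, x_train, y_train, p_train,
                             x_val=None, y_val=None):
    defence = ProvenanceDefense(classifier, x_train, y_train, p_train,
                                x_val=x_val, y_val=y_val, eps=0.2)
    # With trusted data this runs detect_poison_partially_trusted; without
    # it, part of the training set is held out instead (untrusted variant).
    report, is_clean_lst = defence.detect_poison()
    # `report` maps suspected device indices to their performance shifts.
    return report, is_clean_lst
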
class RONIDefense(PoisonFilteringDefence):
    """
    Close implementation based on the description in Nelson's
    'Behavior of Machine Learning Algorithms in Adversarial Environments', Ch. 4.4

    | Textbook link: https://people.eecs.berkeley.edu/~adj/publications/paper-files/EECS-2010-140.pdf
    """
    defence_params = [
        'classifier', 'x_train', 'y_train', 'x_val', 'y_val', 'perf_func',
        'calibrated', 'eps'
    ]

    def __init__(self,
                 classifier,
                 x_train,
                 y_train,
                 x_val,
                 y_val,
                 perf_func='accuracy',
                 pp_cal=0.2,
                 pp_quiz=0.2,
                 calibrated=True,
                 eps=0.1,
                 **kwargs):
        """
        Create a :class:`.RONIDefense` object with the provided classifier.

        :param classifier: Model evaluated for poison.
        :type classifier: :class:`art.classifiers.Classifier`
        :param x_train: dataset used to train the classifier.
        :type x_train: `np.ndarray`
        :param y_train: labels used to train the classifier.
        :type y_train: `np.ndarray`
        :param x_val: trusted data points
        :type x_val: `np.ndarray`
        :param y_val: trusted data labels
        :type y_val: `np.ndarray`
        :param perf_func: performance function to use
        :type perf_func: `str` or `callable`
        :param pp_cal: percent of training data used for calibration
        :type pp_cal: `float`
        :param pp_quiz: percent of training data used for quiz set
        :type pp_quiz: `float`
        :param calibrated: True if using the calibrated form of RONI
        :type calibrated: `bool`
        :param eps: performance threshold if using uncalibrated RONI
        :type eps: `float`
        """
        super(RONIDefense, self).__init__(classifier, x_train, y_train)
        n_points = len(x_train)
        quiz_idx = np.random.randint(n_points, size=int(pp_quiz * n_points))
        self.calibrated = calibrated
        self.x_quiz = np.copy(self.x_train[quiz_idx])
        self.y_quiz = np.copy(self.y_train[quiz_idx])
        if self.calibrated:
            _, self.x_cal, _, self.y_cal = train_test_split(self.x_train,
                                                            self.y_train,
                                                            test_size=pp_cal,
                                                            shuffle=True)
        self.eps = eps
        self.evaluator = GroundTruthEvaluator()
        self.x_val = x_val
        self.y_val = y_val
        self.perf_func = perf_func
        self.is_clean_lst = list()
        self.set_params(**kwargs)

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: Ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous.
        :type is_clean: `np.ndarray`
        :param kwargs: A dictionary of defence-specific parameters.
        :type kwargs: `dict`
        :return: JSON object with confusion matrix.
        :rtype: `jsonObject`
        """
        self.set_params(**kwargs)
        if len(self.is_clean_lst) == 0:
            self.detect_poison()

        if is_clean is None or len(is_clean) != len(self.is_clean_lst):
            raise ValueError("Invalid value for is_clean.")

        _, conf_matrix = self.evaluator.analyze_correctness(
            [self.is_clean_lst], [is_clean])
        return conf_matrix

    def detect_poison(self, **kwargs):
        """
        Returns poison detected and a report.

        :param kwargs: A dictionary of detection-specific parameters.
        :type kwargs: `dict`
        :return: (report, is_clean_lst):
                where report is a dict object that contains information specified by the detection method
                and is_clean_lst is a list, where is_clean_lst[i]=1 means that x_train[i]
                is clean and is_clean_lst[i]=0 means that x_train[i] was classified as poison.
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        x_suspect = self.x_train
        y_suspect = self.y_train
        x_trusted = self.x_val
        y_trusted = self.y_val

        self.is_clean_lst = [1 for _ in range(len(x_suspect))]
        report = {}

        before_classifier = deepcopy(self.classifier)
        before_classifier.fit(x_suspect, y_suspect)

        for idx in np.random.permutation(len(x_suspect)):
            x_i = x_suspect[idx]
            y_i = y_suspect[idx]

            after_classifier = deepcopy(before_classifier)
            after_classifier.fit(x=np.vstack([x_trusted, x_i]),
                                 y=np.vstack([y_trusted, y_i]))
            acc_shift = performance_diff(before_classifier,
                                         after_classifier,
                                         self.x_quiz,
                                         self.y_quiz,
                                         perf_function=self.perf_func)
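            # Reject On Negative Impact: if adding (x_i, y_i) causes a
            # suspicious drop in quiz performance, mark the point as poison;
            # otherwise absorb it into the trusted set and keep the updated
            # model as the new baseline.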
            if self.is_suspicious(before_classifier, acc_shift):
                self.is_clean_lst[idx] = 0
                report[idx] = acc_shift
            else:
                before_classifier = after_classifier
                x_trusted = np.vstack([x_trusted, x_i])
                y_trusted = np.vstack([y_trusted, y_i])

        return report, self.is_clean_lst

    def is_suspicious(self, before_classifier, perf_shift):
        """
        Returns True if a given performance shift is suspicious

        :param before_classifier: The classifier without untrusted data
        :type before_classifier: `art.classifiers.classifier.Classifier`
        :param perf_shift: a shift in performance
        :type perf_shift: `float`
        :return: True if a given performance shift is suspicious. False otherwise.
        :rtype: `bool`
        """
        if self.calibrated:
            median, std_dev = self.get_calibration_info(before_classifier)
            return perf_shift < median - 3 * std_dev

        return perf_shift < -self.eps

    def get_calibration_info(self, before_classifier):
        """
        Calculate the median and standard deviation of the accuracy shifts caused
        by the calibration set.

        :param before_classifier: The classifier trained without suspicious point
        :type before_classifier: `art.classifiers.classifier.Classifier`
        :return: a tuple consisting of (`median`, `std_dev`)
        :rtype: (`float`, `float`)
        """
        accs = []

        for x_c, y_c in zip(self.x_cal, self.y_cal):
            after_classifier = deepcopy(before_classifier)
            after_classifier.fit(x=np.vstack([self.x_val, x_c]),
                                 y=np.vstack([self.y_val, y_c]))
            accs.append(
                performance_diff(before_classifier,
                                 after_classifier,
                                 self.x_quiz,
                                 self.y_quiz,
                                 perf_function=self.perf_func))

        return np.median(accs), np.std(accs)

    def set_params(self, **kwargs):
        """
        Takes in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.
        """
        super(RONIDefense, self).set_params(**kwargs)

        if len(self.x_train) != len(self.y_train):
            raise ValueError("x_train and y_train do not match shape")

        if self.eps < 0:
            raise ValueError("Value of epsilon must be at least 0")

        return True
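
# ---------------------------------------------------------------------------
# A minimal usage sketch for RONIDefense (not part of the original example),
# assuming (x_val, y_val) is a small trusted base set; all names here are
# hypothetical. With calibrated=True a point is rejected when the shift it
# causes falls more than three standard deviations below the calibration
# median.
def _demo_roni_defense(classifier, x_train, y_train, x_val, y_val):
    defence = RONIDefense(classifier, x_train, y_train, x_val, y_val,
                          perf_func='accuracy', calibrated=True)
    report, is_clean_lst = defence.detect_poison()
    # `report` maps indices of rejected training points to their shifts.
    return report, is_clean_lst
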
class ActivationDefence(PoisonFilteringDefence):
    """
    Class performing Activation Analysis Defence
    """
    defence_params = [
        'n_clusters', 'clustering_method', 'ndims', 'reduce',
        'cluster_analysis'
    ]
    valid_clustering = ['KMeans']
    valid_reduce = ['PCA', 'FastICA', 'TSNE']
    valid_analysis = ['smaller', 'distance']
    TOO_SMALL_ACTIVATIONS = 32  # Threshold used to warn when the number of activations in the last layer is too small

    def __init__(self, classifier, x_train, y_train, verbose=True):
        """
        Create an ActivationDefence object with the provided classifier

        :param classifier: model evaluated for poison
        :type classifier: :class:`Classifier`
        :param x_train: dataset used to train `classifier`
        :type x_train: :class:`numpy.ndarray`
        :param y_train: labels used to train `classifier`
        :type y_train: :class:`numpy.ndarray`
        :param verbose: When True prints more information
        :type verbose: `bool`
        """
        super(ActivationDefence, self).__init__(classifier, x_train, y_train,
                                                verbose)
        kwargs = {
            'n_clusters': 2,
            'clustering_method': "KMeans",
            'ndims': 10,
            'reduce': 'PCA',
            'cluster_analysis': "smaller"
        }
        self.set_params(**kwargs)
        self.activations_by_class = []
        self.clusters_by_class = []
        self.assigned_clean_by_class = []
        self.is_clean_by_class = []
        self.errors_by_class = []
        self.red_activations_by_class = []  # Activations reduced by class
        self.evaluator = GroundTruthEvaluator()
        self.is_clean_lst = []
        self.confidence_level = []

    def evaluate_defence(self, is_clean, **kwargs):
        """
        Returns confusion matrix.

        :param is_clean: ground truth, where is_clean[i]=1 means that x_train[i] is clean and is_clean[i]=0 means
                         x_train[i] is poisonous
        :type is_clean: `list`
        :param kwargs: a dictionary of defence-specific parameters
        :type kwargs: `dict`
        :return: JSON object with confusion matrix
        """
        self.set_params(**kwargs)

        if len(self.activations_by_class) == 0:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(
                activations, self.y_train)

        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations(
        )
        self.assigned_clean_by_class = self.analyze_clusters()

        # Now check ground truth:
        self.is_clean_by_class = self._segment_by_class(is_clean, self.y_train)
        self.errors_by_class, conf_matrix_json = self.evaluator.analyze_correctness(
            self.assigned_clean_by_class,
            self.is_clean_by_class,
            verbose=self.verbose)
        return conf_matrix_json

    def detect_poison(self, **kwargs):
        """
        Returns poison detected.

        :param kwargs: a dictionary of detection-specific parameters
        :type kwargs: `dict`
        :return: 1) confidence_level, 2) is_clean_lst : type List[int], where is_clean_lst[i]=1 means that x_train[i]
                is clean and is_clean_lst[i]=0 means that x_train[i] was classified as poison
        :rtype: `tuple`
        """
        self.set_params(**kwargs)

        if len(self.activations_by_class) == 0:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(
                activations, self.y_train)
        self.clusters_by_class, self.red_activations_by_class = self.cluster_activations(
        )
        self.assigned_clean_by_class = self.analyze_clusters()
        # Here, assigned_clean_by_class[i][j] is 1 if the jth datapoint in the ith class was
        # determined to be clean by activation clustering

        # Build an array that matches the original indexes of x_train
        n_train = len(self.x_train)
        indices_by_class = self._segment_by_class(np.arange(n_train),
                                                  self.y_train)
        self.is_clean_lst = [0] * n_train
        self.confidence_level = [1] * n_train
        for i, (assigned_clean, dp) in enumerate(
                zip(self.assigned_clean_by_class, indices_by_class)):
            for j, (assignment, index_dp) in enumerate(zip(assigned_clean,
                                                           dp)):
                if assignment == 1:
                    self.is_clean_lst[index_dp] = 1

        return self.confidence_level, self.is_clean_lst

    def cluster_activations(self, **kwargs):
        """
        Clusters activations and returns clusters_by_class and red_activations_by_class, where
        clusters_by_class[i][j] is the cluster to which the j-th datapoint in the i-th class
        belongs and red_activations_by_class[i][j] contains the corresponding activations
        reduced by class.

        :param kwargs: a dictionary of cluster-specific parameters
        :type kwargs: `dict`
        :return: `tuple`
        """
        self.set_params(**kwargs)
        if len(self.activations_by_class) == 0:
            activations = self._get_activations()
            self.activations_by_class = self._segment_by_class(
                activations, self.y_train)

        my_clust = ClusteringHandler()
        [self.clusters_by_class,
         self.red_activations_by_class] = my_clust.cluster_activations(
             self.activations_by_class,
             n_clusters=self.n_clusters,
             ndims=self.ndims,
             reduce=self.reduce,
             clustering_method=self.clustering_method)

        return self.clusters_by_class, self.red_activations_by_class

    def analyze_clusters(self, **kwargs):
        """
        This function analyzes the clusters according to the provided method

        :param kwargs: a dictionary of cluster-analysis-specific parameters
        :type kwargs: `dict`
        :return: assigned_clean_by_class, an array of arrays that contains which data points were classified as clean.
        """
        self.set_params(**kwargs)

        if len(self.clusters_by_class) == 0:
            self.cluster_activations()

        if self.cluster_analysis == 'smaller':
            analyzer = SizeAnalyzer()
            self.assigned_clean_by_class = analyzer.analyze_clusters(
                self.clusters_by_class)
        elif self.cluster_analysis == 'distance':
            analyzer = DistanceAnalyzer()
            self.assigned_clean_by_class = analyzer.analyze_clusters(
                self.clusters_by_class,
                separated_activations=self.red_activations_by_class)
        return self.assigned_clean_by_class

    def set_params(self, **kwargs):
        """
        Takes in a dictionary of parameters and applies defence-specific checks before saving them as attributes.
        If a parameter is not provided, it takes its default value.

        :param n_clusters: Number of clusters to be produced. Should be at least 2.
        :type n_clusters: `int`
        :param clustering_method: Clustering method to use
        :type clustering_method: `string`
        :param ndims: Number of dimensions to project on
        :type ndims: `int`
        :param reduce: Reduction technique
        :type reduce: `str`
        :param cluster_analysis: Method to analyze the clusters
        :type cluster_analysis: `str`
        """
        # Save defence-specific parameters
        super(ActivationDefence, self).set_params(**kwargs)

        if self.n_clusters <= 1:
            raise ValueError(
                "Wrong number of clusters, should be greater or equal to 2. Provided: "
                + str(self.n_clusters))
        if self.ndims <= 0:
            raise ValueError("Wrong number of dimensions, should be greater than 0. Provided: "
                             + str(self.ndims))
        if self.clustering_method not in self.valid_clustering:
            raise ValueError("Unsupported clustering method: " +
                             self.clustering_method)
        if self.reduce not in self.valid_reduce:
            raise ValueError("Unsupported reduction method: " + self.reduce)
        if self.cluster_analysis not in self.valid_analysis:
            raise ValueError(
                "Unsupported method for cluster analysis: " +
                self.cluster_analysis)

        return True

    def _get_activations(self):
        """
        Find activations from :class:`Classifier`.
        """
        print('Getting activations...')

        nb_layers = len(self.classifier.layer_names)
        activations = self.classifier.get_activations(self.x_train,
                                                      layer=nb_layers - 1)

        # Note: self.classifier.predict(self.x_train, logits=True) would be the wrong way to get activations here.
        nodes_last_layer = np.shape(activations)[1]

        if nodes_last_layer <= self.TOO_SMALL_ACTIVATIONS:
            print(
                "WARNING: Number of activations in last layer is too small. Method may not work properly. "
                "Size: " + str(nodes_last_layer))
        return activations

    def _segment_by_class(self, data, features):
        """
        Returns segmented data according to specified features

        :param data: data to be segmented
        :type data: :class:`numpy.ndarray`
        :param features: features used to segment data,
                       e.g., segment according to predicted label or to y_train
        :type features: :class:`numpy.ndarray`
        :return: segmented data according to specified features
        :rtype: `list`
        """
        n_classes = self.classifier.nb_classes
        by_class = [[] for _ in range(n_classes)]
        for indx, feature in enumerate(features):
            if n_classes > 2:
                assigned = np.argmax(feature)
            else:
                assigned = int(feature)
            by_class[assigned].append(data[indx])

        return [np.asarray(i) for i in by_class]
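
# ---------------------------------------------------------------------------
# A small standalone illustration (not part of the original code) of the
# per-class segmentation logic used by _segment_by_class above: one-hot
# labels are argmax-ed into class indices and the data split accordingly.
# The arrays below are hypothetical.
def _demo_segment_by_class():
    import numpy as np
    labels = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0]])  # one-hot
    data = np.array([10.0, 11.0, 12.0, 13.0])
    by_class = [[] for _ in range(3)]
    for idx, feature in enumerate(labels):
        by_class[np.argmax(feature)].append(data[idx])
    # Result: class 0 -> [10.0, 13.0], class 1 -> [12.0], class 2 -> [11.0]
    return [np.asarray(c) for c in by_class]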