def _check_gradient(self, classifier: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
                        x: np.ndarray, y: np.ndarray,
                        **kwargs: Union[str, bool, int, float]) -> None:
        """
        Check whether gradient obfuscation can be detected. Projected Gradient Descent is run for 100 iterations with
        the maximum attack budget `eps` set to the upper clip value of the input data and `eps_step` set to
        `eps / (max_iter / 2)`.

        :param classifier: A trained classifier that provides loss gradients.
        :param x: Input data to classifier for evaluation.
        :param y: True labels for input data `x`.
        :param kwargs: Keyword arguments for the Projected Gradient Descent attack used for evaluation, except keywords
                       `classifier` and `eps`.
        """
        # Define parameters for Projected Gradient Descent
        max_iter = 100
        kwargs["max_iter"] = max_iter
        kwargs["eps"] = classifier.clip_values[1]
        kwargs["eps_step"] = classifier.clip_values[1] / (max_iter / 2)

        # Create attack
        attack_pgd = ProjectedGradientDescent(estimator=classifier, **kwargs)

        # Evaluate accuracy with maximal attack budget
        x_adv = attack_pgd.generate(x=x, y=y)
        y_pred_adv = classifier.predict(x=x_adv, y=y)
        accuracy_adv = self._get_accuracy(y=y, y_pred=y_pred_adv)

        # Decide whether obfuscated gradients are likely
        if accuracy_adv > 1 / classifier.nb_classes:
            self._detected_obfuscating_gradients = True
        else:
            self._detected_obfuscating_gradients = False
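
# A minimal, self-contained sketch of the check above, assuming a scikit-learn
# logistic regression wrapped in ART (as in the scikit-learn examples further down).
# Data and variable names are illustrative only: PGD is run at the maximum budget
# (`eps` equal to the upper clip value) and gradient obfuscation is suspected if
# adversarial accuracy stays above chance level.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from art.attacks.evasion import ProjectedGradientDescent
from art.estimators.classification.scikitlearn import ScikitlearnLogisticRegression

x, y = make_classification(n_samples=200, n_features=10, n_informative=5, n_classes=3)
x = (x - x.min()) / (x.max() - x.min())  # scale inputs into [0, 1]

classifier = ScikitlearnLogisticRegression(
    model=LogisticRegression(max_iter=1000).fit(x, y), clip_values=(0.0, 1.0)
)

max_iter = 100
eps = float(classifier.clip_values[1])
attack = ProjectedGradientDescent(
    estimator=classifier, eps=eps, eps_step=eps / (max_iter / 2), max_iter=max_iter
)
x_adv = attack.generate(x=x, y=y)

accuracy_adv = np.mean(np.argmax(classifier.predict(x_adv), axis=1) == y)
detected_obfuscating_gradients = bool(accuracy_adv > 1 / classifier.nb_classes)
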
    def test_5_scikitlearn(self):
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC, LinearSVC

        from art.estimators.classification.scikitlearn import ScikitlearnLogisticRegression, ScikitlearnSVC

        scikitlearn_test_cases = {
            LogisticRegression: ScikitlearnLogisticRegression,
            SVC: ScikitlearnSVC,
            LinearSVC: ScikitlearnSVC,
        }

        (_, _), (x_test, y_test) = self.iris

        for (model_class, classifier_class) in scikitlearn_test_cases.items():
            model = model_class()
            classifier = classifier_class(model=model, clip_values=(0, 1))
            classifier.fit(x=x_test, y=y_test)

            # Test untargeted attack
            attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
            x_test_adv = attack.generate(x_test)
            self.assertFalse((np.array(x_test) == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertFalse((np.argmax(np.array(y_test), axis=1) == preds_adv).all())
            acc = np.sum(preds_adv == np.argmax(np.array(y_test), axis=1)) / len(y_test)
            logger.info(
                "Accuracy of %s on Iris with PGD adversarial examples: %.2f%%",
                classifier.__class__.__name__,
                acc * 100,
            )
    def evaluate(  # type: ignore
        self,
        classifier: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
        x: np.ndarray,
        y: np.ndarray,
        **kwargs: Union[str, bool, int, float],
    ) -> Tuple[List[float], List[float], float]:
        """
        Evaluate the Security Curve of a classifier using Projected Gradient Descent.

        :param classifier: A trained classifier that provides loss gradients.
        :param x: Input data to classifier for evaluation.
        :param y: True labels for input data `x`.
        :param kwargs: Keyword arguments for the Projected Gradient Descent attack used for evaluation, except keywords
                       `classifier` and `eps`.
        :return: A list of the evaluated `eps` values, a list of the corresponding adversarial accuracies, and the
                 benign accuracy.
        """

        kwargs.pop("classifier", None)
        kwargs.pop("eps", None)
        self.eps_list.clear()
        self.accuracy_adv_list.clear()
        self.accuracy = None

        # Check type of eps
        if isinstance(self.eps, int):
            if classifier.clip_values is not None:
                eps_increment = (classifier.clip_values[1] -
                                 classifier.clip_values[0]) / self.eps
            else:
                eps_increment = (np.max(x) - np.min(x)) / self.eps

            for i in range(1, self.eps + 1):
                self.eps_list.append(float(i * eps_increment))

        else:
            self.eps_list = [float(eps) for eps in self.eps]

        # Determine benign accuracy
        y_pred = classifier.predict(x=x, y=y)
        self.accuracy = self._get_accuracy(y=y, y_pred=y_pred)

        # Determine adversarial accuracy for each eps
        for eps in self.eps_list:
            attack_pgd = ProjectedGradientDescent(estimator=classifier,
                                                  eps=eps,
                                                  **kwargs)  # type: ignore

            x_adv = attack_pgd.generate(x=x, y=y)

            y_pred_adv = classifier.predict(x=x_adv, y=y)
            accuracy_adv = self._get_accuracy(y=y, y_pred=y_pred_adv)
            self.accuracy_adv_list.append(accuracy_adv)

        # Check gradients for potential obfuscation
        self._check_gradient(classifier=classifier, x=x, y=y, **kwargs)

        return self.eps_list, self.accuracy_adv_list, self.accuracy
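
# A hedged usage sketch for the evaluate() method above, assuming it belongs to
# ART's SecurityCurve evaluation (importable in recent ART versions as
# `from art.evaluations import SecurityCurve`; adjust the import to your version).
# The classifier and data below are illustrative only; extra keyword arguments
# are forwarded to ProjectedGradientDescent.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from art.estimators.classification.scikitlearn import ScikitlearnLogisticRegression
from art.evaluations import SecurityCurve

x, y = load_iris(return_X_y=True)
x = (x - x.min(axis=0)) / (x.max(axis=0) - x.min(axis=0))  # scale features to [0, 1]
y_one_hot = np.eye(3)[y]  # one-hot labels for the three Iris classes

classifier = ScikitlearnLogisticRegression(
    model=LogisticRegression(max_iter=1000).fit(x, y), clip_values=(0.0, 1.0)
)

# `eps` given as an int: evaluate at 5 evenly spaced budgets between the clip values
curve = SecurityCurve(eps=5)
eps_list, accuracy_adv_list, accuracy_benign = curve.evaluate(
    classifier=classifier, x=x, y=y_one_hot, eps_step=0.1, max_iter=10
)
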
def test_pytorch_binary_pgd(art_warning, get_mnist_dataset):
    """
    This test instantiates a binary classification PyTorch model, then attacks it using PGD

    """
    class BasicModel(nn.Module):
        def __init__(self):
            super(BasicModel, self).__init__()
            self.layer_1 = nn.Linear(20, 32)
            self.layer_2 = nn.Linear(32, 1)

        def forward(self, x):
            x = F.relu(self.layer_1(x))
            x = torch.sigmoid(self.layer_2(x))

            return x

    try:
        device = "cpu"
        x, y = sklearn.datasets.make_classification(n_samples=10000,
                                                    n_features=20,
                                                    n_informative=5,
                                                    n_redundant=2,
                                                    n_repeated=0,
                                                    n_classes=2)
        train_x, test_x, train_y, test_y = sklearn.model_selection.train_test_split(
            x, y, test_size=0.2)
        train_x = train_x.astype(np.float32)
        train_y = train_y.astype(np.float32)
        test_x = test_x.astype(np.float32)
        model = BasicModel()
        loss_func = nn.BCELoss()
        model.to(device)
        opt = optim.Adam(model.parameters(), lr=0.001)
        classifier = PyTorchClassifier(
            model=model,
            loss=loss_func,
            optimizer=opt,
            input_shape=(20,),  # the model consumes 20 tabular features
            nb_classes=2,
        )
        classifier.fit(train_x, train_y, batch_size=64, nb_epochs=3)
        test_x_batch = test_x[0:16]
        preds = classifier.predict(test_x_batch)
        attacker = ProjectedGradientDescent(classifier, eps=0.5)
        generated = attacker.generate(test_x_batch)
        adv_predicted = classifier.predict(generated)
        assert (adv_predicted != preds).all()
    except ARTTestException as e:
        art_warning(e)
    def test_3_pytorch_iris_pt(self):
        (_, _), (x_test, y_test) = self.iris
        classifier = get_tabular_classifier_pt()

        # Test untargeted attack
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
        x_test_adv = attack.generate(x_test)
        self.assertFalse((np.array(x_test) == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(np.array(y_test), axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(np.array(y_test), axis=1)) / len(y_test)
        logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%", (acc * 100))
    def test_keras_iris_unbounded(self):
        classifier = get_tabular_classifier_kr()

        # Recreate a classifier without clip values
        classifier = KerasClassifier(model=classifier._model, use_logits=False, channels_first=True)
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.2, max_iter=5)
        x_test_adv = attack.generate(self.x_test_iris)
        self.assertFalse((self.x_test_iris == x_test_adv).all())
        self.assertTrue((x_test_adv > 1).any())
        self.assertTrue((x_test_adv < 0).any())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_iris, axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%", (acc * 100))
    def __init__(
        self,
        classifier: "ClassifierGradients",
        nb_epochs: int = 391,
        batch_size: int = 128,
        eps: float = 8.0,
        eps_step: float = 2.0,
        max_iter: int = 7,
        num_random_init: Union[bool, int] = True,
    ) -> None:
        """
        Create an :class:`.AdversarialTrainerMadryPGD` instance.

        Default values are for CIFAR-10 in pixel range 0-255.

        :param classifier: Classifier to train adversarially.
        :param nb_epochs: Number of training epochs.
        :param batch_size: Size of the batch on which adversarial samples are generated.
        :param eps: Maximum perturbation that the attacker can introduce.
        :param eps_step: Attack step size (input variation) at each iteration.
        :param max_iter: The maximum number of iterations.
        :param num_random_init: Number of random initialisations within the epsilon ball. For num_random_init=0 the
            attack starts at the original input.
        """
        super(AdversarialTrainerMadryPGD, self).__init__(classifier=classifier)  # type: ignore
        self.batch_size = batch_size
        self.nb_epochs = nb_epochs

        # Set up the adversary and the adversarial trainer:
        self.attack = ProjectedGradientDescent(
            classifier, eps=eps, eps_step=eps_step, max_iter=max_iter, num_random_init=num_random_init,
        )

        self.trainer = AdversarialTrainer(classifier, self.attack, ratio=1.0)  # type: ignore
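
# A hedged usage sketch for the trainer above, assuming ART's AdversarialTrainerMadryPGD
# (from art.defences.trainer) and a PyTorch classifier. The tiny model and random
# "image" data in pixel range 0-255 are illustrative only and match the documented
# 0-255 defaults.
import numpy as np
import torch.nn as nn
import torch.optim as optim

from art.defences.trainer import AdversarialTrainerMadryPGD
from art.estimators.classification import PyTorchClassifier

model = nn.Sequential(nn.Flatten(), nn.Linear(28 * 28, 10))
classifier = PyTorchClassifier(
    model=model,
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=0.001),
    input_shape=(1, 28, 28),
    nb_classes=10,
    clip_values=(0.0, 255.0),
)

x_train = np.random.uniform(0, 255, size=(64, 1, 28, 28)).astype(np.float32)
y_train = np.eye(10)[np.random.randint(0, 10, size=64)].astype(np.float32)

# ratio=1.0 above means every training batch is replaced by adversarial examples
trainer = AdversarialTrainerMadryPGD(classifier, nb_epochs=1, batch_size=32, eps=8.0, eps_step=2.0, max_iter=7)
trainer.fit(x_train, y_train)
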
    def test_pytorch_iris_pt(self):
        classifier = get_tabular_classifier_pt()

        # Test untargeted attack
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
        x_test_adv = attack.generate(self.x_test_iris)
        self.assertFalse((self.x_test_iris == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_iris, axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Accuracy on Iris with PGD adversarial examples: %.2f%%", (acc * 100))

        # Test targeted attack
        targets = random_targets(self.y_test_iris, nb_classes=3)
        attack = ProjectedGradientDescent(classifier, targeted=True, eps=1, eps_step=0.1, max_iter=5)
        x_test_adv = attack.generate(self.x_test_iris, **{"y": targets})
        self.assertFalse((self.x_test_iris == x_test_adv).all())
        self.assertTrue((x_test_adv <= 1).all())
        self.assertTrue((x_test_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
        self.assertTrue((np.argmax(targets, axis=1) == preds_adv).any())
        acc = np.sum(preds_adv == np.argmax(targets, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Success rate of targeted PGD on Iris: %.2f%%", (acc * 100))
    def _test_backend_mnist(self, classifier, x_train, y_train, x_test, y_test):
        x_test_original = x_test.copy()

        # Test PGD with np.inf norm
        attack = ProjectedGradientDescent(classifier, eps=1, eps_step=0.1, max_iter=5)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / len(y_train)
        logger.info("Accuracy on adversarial train examples: %.2f%%", acc * 100)

        acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(np.array(y_test), axis=1)) / len(y_test)
        logger.info("Accuracy on adversarial test examples: %.2f%%", acc * 100)

        # Test PGD with 3 random initialisations
        attack = ProjectedGradientDescent(classifier, num_random_init=3, max_iter=5)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(np.argmax(train_y_pred, axis=1) == np.argmax(y_train, axis=1)) / len(y_train)
        logger.info("Accuracy on adversarial train examples with 3 random initialisations: %.2f%%", acc * 100)

        acc = np.sum(np.argmax(test_y_pred, axis=1) == np.argmax(np.array(y_test), axis=1)) / len(y_test)
        logger.info("Accuracy on adversarial test examples with 3 random initialisations: %.2f%%", acc * 100)

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(np.array(x_test_original) - np.array(x_test)))), 0.0, delta=0.00001)
    def test_7_scikitlearn(self):
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC, LinearSVC

        from art.estimators.classification.scikitlearn import SklearnClassifier

        scikitlearn_test_cases = [
            LogisticRegression(solver="lbfgs", multi_class="auto"),
            SVC(gamma="auto"),
            LinearSVC(),
        ]

        x_test_original = self.x_test_iris.copy()

        for model in scikitlearn_test_cases:
            classifier = SklearnClassifier(model=model, clip_values=(0, 1))
            classifier.fit(x=self.x_test_iris, y=self.y_test_iris)

            # Test untargeted attack
            attack = ProjectedGradientDescent(classifier, eps=1.0, eps_step=0.1, max_iter=5, verbose=False)
            x_test_adv = attack.generate(self.x_test_iris)
            self.assertFalse((self.x_test_iris == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertFalse((np.argmax(self.y_test_iris, axis=1) == preds_adv).all())
            acc = np.sum(preds_adv == np.argmax(self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
            logger.info(
                "Accuracy of %s on Iris with PGD adversarial examples: %.2f%%",
                classifier.__class__.__name__,
                acc * 100,
            )

            # Test targeted attack
            targets = random_targets(self.y_test_iris, nb_classes=3)
            attack = ProjectedGradientDescent(
                classifier, targeted=True, eps=1.0, eps_step=0.1, max_iter=5, verbose=False
            )
            x_test_adv = attack.generate(self.x_test_iris, **{"y": targets})
            self.assertFalse((self.x_test_iris == x_test_adv).all())
            self.assertTrue((x_test_adv <= 1).all())
            self.assertTrue((x_test_adv >= 0).all())

            preds_adv = np.argmax(classifier.predict(x_test_adv), axis=1)
            self.assertTrue((np.argmax(targets, axis=1) == preds_adv).any())
            acc = np.sum(preds_adv == np.argmax(targets, axis=1)) / self.y_test_iris.shape[0]
            logger.info(
                "Success rate of %s on targeted PGD on Iris: %.2f%%",
                classifier.__class__.__name__,
                acc * 100,
            )

            # Check that x_test has not been modified by attack and classifier
            self.assertAlmostEqual(float(np.max(np.abs(x_test_original - self.x_test_iris))), 0.0, delta=0.00001)
class PoisoningAttackCleanLabelBackdoor(PoisoningAttackBlackBox):
    """
    Implementation of Clean-Label Backdoor Attacks introduced in Gu, et. al. 2017

    Applies a number of backdoor perturbation functions and switches label to target label

    | Paper link: https://arxiv.org/abs/1708.06733
    """

    attack_params = PoisoningAttackBlackBox.attack_params + [
        "backdoor",
        "proxy_classifier",
        "target",
        "pp_poison",
        "norm",
        "eps",
        "eps_step",
        "max_iter",
        "num_random_init",
    ]
    _estimator_requirements = ()

    def __init__(
        self,
        backdoor: PoisoningAttackBackdoor,
        proxy_classifier: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
        target: np.ndarray,
        pp_poison: float = 0.33,
        norm: Union[int, float, str] = np.inf,
        eps: float = 0.3,
        eps_step: float = 0.1,
        max_iter: int = 100,
        num_random_init: int = 0,
    ) -> None:
        """
        Creates a new Clean Label Backdoor poisoning attack

        :param backdoor: the backdoor chosen for this attack
        :param proxy_classifier: the classifier for this attack ideally it solves the same or similar classification
                                 task as the original classifier
        :param target: The target label to poison
        :param pp_poison: The percentage of the data to poison. Note: Only data within the target label is poisoned
        :param norm: The norm of the adversarial perturbation supporting "inf", np.inf, 1 or 2.
        :param eps: Maximum perturbation that the attacker can introduce.
        :param eps_step: Attack step size (input variation) at each iteration.
        :param max_iter: The maximum number of iterations.
        :param num_random_init: Number of random initialisations within the epsilon ball. For num_random_init=0 starting
                                at the original input.
        """
        super().__init__()
        self.backdoor = backdoor
        self.proxy_classifier = proxy_classifier
        self.target = target
        self.pp_poison = pp_poison
        self.attack = ProjectedGradientDescent(
            proxy_classifier,
            norm=norm,
            eps=eps,
            eps_step=eps_step,
            max_iter=max_iter,
            targeted=False,
            num_random_init=num_random_init,
        )
        self._check_params()

    def poison(  # pylint: disable=W0221
        self,
        x: np.ndarray,
        y: Optional[np.ndarray] = None,
        broadcast: bool = True,
        **kwargs
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Calls the perturbation function on the input `x` and returns the perturbed inputs and poison labels for the
        data.

        :param x: An array with the points that initialize attack points.
        :param y: The target labels for the attack.
        :param broadcast: Whether or not to broadcast a single target label.
        :return: A tuple holding the `(poisoning_examples, poisoning_labels)`.
        """
        data = np.copy(x)
        estimated_labels = self.proxy_classifier.predict(
            data) if y is None else np.copy(y)

        # Selected target indices to poison
        all_indices = np.arange(len(data))
        target_indices = all_indices[np.all(estimated_labels == self.target,
                                            axis=1)]
        num_poison = int(self.pp_poison * len(target_indices))
        selected_indices = np.random.choice(target_indices, num_poison)

        # Run untargeted PGD on selected points, making it hard to classify correctly
        perturbed_input = self.attack.generate(data[selected_indices])
        no_change_detected = np.array([
            np.all(data[selected_indices][poison_idx] ==
                   perturbed_input[poison_idx])
            for poison_idx in range(len(perturbed_input))
        ])

        if any(no_change_detected):
            logger.warning(
                "Perturbed input is the same as original data after PGD. Check params."
            )
            idx_no_change = np.arange(
                len(no_change_detected))[no_change_detected]
            logger.warning("%d indices without change: %d", len(idx_no_change),
                           idx_no_change)

        # Add backdoor and poison with the same label
        poisoned_input, _ = self.backdoor.poison(perturbed_input,
                                                 self.target,
                                                 broadcast=broadcast)
        data[selected_indices] = poisoned_input

        return data, estimated_labels

    def _check_params(self) -> None:
        if not isinstance(self.backdoor, PoisoningAttackBackdoor):
            raise ValueError(
                "Backdoor must be of type PoisoningAttackBackdoor")
        if not isinstance(self.attack, ProjectedGradientDescent):
            raise ValueError("There was an issue creating the PGD attack")
        if not 0 < self.pp_poison < 1:
            raise ValueError("pp_poison must be between 0 and 1")
    def _test_framework_vs_numpy(self, classifier):
        # Test PGD with np.inf norm
        attack_np = ProjectedGradientDescentNumpy(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=0,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_np = attack_np.generate(self.x_train_mnist)
        x_test_adv_np = attack_np.generate(self.x_test_mnist)

        attack_fw = ProjectedGradientDescent(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=0,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_fw = attack_fw.generate(self.x_train_mnist)
        x_test_adv_fw = attack_fw.generate(self.x_test_mnist)

        # Test
        self.assertAlmostEqual(np.mean(x_train_adv_np - self.x_train_mnist),
                               np.mean(x_train_adv_fw - self.x_train_mnist),
                               places=6)
        self.assertAlmostEqual(np.mean(x_test_adv_np - self.x_test_mnist),
                               np.mean(x_test_adv_fw - self.x_test_mnist),
                               places=6)

        # Test PGD with L1 norm
        attack_np = ProjectedGradientDescentNumpy(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=1,
            targeted=False,
            num_random_init=0,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_np = attack_np.generate(self.x_train_mnist)
        x_test_adv_np = attack_np.generate(self.x_test_mnist)

        attack_fw = ProjectedGradientDescent(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=1,
            targeted=False,
            num_random_init=0,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_fw = attack_fw.generate(self.x_train_mnist)
        x_test_adv_fw = attack_fw.generate(self.x_test_mnist)

        # Test
        self.assertAlmostEqual(np.mean(x_train_adv_np - self.x_train_mnist),
                               np.mean(x_train_adv_fw - self.x_train_mnist),
                               places=6)
        self.assertAlmostEqual(np.mean(x_test_adv_np - self.x_test_mnist),
                               np.mean(x_test_adv_fw - self.x_test_mnist),
                               places=6)

        # Test PGD with L2 norm
        attack_np = ProjectedGradientDescentNumpy(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=2,
            targeted=False,
            num_random_init=0,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_np = attack_np.generate(self.x_train_mnist)
        x_test_adv_np = attack_np.generate(self.x_test_mnist)

        attack_fw = ProjectedGradientDescent(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=2,
            targeted=False,
            num_random_init=0,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_fw = attack_fw.generate(self.x_train_mnist)
        x_test_adv_fw = attack_fw.generate(self.x_test_mnist)

        # Test
        self.assertAlmostEqual(np.mean(x_train_adv_np - self.x_train_mnist),
                               np.mean(x_train_adv_fw - self.x_train_mnist),
                               places=6)
        self.assertAlmostEqual(np.mean(x_test_adv_np - self.x_test_mnist),
                               np.mean(x_test_adv_fw - self.x_test_mnist),
                               places=6)

        # Test PGD with True targeted
        attack_np = ProjectedGradientDescentNumpy(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=True,
            num_random_init=0,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_np = attack_np.generate(self.x_train_mnist,
                                            self.y_train_mnist)
        x_test_adv_np = attack_np.generate(self.x_test_mnist,
                                           self.y_test_mnist)

        attack_fw = ProjectedGradientDescent(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=True,
            num_random_init=0,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_fw = attack_fw.generate(self.x_train_mnist,
                                            self.y_train_mnist)
        x_test_adv_fw = attack_fw.generate(self.x_test_mnist,
                                           self.y_test_mnist)

        # Test
        self.assertAlmostEqual(np.mean(x_train_adv_np - self.x_train_mnist),
                               np.mean(x_train_adv_fw - self.x_train_mnist),
                               places=6)
        self.assertAlmostEqual(np.mean(x_test_adv_np - self.x_test_mnist),
                               np.mean(x_test_adv_fw - self.x_test_mnist),
                               places=6)

        # Test PGD with num_random_init=2
        master_seed(1234)
        attack_np = ProjectedGradientDescentNumpy(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=2,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_np = attack_np.generate(self.x_train_mnist)
        x_test_adv_np = attack_np.generate(self.x_test_mnist)

        master_seed(1234)
        attack_fw = ProjectedGradientDescent(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=2,
            batch_size=3,
            random_eps=False,
        )
        x_train_adv_fw = attack_fw.generate(self.x_train_mnist)
        x_test_adv_fw = attack_fw.generate(self.x_test_mnist)

        # Test
        self.assertAlmostEqual(np.mean(x_train_adv_np - self.x_train_mnist),
                               np.mean(x_train_adv_fw - self.x_train_mnist),
                               places=6)
        self.assertAlmostEqual(np.mean(x_test_adv_np - self.x_test_mnist),
                               np.mean(x_test_adv_fw - self.x_test_mnist),
                               places=6)

        # Test PGD with random_eps=True
        master_seed(1234)
        attack_np = ProjectedGradientDescentNumpy(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=0,
            batch_size=3,
            random_eps=True,
        )
        x_train_adv_np = attack_np.generate(self.x_train_mnist)
        x_test_adv_np = attack_np.generate(self.x_test_mnist)

        master_seed(1234)
        attack_fw = ProjectedGradientDescent(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=0,
            batch_size=3,
            random_eps=True,
        )
        x_train_adv_fw = attack_fw.generate(self.x_train_mnist)
        x_test_adv_fw = attack_fw.generate(self.x_test_mnist)

        # Test
        self.assertAlmostEqual(np.mean(x_train_adv_np - self.x_train_mnist),
                               np.mean(x_train_adv_fw - self.x_train_mnist),
                               places=6)
        self.assertAlmostEqual(np.mean(x_test_adv_np - self.x_test_mnist),
                               np.mean(x_test_adv_fw - self.x_test_mnist),
                               places=6)

        # Test the masking 1
        master_seed(1234)
        attack_np = ProjectedGradientDescentNumpy(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=1,
            batch_size=3,
            random_eps=True,
        )

        mask = np.random.binomial(n=1,
                                  p=0.5,
                                  size=np.prod(self.x_train_mnist.shape))
        mask = mask.reshape(self.x_train_mnist.shape).astype(np.float32)
        x_train_adv_np = attack_np.generate(self.x_train_mnist, mask=mask)

        mask = np.random.binomial(n=1,
                                  p=0.5,
                                  size=np.prod(self.x_test_mnist.shape))
        mask = mask.reshape(self.x_test_mnist.shape).astype(np.float32)
        x_test_adv_np = attack_np.generate(self.x_test_mnist, mask=mask)

        master_seed(1234)
        attack_fw = ProjectedGradientDescent(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=1,
            batch_size=3,
            random_eps=True,
        )

        mask = np.random.binomial(n=1,
                                  p=0.5,
                                  size=np.prod(self.x_train_mnist.shape))
        mask = mask.reshape(self.x_train_mnist.shape).astype(np.float32)
        x_train_adv_fw = attack_fw.generate(self.x_train_mnist, mask=mask)

        mask = np.random.binomial(n=1,
                                  p=0.5,
                                  size=np.prod(self.x_test_mnist.shape))
        mask = mask.reshape(self.x_test_mnist.shape).astype(np.float32)
        x_test_adv_fw = attack_fw.generate(self.x_test_mnist, mask=mask)

        # Test
        self.assertAlmostEqual(np.mean(x_train_adv_np - self.x_train_mnist),
                               np.mean(x_train_adv_fw - self.x_train_mnist),
                               places=6)
        self.assertAlmostEqual(np.mean(x_test_adv_np - self.x_test_mnist),
                               np.mean(x_test_adv_fw - self.x_test_mnist),
                               places=6)

        # Test the masking 2
        master_seed(1234)
        attack_np = ProjectedGradientDescentNumpy(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=1,
            batch_size=3,
            random_eps=True,
        )

        mask = np.random.binomial(n=1,
                                  p=0.5,
                                  size=np.prod(self.x_train_mnist.shape[1:]))
        mask = mask.reshape(self.x_train_mnist.shape[1:]).astype(np.float32)
        x_train_adv_np = attack_np.generate(self.x_train_mnist, mask=mask)

        mask = np.random.binomial(n=1,
                                  p=0.5,
                                  size=np.prod(self.x_test_mnist.shape[1:]))
        mask = mask.reshape(self.x_test_mnist.shape[1:]).astype(np.float32)
        x_test_adv_np = attack_np.generate(self.x_test_mnist, mask=mask)

        master_seed(1234)
        attack_fw = ProjectedGradientDescent(
            classifier,
            eps=1.0,
            eps_step=0.1,
            max_iter=5,
            norm=np.inf,
            targeted=False,
            num_random_init=1,
            batch_size=3,
            random_eps=True,
        )

        mask = np.random.binomial(n=1,
                                  p=0.5,
                                  size=np.prod(self.x_train_mnist.shape[1:]))
        mask = mask.reshape(self.x_train_mnist.shape[1:]).astype(np.float32)
        x_train_adv_fw = attack_fw.generate(self.x_train_mnist, mask=mask)

        mask = np.random.binomial(n=1,
                                  p=0.5,
                                  size=np.prod(self.x_test_mnist.shape[1:]))
        mask = mask.reshape(self.x_test_mnist.shape[1:]).astype(np.float32)
        x_test_adv_fw = attack_fw.generate(self.x_test_mnist, mask=mask)

        # Test
        self.assertAlmostEqual(np.mean(x_train_adv_np - self.x_train_mnist),
                               np.mean(x_train_adv_fw - self.x_train_mnist),
                               places=6)
        self.assertAlmostEqual(np.mean(x_test_adv_np - self.x_test_mnist),
                               np.mean(x_test_adv_fw - self.x_test_mnist),
                               places=6)
    def _test_backend_mnist(self, classifier, x_train, y_train, x_test,
                            y_test):
        x_test_original = x_test.copy()

        # Test PGD with np.inf norm
        attack = ProjectedGradientDescent(classifier, eps=1.0, eps_step=0.1)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info("Accuracy on adversarial train examples: %.2f%%",
                    acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info("Accuracy on adversarial test examples: %.2f%%", acc * 100)

        # Test PGD with 3 random initialisations
        attack = ProjectedGradientDescent(classifier, num_random_init=3)
        x_train_adv = attack.generate(x_train)
        x_test_adv = attack.generate(x_test)

        self.assertFalse((x_train == x_train_adv).all())
        self.assertFalse((x_test == x_test_adv).all())

        train_y_pred = get_labels_np_array(classifier.predict(x_train_adv))
        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))

        self.assertFalse((y_train == train_y_pred).all())
        self.assertFalse((y_test == test_y_pred).all())

        acc = np.sum(
            np.argmax(train_y_pred, axis=1) == np.argmax(
                y_train, axis=1)) / y_train.shape[0]
        logger.info(
            "Accuracy on adversarial train examples with 3 random initialisations: %.2f%%",
            acc * 100)

        acc = np.sum(
            np.argmax(test_y_pred, axis=1) == np.argmax(
                y_test, axis=1)) / y_test.shape[0]
        logger.info(
            "Accuracy on adversarial test examples with 3 random initialisations: %.2f%%",
            acc * 100)

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test))),
                               0.0,
                               delta=0.00001)

        # Test the masking
        attack = ProjectedGradientDescent(classifier, num_random_init=1)
        mask = np.random.binomial(n=1, p=0.5, size=np.prod(x_test.shape))
        mask = mask.reshape(x_test.shape).astype(np.float32)

        x_test_adv = attack.generate(x_test, mask=mask)
        mask_diff = (1 - mask) * (x_test_adv - x_test)
        self.assertAlmostEqual(float(np.max(np.abs(mask_diff))),
                               0.0,
                               delta=0.00001)

        # Test eps of array type 1
        attack = ProjectedGradientDescent(classifier, eps=1.0, eps_step=0.1)

        eps = np.ones(shape=x_test.shape) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps of array type 2
        eps = np.ones(shape=x_test.shape[1:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps of array type 3
        eps = np.ones(shape=x_test.shape[2:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())

        # Test eps of array type 4
        eps = np.ones(shape=x_test.shape[3:]) * 1.0
        eps_step = np.ones_like(eps) * 0.1

        attack_params = {"eps_step": eps_step, "eps": eps}
        attack.set_params(**attack_params)

        x_test_adv = attack.generate(x_test)
        self.assertFalse((x_test == x_test_adv).all())

        test_y_pred = get_labels_np_array(classifier.predict(x_test_adv))
        self.assertFalse((y_test == test_y_pred).all())
    def test_check_params_pt(self):

        ptc = get_image_classifier_pt(from_logits=True)

        with self.assertRaises(TypeError):
            _ = ProjectedGradientDescent(ptc,
                                         eps=np.array([1, 1, 1]),
                                         eps_step=1)

        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc, norm=0)

        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc, eps=-1, eps_step=1)
        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc,
                                         eps=np.array([-1, -1, -1]),
                                         eps_step=np.array([1, 1, 1]))

        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc, eps=1, eps_step=-1)
        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc,
                                         eps=np.array([1, 1, 1]),
                                         eps_step=np.array([-1, -1, -1]))

        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc, targeted="true")

        with self.assertRaises(TypeError):
            _ = ProjectedGradientDescent(ptc, num_random_init=1.0)
        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc, num_random_init=-1)

        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc, batch_size=-1)

        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc, max_iter=-1)

        with self.assertRaises(ValueError):
            _ = ProjectedGradientDescent(ptc, verbose="true")
    def test_pytorch_binary_classifier(self):
        import torch
        import torch.nn.functional as F
        import torch.nn as nn
        import torch.optim as optim
        import art.estimators.classification
        import art.attacks.evasion
        import sklearn.datasets
        import sklearn.model_selection

        from art.attacks.evasion import ProjectedGradientDescent

        class BasicModel(nn.Module):
            def __init__(self):
                super(BasicModel, self).__init__()
                self.layer_1 = nn.Linear(20, 32)
                self.layer_2 = nn.Linear(32, 1)

            def forward(self, x):
                x = F.relu(self.layer_1(x))
                x = torch.sigmoid(self.layer_2(x))

                return x
        device = "cpu"
        x, y = sklearn.datasets.make_classification(n_samples=10000, n_features=20, n_informative=5, n_redundant=2,
                                                    n_repeated=0, n_classes=2)

        train_x, test_x, train_y, test_y = sklearn.model_selection.train_test_split(x, y, test_size=0.2)
        train_x = train_x.astype(np.float32)
        train_y = train_y.astype(np.float32)
        test_x = test_x.astype(np.float32)
        test_y = test_y.astype(np.float32)

        model = BasicModel()
        _ = model(torch.randn((1, 20)))  # sanity check: the model accepts 20-feature inputs
        loss_func = nn.BCELoss()
        model.to(device)
        opt = optim.Adam(model.parameters(), lr=0.001)

        classifier = art.estimators.classification.PyTorchClassifier(
            model=model,
            loss=loss_func,
            optimizer=opt,
            input_shape=(20,),  # the model consumes 20 tabular features
            nb_classes=2,
        )
        
        classifier.fit(train_x, train_y, batch_size=64, nb_epochs=3)
        test_x_batch = test_x[0:16]
        test_y_batch = test_y[0:16]

        # Exercise loss, prediction and gradient computation on a small batch
        test_loss = classifier.compute_loss(test_x_batch, test_y_batch)
        preds = classifier.predict(test_x_batch)
        grads = classifier.loss_gradient(test_x_batch, test_y_batch)

        attacker = ProjectedGradientDescent(classifier)
        generated = attacker.generate(test_x_batch)
        assert generated is not None
        assert generated.shape == test_x_batch.shape