Example #1
    def test_4_pytorch_iris(self):
        classifier = get_tabular_classifier_pt()

        attack_params = {
            "max_iter": 1,
            "attacker": "ead",
            "attacker_params": {
                "max_iter": 5,
                "targeted": False,
                "verbose": False
            },
        }
        attack = UniversalPerturbation(classifier, verbose=False)
        attack.set_params(**attack_params)
        x_test_iris_adv = attack.generate(self.x_test_iris)
        self.assertFalse((self.x_test_iris == x_test_iris_adv).all())
        self.assertTrue((x_test_iris_adv <= 1).all())
        self.assertTrue((x_test_iris_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_iris_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_iris, axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Accuracy on Iris with universal adversarial examples: %.2f%%", acc * 100)
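
These methods come from the UniversalPerturbation test suite of the Adversarial Robustness Toolbox (ART) and omit their module-level setup. A minimal sketch of the imports and scaffolding they rely on, assuming the classifier helpers live in ART's tests.utils module:

import logging

import numpy as np

from art.attacks.evasion import UniversalPerturbation
from art.estimators.classification import KerasClassifier
from tests.utils import (
    get_image_classifier_kr,
    get_image_classifier_pt,
    get_image_classifier_tf,
    get_tabular_classifier_kr,
    get_tabular_classifier_pt,
)

logger = logging.getLogger(__name__)  # used by the logger.info(...) calls in the tests
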
Example #2
    def test_7_keras_iris_unbounded(self):
        classifier = get_tabular_classifier_kr()

        # Recreate a classifier without clip values
        classifier = KerasClassifier(model=classifier._model, use_logits=False, channels_first=True)
        attack_params = {
            "max_iter": 1,
            "attacker": "newtonfool",
            "attacker_params": {
                "max_iter": 5,
                "verbose": False
            }
        }
        attack = UniversalPerturbation(classifier, verbose=False)
        attack.set_params(**attack_params)
        x_test_iris_adv = attack.generate(self.x_test_iris)
        self.assertFalse((self.x_test_iris == x_test_iris_adv).all())

        preds_adv = np.argmax(classifier.predict(x_test_iris_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_iris, axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Accuracy on Iris with universal adversarial examples: %.2f%%", acc * 100)
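
Because the classifier is rebuilt above without clip values, the generated samples are unbounded, which is why this test drops the [0, 1] range assertions used in the clipped Iris tests. For contrast, a bounded classifier would declare its valid input range explicitly; a one-line sketch using ART's clip_values argument:

# Sketch: a bounded classifier; ART clips adversarial outputs to this range.
clipped = KerasClassifier(model=classifier._model, use_logits=False, clip_values=(0.0, 1.0))
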
    def test_keras_mnist(self):
        """
        Second test with the KerasClassifier.
        :return:
        """
        x_test_original = self.x_test_mnist.copy()

        # Build KerasClassifier
        krc = get_image_classifier_kr()

        # Attack
        up = UniversalPerturbation(krc, max_iter=1, attacker="ead", attacker_params={"max_iter": 2, "targeted": False})
        x_train_adv = up.generate(self.x_train_mnist)
        self.assertTrue((up.fooling_rate >= 0.2) or not up.converged)

        x_test_adv = self.x_test_mnist + up.noise
        self.assertFalse((self.x_test_mnist == x_test_adv).all())

        train_y_pred = np.argmax(krc.predict(x_train_adv), axis=1)
        test_y_pred = np.argmax(krc.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_mnist, axis=1) == test_y_pred).all())
        self.assertFalse((np.argmax(self.y_train_mnist, axis=1) == train_y_pred).all())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - self.x_test_mnist))), 0.0, delta=0.00001)
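
After generate() has run, the attack object exposes the fitted perturbation as up.noise, plus the diagnostics up.fooling_rate and up.converged, which is why the test can add the same noise to unseen test data. A hedged sketch of computing the fooling rate on the test set by hand (same variables as in the method above):

# Fraction of test samples whose predicted label flips under the universal noise.
clean_pred = np.argmax(krc.predict(self.x_test_mnist), axis=1)
adv_pred = np.argmax(krc.predict(self.x_test_mnist + up.noise), axis=1)
test_fooling_rate = np.mean(clean_pred != adv_pred)
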
    def test_tensorflow_mnist(self):
        """
        First test with the TensorFlowClassifier.
        :return:
        """
        x_test_original = self.x_test_mnist.copy()

        # Build TensorFlowClassifier
        tfc, sess = get_image_classifier_tf()

        # Attack
        up = UniversalPerturbation(tfc, max_iter=1, attacker="newtonfool", attacker_params={"max_iter": 5})
        x_train_adv = up.generate(self.x_train_mnist)
        self.assertTrue((up.fooling_rate >= 0.2) or not up.converged)

        x_test_adv = self.x_test_mnist + up.noise
        self.assertFalse((self.x_test_mnist == x_test_adv).all())

        train_y_pred = np.argmax(tfc.predict(x_train_adv), axis=1)
        test_y_pred = np.argmax(tfc.predict(x_test_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_mnist, axis=1) == test_y_pred).all())
        self.assertFalse((np.argmax(self.y_train_mnist, axis=1) == train_y_pred).all())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - self.x_test_mnist))), 0.0, delta=0.00001)
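
UniversalPerturbation resolves the attacker string to an inner per-sample attack (e.g. "deepfool", "ead", "fgsm", "newtonfool") and forwards attacker_params as its keyword arguments. The "newtonfool" configuration above is therefore roughly equivalent to instantiating the inner attack directly; a sketch:

from art.attacks.evasion import NewtonFool

# What UniversalPerturbation builds internally for attacker="newtonfool".
inner_attack = NewtonFool(tfc, max_iter=5)
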
    def test_pytorch_mnist(self):
        """
        Third test with the PyTorchClassifier.
        :return:
        """
        x_train_mnist = np.swapaxes(self.x_train_mnist, 1, 3).astype(np.float32)
        x_test_mnist = np.swapaxes(self.x_test_mnist, 1, 3).astype(np.float32)
        x_test_original = x_test_mnist.copy()

        # Build PyTorchClassifier
        ptc = get_image_classifier_pt()

        # Attack
        up = UniversalPerturbation(ptc, max_iter=1, attacker="newtonfool", attacker_params={"max_iter": 5})
        x_train_mnist_adv = up.generate(x_train_mnist)
        self.assertTrue((up.fooling_rate >= 0.2) or not up.converged)

        x_test_mnist_adv = x_test_mnist + up.noise
        self.assertFalse((x_test_mnist == x_test_mnist_adv).all())

        train_y_pred = np.argmax(ptc.predict(x_train_mnist_adv), axis=1)
        test_y_pred = np.argmax(ptc.predict(x_test_mnist_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_mnist, axis=1) == test_y_pred).all())
        self.assertFalse((np.argmax(self.y_train_mnist, axis=1) == train_y_pred).all())

        # Check that x_test has not been modified by attack and classifier
        self.assertAlmostEqual(float(np.max(np.abs(x_test_original - x_test_mnist))), 0.0, delta=0.00001)
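
The np.swapaxes calls convert the MNIST batches from the channels-last layout produced by the Keras/TensorFlow helpers, (N, 28, 28, 1), to the channels-first layout the PyTorch classifier expects, (N, 1, 28, 28). A quick shape check:

x = np.zeros((10, 28, 28, 1), dtype=np.float32)  # channels-last batch
print(np.swapaxes(x, 1, 3).shape)                # -> (10, 1, 28, 28), channels-first
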
    def test_6_keras_iris_clipped(self):
        classifier = get_tabular_classifier_kr()

        # Test untargeted attack
        attack_params = {
            "max_iter": 1,
            "attacker": "newtonfool",
            "attacker_params": {
                "max_iter": 5
            }
        }
        attack = UniversalPerturbation(classifier)
        attack.set_params(**attack_params)
        x_test_iris_adv = attack.generate(self.x_test_iris)
        self.assertFalse((self.x_test_iris == x_test_iris_adv).all())
        self.assertTrue((x_test_iris_adv <= 1).all())
        self.assertTrue((x_test_iris_adv >= 0).all())

        preds_adv = np.argmax(classifier.predict(x_test_iris_adv), axis=1)
        self.assertFalse((np.argmax(self.y_test_iris, axis=1) == preds_adv).all())
        acc = np.sum(preds_adv == np.argmax(self.y_test_iris, axis=1)) / self.y_test_iris.shape[0]
        logger.info("Accuracy on Iris with universal adversarial examples: %.2f%%", acc * 100)
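
Outside the test harness, the same attack runs against any ART classifier in a few lines. A minimal end-to-end sketch, with classifier construction and data loading elided and parameter values chosen for illustration:

# Sketch: fit one universal perturbation, then reuse it on new inputs.
attack = UniversalPerturbation(
    classifier,
    attacker="deepfool",
    attacker_params={"max_iter": 5},
    max_iter=10,
    eps=10.0,
)
x_train_adv = attack.generate(x_train)   # searches for a single universal noise
x_test_adv = x_test + attack.noise       # the same noise transfers to other data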