import os

import numpy as np
import torch
from scipy.io import wavfile

from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech


def main(args):
    # Select the device: use the requested GPUs if CUDA is available, otherwise fall back to CPU
    if torch.cuda.is_available():
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        device_type = "gpu"
    else:
        device_type = "cpu"

    deepspeech = PyTorchDeepSpeech(pretrained_model=args.model, device_type=device_type)

    # Load the audio
    sample_rate, sound = wavfile.read(args.input)
    assert sample_rate == 16000, "This module currently only supports audio with a sample rate of 16000."

    # Run the prediction
    transcription = deepspeech.predict(np.array([sound]), batch_size=1, transcription_output=True)
    print("output:", transcription)
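
# A minimal sketch of a command-line entry point for `main` above. The flag names
# `--model`, `--gpus`, and `--input` are inferred from the `args.*` attributes used
# in `main`; the actual script may define them differently.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Transcribe a 16 kHz WAV file with a pretrained DeepSpeech model.")
    parser.add_argument("--model", default="librispeech", help="Pretrained model: an4, librispeech, or tedlium.")
    parser.add_argument("--gpus", default="0", help="GPU id(s) to expose via CUDA_VISIBLE_DEVICES.")
    parser.add_argument("--input", required=True, help="Path to a 16 kHz WAV file.")
    main(parser.parse_args())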
def test_pytorch_deep_speech(art_warning, expected_values, use_amp, device_type):
    # Only import if the deepspeech_pytorch module is available
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech

    try:
        # Load data for testing
        expected_data = expected_values()

        x1 = expected_data[0]
        x2 = expected_data[1]
        x3 = expected_data[2]
        expected_sizes = expected_data[3]
        expected_transcriptions1 = expected_data[4]
        expected_transcriptions2 = expected_data[5]
        expected_probs = expected_data[6]
        expected_gradients1 = expected_data[7]
        expected_gradients2 = expected_data[8]
        expected_gradients3 = expected_data[9]

        # Create signal data
        x = np.array(
            [
                np.array(x1 * 100, dtype=ART_NUMPY_DTYPE),
                np.array(x2 * 100, dtype=ART_NUMPY_DTYPE),
                np.array(x3 * 100, dtype=ART_NUMPY_DTYPE),
            ]
        )

        # Create labels
        y = np.array(["SIX", "HI", "GOOD"])

        # Test probability outputs
        speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech", device_type=device_type, use_amp=use_amp)
        probs, sizes = speech_recognizer.predict(x, batch_size=2)

        np.testing.assert_array_almost_equal(probs[1][1], expected_probs, decimal=3)
        np.testing.assert_array_almost_equal(sizes, expected_sizes)

        # Test transcription outputs
        transcriptions = speech_recognizer.predict(x, batch_size=2, transcription_output=True)
        assert (expected_transcriptions1 == transcriptions).all()

        # Test transcription outputs, corner case
        transcriptions = speech_recognizer.predict(np.array([x[0]]), batch_size=2, transcription_output=True)
        assert (expected_transcriptions2 == transcriptions).all()

        # Now test loss gradients
        # Compute gradients
        grads = speech_recognizer.loss_gradient(x, y)

        assert grads[0].shape == (1300,)
        assert grads[1].shape == (1500,)
        assert grads[2].shape == (1400,)

        np.testing.assert_array_almost_equal(grads[0][0:20], expected_gradients1, decimal=-2)
        np.testing.assert_array_almost_equal(grads[1][0:20], expected_gradients2, decimal=-2)
        np.testing.assert_array_almost_equal(grads[2][0:20], expected_gradients3, decimal=-2)

        # Now test fit function
        # Create the optimizer
        parameters = speech_recognizer.model.parameters()
        speech_recognizer._optimizer = torch.optim.SGD(parameters, lr=0.01)

        # Before train
        transcriptions1 = speech_recognizer.predict(x, batch_size=2, transcription_output=True)

        # Train the estimator
        speech_recognizer.fit(x=x, y=y, batch_size=2, nb_epochs=5)

        # After train
        transcriptions2 = speech_recognizer.predict(x, batch_size=2, transcription_output=True)

        assert not (transcriptions1 == transcriptions2).all()

    except ARTTestException as e:
        art_warning(e)
class TestPyTorchDeepSpeech:
    """
    This class tests the PyTorchDeepSpeech estimator.
    """

    @pytest.fixture
    def setup_class(self):
        master_seed(seed=1234)

        # Small data for testing
        x1 = np.array(
            [
                -1.0376293e-03, -1.0681478e-03, -1.0986663e-03, -1.1291848e-03, -1.1291848e-03,
                -1.1291848e-03, -1.1902219e-03, -1.1597034e-03, -1.1902219e-03, -1.1291848e-03,
                -1.1291848e-03, -1.0681478e-03, -9.1555528e-04,
            ]
            * 100
        )

        x2 = np.array(
            [
                -1.8311106e-04, -1.2207404e-04, -6.1037019e-05, 0.0000000e00, 3.0518509e-05,
                0.0000000e00, -3.0518509e-05, 0.0000000e00, 0.0000000e00, 9.1555528e-05,
                2.1362957e-04, 3.3570360e-04, 4.2725913e-04, 4.5777764e-04, -1.8311106e-04,
            ]
            * 100
        )

        x3 = np.array(
            [
                -8.2399976e-04, -7.0192572e-04, -5.4933317e-04, -4.2725913e-04, -3.6622211e-04,
                -2.7466659e-04, -2.1362957e-04, 5.4933317e-04, 5.7985168e-04, 6.1037019e-04,
                6.7140721e-04, 7.0192572e-04, 6.7140721e-04, -1.5259255e-04,
            ]
            * 100
        )

        self.x = np.array([x1, x2, x3])

    def test_all(self, _test_all):
        pass

    @pytest.fixture(params=[False, True])
    def _test_all(self, request, setup_class):
        # Only import if the deep speech module is available
        import torch

        from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech

        # Test probability outputs
        if request.param is True:
            self.speech_recognizer_amp = PyTorchDeepSpeech(
                pretrained_model="librispeech", device_type="gpu", use_amp=True
            )
            probs, sizes = self.speech_recognizer_amp.predict(self.x, batch_size=2)
        else:
            self.speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")
            probs, sizes = self.speech_recognizer.predict(self.x, batch_size=2)

        expected_sizes = np.asarray([5, 5, 5])
        np.testing.assert_array_almost_equal(sizes, expected_sizes)

        expected_probs = np.asarray(
            [
                1.0000000e00, 7.0154901e-14, 1.9170589e-13, 8.2194836e-13, 8.9967915e-13,
                1.8518193e-12, 1.7883164e-10, 1.8951663e-12, 1.8818237e-13, 3.2806991e-12,
                3.5664666e-16, 3.3147299e-14, 2.3439516e-13, 8.4845603e-12, 1.2017718e-13,
                1.1180213e-12, 6.5572378e-15, 3.0194697e-12, 4.9065188e-15, 1.9765363e-13,
                4.1670646e-11, 2.6884213e-12, 1.1436632e-13, 7.1931783e-15, 2.8135227e-11,
                4.5599673e-14, 6.4587983e-13, 2.4159567e-15, 4.6668241e-13,
            ]
        )
        np.testing.assert_array_almost_equal(probs[1][1], expected_probs, decimal=3)

        # Test transcription outputs
        if request.param is True:
            transcriptions = self.speech_recognizer_amp.predict(self.x, batch_size=2, transcription_output=True)
        else:
            transcriptions = self.speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)

        expected_transcriptions = np.array(["", "", ""])
        assert (expected_transcriptions == transcriptions).all()

        # Test transcription outputs, corner case
        if request.param is True:
            transcriptions = self.speech_recognizer_amp.predict(
                np.array([self.x[0]]), batch_size=2, transcription_output=True
            )
        else:
            transcriptions = self.speech_recognizer.predict(
                np.array([self.x[0]]), batch_size=2, transcription_output=True
            )

        expected_transcriptions = np.array([""])
        assert (expected_transcriptions == transcriptions).all()

        # Now test loss gradients
        # Create labels
        y = np.array(["SIX", "HI", "GOOD"])

        # Compute gradients
        if request.param is True:
            grads = self.speech_recognizer_amp.loss_gradient(self.x, y)
        else:
            grads = self.speech_recognizer.loss_gradient(self.x, y)

        assert grads[0].shape == (1300,)
        assert grads[1].shape == (1500,)
        assert grads[2].shape == (1400,)

        if request.param is True:
            expected_gradients1 = np.asarray(
                [
                    -3485.7, 659.0, -111.7, 283.6, 1691.9, 715.0, 1480.4, -3522.3, -4087.9, -8824.2,
                    -304.7, 2013.4, -445.1, 4125.0, 1754.1, -503.6, 1160.0, 7051.7, -1992.2, 350.4,
                ]
            )
        else:
            expected_gradients1 = np.asarray(
                [
                    -3482.77892371, 665.64673575, -116.24408896, 265.93803869, 1667.02236699,
                    688.33557577, 1455.14911883, -3524.90476617, -4082.06471587, -8802.39419605,
                    -277.74274789, 2034.54679277, -428.53153241, 4114.63683848, 1722.53840709,
                    -513.68916798, 1159.88786568, 7072.47761446, -1963.71829047, 382.65287411,
                ]
            )
        np.testing.assert_array_almost_equal(grads[0][0:20], expected_gradients1, decimal=0)

        if request.param is True:
            expected_gradients2 = np.asarray(
                [
                    20924.5, 3046.3, -7872.5, 15525.1, -15766.9, -18494.1, 19139.6, 6446.2, 26323.1, 4230.0,
                    -31122.4, -2890.9, 12936.7, 13834.1, 17649.9, 8866.1, -16454.6, -6953.1, -17899.6, 4100.7,
                ]
            )
        else:
            expected_gradients2 = np.asarray(
                [
                    20992.44844133, 3048.78701634, -7849.13725934, 15557.59663939, -15760.10725159,
                    -18422.9438386, 19132.22699435, 6508.51437337, 26292.5249963, 4232.62414548,
                    -31128.82664215, -2894.85284984, 13008.74538039, 13845.08921681, 17657.67725957,
                    8807.42144017, -16477.89414508, -6977.8092622, -17914.22352666, 4086.51150059,
                ]
            )
        np.testing.assert_array_almost_equal(grads[1][0:20], expected_gradients2, decimal=0)

        if request.param is True:
            expected_gradients3 = np.asarray(
                [
                    -1687.3, 6715.0, 16448.4, -3848.9, 16521.1, -15736.1, -26204.0, -8992.2, 9697.9, 13999.6,
                    -7595.3, 14181.0, -24507.2, 5481.9, 7166.7, -6182.3, 2510.3, -7229.0, -10821.9, -11134.2,
                ]
            )
        else:
            expected_gradients3 = np.asarray(
                [
                    -1693.10472689, 6711.39788693, 16480.14166546, -3786.95541286, 16448.3969823,
                    -15702.45621671, -26162.89260564, -8979.81601681, 9657.87483965, 13955.78845296,
                    -7552.01438108, 14170.60635269, -24434.37243957, 5502.81163675, 7171.56926943,
                    -6154.06511686, 2483.93980406, -7244.24618697, -10798.70438903, -11129.57632319,
                ]
            )
        np.testing.assert_array_almost_equal(grads[2][0:20], expected_gradients3, decimal=0)

        # Now test fit function
        if request.param is True:
            # Create the optimizer
            parameters = self.speech_recognizer_amp.model.parameters()
            self.speech_recognizer_amp._optimizer = torch.optim.SGD(parameters, lr=0.01)

            # Before train
            transcriptions1 = self.speech_recognizer_amp.predict(self.x, batch_size=2, transcription_output=True)

            # Train the estimator
            self.speech_recognizer_amp.fit(x=self.x, y=y, batch_size=2, nb_epochs=5)

            # After train
            transcriptions2 = self.speech_recognizer_amp.predict(self.x, batch_size=2, transcription_output=True)

            assert not (transcriptions1 == transcriptions2).all()
        else:
            # Create the optimizer
            parameters = self.speech_recognizer.model.parameters()
            self.speech_recognizer._optimizer = torch.optim.SGD(parameters, lr=0.01)

            # Before train
            transcriptions1 = self.speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)

            # Train the estimator
            self.speech_recognizer.fit(x=self.x, y=y, batch_size=2, nb_epochs=5)

            # After train
            transcriptions2 = self.speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)

            assert not (transcriptions1 == transcriptions2).all()
def test_imperceptible_asr_pytorch(art_warning, expected_values, use_amp, device_type):
    # Only import if the deepspeech_pytorch module is available
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
    from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
    from art.preprocessing.audio import LFilterPyTorch

    try:
        # Skip test if gpu is not available and use_amp is true
        if use_amp and not torch.cuda.is_available():
            return

        # Load data for testing
        expected_data = expected_values()

        x1 = expected_data["x1"]
        x2 = expected_data["x2"]
        x3 = expected_data["x3"]

        # Create signal data
        x = np.array(
            [
                np.array(x1 * 200, dtype=ART_NUMPY_DTYPE),
                np.array(x2 * 200, dtype=ART_NUMPY_DTYPE),
                np.array(x3 * 200, dtype=ART_NUMPY_DTYPE),
            ]
        )

        # Create labels
        y = np.array(["S", "I", "GD"])

        # Create DeepSpeech estimator with preprocessing
        numerator_coef = np.array([0.0000001, 0.0000002, -0.0000001, -0.0000002], dtype=ART_NUMPY_DTYPE)
        denominator_coef = np.array([1.0, 0.0, 0.0, 0.0], dtype=ART_NUMPY_DTYPE)
        audio_filter = LFilterPyTorch(
            numerator_coef=numerator_coef, denominator_coef=denominator_coef, device_type=device_type
        )

        speech_recognizer = PyTorchDeepSpeech(
            pretrained_model="librispeech",
            device_type=device_type,
            use_amp=use_amp,
            preprocessing_defences=audio_filter,
        )

        # Create attack
        asr_attack = ImperceptibleASRPyTorch(
            estimator=speech_recognizer,
            eps=0.001,
            max_iter_1=5,
            max_iter_2=5,
            learning_rate_1=0.00001,
            learning_rate_2=0.001,
            optimizer_1=torch.optim.Adam,
            optimizer_2=torch.optim.Adam,
            global_max_length=3200,
            initial_rescale=1.0,
            decrease_factor_eps=0.8,
            num_iter_decrease_eps=5,
            alpha=0.01,
            increase_factor_alpha=1.2,
            num_iter_increase_alpha=5,
            decrease_factor_alpha=0.8,
            num_iter_decrease_alpha=5,
            win_length=2048,
            hop_length=512,
            n_fft=2048,
            batch_size=2,
            use_amp=use_amp,
            opt_level="O1",
        )

        # Test transcription output
        transcriptions_preprocessing = speech_recognizer.predict(x, batch_size=2, transcription_output=True)

        expected_transcriptions = np.array(["", "", ""])

        assert (expected_transcriptions == transcriptions_preprocessing).all()

        # Generate attack
        x_adv_preprocessing = asr_attack.generate(x, y)

        # Test shape
        assert x_adv_preprocessing[0].shape == x[0].shape
        assert x_adv_preprocessing[1].shape == x[1].shape
        assert x_adv_preprocessing[2].shape == x[2].shape

        # Test content
        assert not (x_adv_preprocessing[0] == x[0]).all()
        assert not (x_adv_preprocessing[1] == x[1]).all()
        assert not (x_adv_preprocessing[2] == x[2]).all()

        assert np.sum(x_adv_preprocessing[0]) != np.inf
        assert np.sum(x_adv_preprocessing[1]) != np.inf
        assert np.sum(x_adv_preprocessing[2]) != np.inf

        assert np.sum(x_adv_preprocessing[0]) != 0
        assert np.sum(x_adv_preprocessing[1]) != 0
        assert np.sum(x_adv_preprocessing[2]) != 0

    except ARTTestException as e:
        art_warning(e)
import os

import librosa
import numpy as np
import torch
from scipy.io import wavfile

from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech


class AsrAttack:
    '''
    This class holds all configuration and parameters, including the parameters for the attack and
    for inference. The attack used here comes from `Trusted-AI/adversarial-robustness-toolbox`;
    see their GitHub page for more information.

    TODO: Use a modified version of the attack module written specifically for audio CAPTCHAs.
    '''

    SAMPLE_RATE = 16000

    def __init__(self, pretrained_model="librispeech", gpus="0", debug=False, **attack_kwargs):
        '''
        Create an `AsrAttack` instance.

        Args:
            pretrained_model (str) : The choice of target model. This attack currently supports three
                pretrained models, `an4`, `librispeech`, and `tedlium`, named after the dataset each
                model was trained on.
            gpus (str) : Which GPU(s) to use. Defaults to "0". If no GPU is available, the CPU is
                used instead.
            debug (bool) : Whether to print debug messages.
            attack_kwargs (dict) : Arguments for the attack parameters, documented below.

        Args for `attack_kwargs`:
            estimator (PyTorchDeepSpeech) : A trained estimator.
            initial_eps (float) : Initial maximum perturbation that the attacker can introduce.
            max_iter_1st_stage (int) : The maximum number of iterations applied for the first stage
                of the optimization of the attack.
            max_iter_2nd_stage (int) : The maximum number of iterations applied for the second stage
                of the optimization of the attack.
            learning_rate_1st_stage (float) : The initial learning rate applied for the first stage
                of the optimization of the attack.
            learning_rate_2nd_stage (float) : The initial learning rate applied for the second stage
                of the optimization of the attack.
            optimizer_1st_stage : The optimizer applied for the first stage of the optimization of
                the attack. If `None`, the attack will use `torch.optim.SGD`.
            optimizer_2nd_stage : The optimizer applied for the second stage of the optimization of
                the attack. If `None`, the attack will use `torch.optim.SGD`.
            global_max_length (int) : The length of the longest audio signal allowed by this attack.
            initial_rescale (float) : Initial rescale coefficient to speed up the decrease of the
                perturbation size during the first stage of the optimization of the attack.
            rescale_factor (float) : The factor to adjust the rescale coefficient during the first
                stage of the optimization of the attack.
            num_iter_adjust_rescale (int) : Number of iterations to adjust the rescale coefficient.
            initial_alpha (float) : The initial value of the alpha coefficient used in the second
                stage of the optimization of the attack.
            increase_factor_alpha (float) : The factor to increase the alpha coefficient used in the
                second stage of the optimization of the attack.
            num_iter_increase_alpha (int) : Number of iterations to increase alpha.
            decrease_factor_alpha (float) : The factor to decrease the alpha coefficient used in the
                second stage of the optimization of the attack.
            num_iter_decrease_alpha (int) : Number of iterations to decrease alpha.
            batch_size (int) : Size of the batch on which adversarial samples are generated.
            use_amp (bool) : Whether to use the automatic mixed precision tool to enable mixed
                precision training or gradient computation, e.g. with loss gradient computation.
                When set to True, this option is only triggered if there are GPUs available.
            opt_level (str) : Specify a pure or mixed precision optimization level. Used when
                use_amp is True. Accepted values are `O0`, `O1`, `O2`, and `O3`.
        '''
        self.pretrained_model = pretrained_model
        self.gpus = gpus
        self.debug = debug
        self.attack_kwargs = attack_kwargs

        # Set the GPU device here
        if torch.cuda.is_available():
            os.environ["CUDA_VISIBLE_DEVICES"] = self.gpus
            self.device_type = "gpu"
        else:
            self.device_type = "cpu"

        # TODO: Set up the optimizer in `attack_kwargs`

        # Initialize the target ASR model
        self.asr_model = PyTorchDeepSpeech(pretrained_model=self.pretrained_model, device_type=self.device_type)

        # Attack!
        self.asr_attack = ImperceptibleASRPyTorch(estimator=self.asr_model, **self.attack_kwargs)

    def load_audio(self, path):
        '''
        Load an audio file with the same loader used by deepspeech-pytorch.
        '''
        sound, _ = librosa.load(path, sr=AsrAttack.SAMPLE_RATE)
        if len(sound.shape) > 1:
            # Multiple channels: average them
            sound = sound.mean(axis=1)
        return sound

    def save_audio(self, path, audio):
        '''
        Save an audio file. Values will be rescaled to 16-bit integers.
        '''
        wavfile.write(path, AsrAttack.SAMPLE_RATE, audio)

    def generate_adv_example(self, input_path, target, output_path):
        '''
        Generate an adversarial example.

        Args:
            input_path (str) : The path of the audio being attacked.
            target (str) : The target output in capital letters, e.g. "OPEN THE DOOR".
            output_path (str) : The path where the adversarial audio is stored.
        '''
        audio = self.load_audio(input_path)
        prediction = self.asr_model.predict(np.array([audio]), batch_size=1, transcription_output=True)

        if self.debug:
            print('input path:', input_path)
            print('original prediction:', prediction)
            print('target:', target)

        # Start generating the adversarial example
        adv_audio = self.asr_attack.generate(np.array([audio]), np.array([target]), batch_size=1)

        # Check the transcription of the adversarial audio
        adv_transcriptions = self.asr_model.predict(adv_audio, batch_size=1, transcription_output=True)
        print("Ground-truth transcriptions: ", prediction)
        print("Target transcriptions: ", target)
        print("Adversarial transcriptions: ", adv_transcriptions)

        # Save the adversarial audio
        self.save_audio(output_path, adv_audio[0])

        if self.debug:
            print('Generated audio stored at:', output_path)
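
# A minimal usage sketch for `AsrAttack` above, assuming the attack parameter names
# documented in `__init__` (e.g. `initial_eps`, `max_iter_1st_stage`); newer ART
# releases may use different names. The file paths and target phrase are
# hypothetical placeholders.
if __name__ == "__main__":
    attack = AsrAttack(
        pretrained_model="librispeech",
        gpus="0",
        debug=True,
        initial_eps=0.001,
        max_iter_1st_stage=50,
        max_iter_2nd_stage=50,
        batch_size=1,
    )
    attack.generate_adv_example(
        input_path="captcha.wav",         # hypothetical input audio
        target="OPEN THE DOOR",           # target transcription, in capital letters
        output_path="captcha_adv.wav",    # hypothetical output path
    )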
def test_imperceptible_asr_pytorch(art_warning, expected_values, use_amp, device_type):
    # Only import if the deepspeech_pytorch module is available
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
    from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPyTorch
    from art.defences.preprocessor import LFilterPyTorch

    try:
        # Load data for testing
        expected_data = expected_values()

        x1 = expected_data[0]
        x2 = expected_data[1]
        x3 = expected_data[2]

        # Create signal data
        x = np.array(
            [
                np.array(x1 * 100, dtype=ART_NUMPY_DTYPE),
                np.array(x2 * 100, dtype=ART_NUMPY_DTYPE),
                np.array(x3 * 100, dtype=ART_NUMPY_DTYPE),
            ]
        )

        # Create labels
        y = np.array(["S", "I", "GD"])

        # Create DeepSpeech estimator with preprocessing
        numerator_coef = np.array([0.0000001, 0.0000002, -0.0000001, -0.0000002])
        denominator_coef = np.array([1.0, 0.0, 0.0, 0.0])
        audio_filter = LFilterPyTorch(
            numerator_coef=numerator_coef, denominator_coef=denominator_coef, device_type=device_type
        )

        speech_recognizer = PyTorchDeepSpeech(
            pretrained_model="librispeech",
            device_type=device_type,
            use_amp=use_amp,
            preprocessing_defences=audio_filter,
        )

        # Create attack
        asr_attack = ImperceptibleASRPyTorch(
            estimator=speech_recognizer,
            initial_eps=0.001,
            max_iter_1st_stage=5,
            max_iter_2nd_stage=5,
            learning_rate_1st_stage=0.00001,
            learning_rate_2nd_stage=0.001,
            optimizer_1st_stage=torch.optim.SGD,
            optimizer_2nd_stage=torch.optim.SGD,
            global_max_length=2000,
            initial_rescale=1.0,
            rescale_factor=0.8,
            num_iter_adjust_rescale=5,
            initial_alpha=0.01,
            increase_factor_alpha=1.2,
            num_iter_increase_alpha=5,
            decrease_factor_alpha=0.8,
            num_iter_decrease_alpha=5,
            batch_size=2,
            use_amp=use_amp,
            opt_level="O1",
        )

        # Test transcription output
        transcriptions_preprocessing = speech_recognizer.predict(x, batch_size=2, transcription_output=True)

        expected_transcriptions = np.array(["", "", ""])

        assert (expected_transcriptions == transcriptions_preprocessing).all()

        # Generate attack
        x_adv_preprocessing = asr_attack.generate(x, y)

        # Test shape
        assert x_adv_preprocessing[0].shape == x[0].shape
        assert x_adv_preprocessing[1].shape == x[1].shape
        assert x_adv_preprocessing[2].shape == x[2].shape

    except ARTTestException as e:
        art_warning(e)
def _test_all(self, request, setup_class):
    # Only import if the deep speech module is available
    import torch

    from art.estimators.speech_recognition.pytorch_deep_speech import PyTorchDeepSpeech
    from art.attacks.evasion.imperceptible_asr.imperceptible_asr_pytorch import ImperceptibleASRPytorch

    # Without amp
    if request.param is False:
        # Create DeepSpeech estimator
        speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech")

        # Create attack
        asr_attack = ImperceptibleASRPytorch(
            estimator=speech_recognizer,
            initial_eps=0.001,
            max_iter_1st_stage=50,
            max_iter_2nd_stage=50,
            learning_rate_1st_stage=0.00001,
            learning_rate_2nd_stage=0.001,
            optimizer_1st_stage=torch.optim.SGD,
            optimizer_2nd_stage=torch.optim.SGD,
            global_max_length=2000,
            initial_rescale=1.0,
            rescale_factor=0.8,
            num_iter_adjust_rescale=5,
            initial_alpha=0.01,
            increase_factor_alpha=1.2,
            num_iter_increase_alpha=5,
            decrease_factor_alpha=0.8,
            num_iter_decrease_alpha=5,
            batch_size=2,
            use_amp=False,
            opt_level="O1",
            loss_scale=1,
        )

    # With amp
    else:
        # Create DeepSpeech estimator
        speech_recognizer = PyTorchDeepSpeech(pretrained_model="librispeech", device_type="gpu", use_amp=True)

        # Create attack
        asr_attack = ImperceptibleASRPytorch(
            estimator=speech_recognizer,
            initial_eps=0.001,
            max_iter_1st_stage=50,
            max_iter_2nd_stage=50,
            learning_rate_1st_stage=0.00001,
            learning_rate_2nd_stage=0.001,
            optimizer_1st_stage=torch.optim.SGD,
            optimizer_2nd_stage=torch.optim.SGD,
            global_max_length=2000,
            initial_rescale=1.0,
            rescale_factor=0.8,
            num_iter_adjust_rescale=2,
            initial_alpha=0.01,
            increase_factor_alpha=1.2,
            num_iter_increase_alpha=2,
            decrease_factor_alpha=0.8,
            num_iter_decrease_alpha=2,
            batch_size=2,
            use_amp=True,
            opt_level="O1",
            loss_scale=1,
        )

    # Test transcription output
    transcriptions = speech_recognizer.predict(self.x, batch_size=2, transcription_output=True)

    expected_transcriptions = np.array(["", "", ""])

    assert (expected_transcriptions == transcriptions).all()

    # Generate attack
    x_adv = asr_attack.generate(self.x, self.y)

    # Test shape
    for i in range(3):
        assert x_adv[i].shape == self.x[i].shape