def test_supported_modes_property(self):
    augment = Compose(
        transforms=[
            PeakNormalization(p=1.0),
        ],
    )
    assert augment.supported_modes == {"per_batch", "per_example", "per_channel"}

    augment = Compose(
        transforms=[PeakNormalization(p=1.0), ShuffleChannels(p=1.0)],
    )
    assert augment.supported_modes == {"per_example"}
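As the assertions above show, a Compose only advertises the modes that every transform it contains supports, so adding ShuffleChannels shrinks the set to "per_example". A minimal sketch of using that property to pick the most granular mode a given pipeline can run in (the preference order is our own choice, not part of the library API):

from torch_audiomentations import Compose, PeakNormalization, ShuffleChannels

augment = Compose(transforms=[PeakNormalization(p=1.0), ShuffleChannels(p=1.0)])

# Our own preference order, most granular first (an assumption, not a library constant).
preferred_order = ["per_channel", "per_example", "per_batch"]
chosen_mode = next(m for m in preferred_order if m in augment.supported_modes)
assert chosen_mode == "per_example"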
def test_shuffle(self):
    random.seed(42)
    samples = np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32)
    sample_rate = 16000

    augment = Compose(
        transforms=[
            Gain(min_gain_in_db=-18.0, max_gain_in_db=-16.0, p=1.0),
            PeakNormalization(p=1.0),
        ],
        shuffle=True,
        output_type="dict",
    )
    num_peak_normalization_last = 0
    num_gain_last = 0
    for i in range(100):
        processed_samples = augment(
            samples=torch.from_numpy(samples), sample_rate=sample_rate
        ).samples.numpy()

        # Either PeakNormalization or Gain was applied last
        if processed_samples[0, 0, 0] < 0.2:
            num_gain_last += 1
        elif processed_samples[0, 0, 0] == 1.0:
            num_peak_normalization_last += 1
        else:
            raise AssertionError("Unexpected value!")

    self.assertGreater(num_peak_normalization_last, 10)
    self.assertGreater(num_gain_last, 10)
def training_step(self, batch, batch_nb):
    mix, source = batch
    apply_augmentation = Compose(
        transforms=[
            Gain(min_gain_in_db=-15.0, max_gain_in_db=5.0, p=0.5, mode="per_channel")
        ]
    )
    source = apply_augmentation(source, sample_rate=22050)
    loss = self.common_step((mix, source), batch_nb, train=True)
    self.log("loss", loss, logger=True)
    return loss
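The snippet above rebuilds the Compose on every training step. A minimal sketch of constructing it once in __init__ instead, assuming a PyTorch Lightning style module like the one implied by self.log, and keeping the common_step helper from the original snippet as an assumption:

import pytorch_lightning as pl
from torch_audiomentations import Compose, Gain

class AugmentedModel(pl.LightningModule):  # hypothetical module name
    def __init__(self):
        super().__init__()
        # Build the augmentation pipeline once instead of on every step.
        self.apply_augmentation = Compose(
            transforms=[
                Gain(min_gain_in_db=-15.0, max_gain_in_db=5.0, p=0.5, mode="per_channel")
            ]
        )

    def training_step(self, batch, batch_nb):
        mix, source = batch
        source = self.apply_augmentation(source, sample_rate=22050)
        loss = self.common_step((mix, source), batch_nb, train=True)  # assumed helper
        self.log("loss", loss, logger=True)
        return loss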
def test_freeze_and_unfreeze_parameters(self):
    torch.manual_seed(42)
    samples = np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32)
    sample_rate = 16000

    augment = Compose(
        transforms=[
            Gain(min_gain_in_db=-16.000001, max_gain_in_db=-2, p=1.0),
            PolarityInversion(p=1.0),
        ],
        output_type="dict",
    )

    processed_samples1 = augment(
        samples=torch.from_numpy(samples), sample_rate=sample_rate
    ).samples.numpy()
    augment.freeze_parameters()
    processed_samples2 = augment(
        samples=torch.from_numpy(samples), sample_rate=sample_rate
    ).samples.numpy()
    assert_array_equal(processed_samples1, processed_samples2)

    augment.unfreeze_parameters()
    processed_samples3 = augment(
        samples=torch.from_numpy(samples), sample_rate=sample_rate
    ).samples.numpy()
    self.assertNotEqual(processed_samples1[0, 0, 0], processed_samples3[0, 0, 0])
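One practical use of freeze_parameters / unfreeze_parameters, sketched under the assumption (consistent with the test above) that a frozen Compose reuses the transform parameters randomized in its most recent call: applying an identical augmentation to a pair of same-shaped tensors, e.g. an input and its target.

import torch
from torch_audiomentations import Compose, Gain, PolarityInversion

augment = Compose(
    transforms=[
        Gain(min_gain_in_db=-12.0, max_gain_in_db=-6.0, p=1.0),
        PolarityInversion(p=1.0),
    ],
    output_type="dict",
)

# Hypothetical paired tensors of the same shape (batch, channels, samples).
noisy = torch.rand(4, 1, 16000) - 0.5
clean = torch.rand(4, 1, 16000) - 0.5

noisy_aug = augment(samples=noisy, sample_rate=16000).samples
augment.freeze_parameters()    # reuse the parameters just randomized
clean_aug = augment(samples=clean, sample_rate=16000).samples
augment.unfreeze_parameters()  # go back to re-randomizing on each call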
def test_splice_out_odd_hann(self):
    audio_samples = torch.rand(size=(8, 1, 32000), dtype=torch.float32)
    augment = Compose(
        [SpliceOut(num_time_intervals=10, max_width=400, output_type="dict")],
        output_type="dict",
    )
    splice_out_samples = augment(
        samples=audio_samples, sample_rate=16100
    ).samples.numpy()

    assert splice_out_samples.dtype == np.float32
def test_compose_with_p_zero(self):
    samples = np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32)
    sample_rate = 16000

    augment = Compose(
        transforms=[
            Gain(min_gain_in_db=-6.000001, max_gain_in_db=-6, p=1.0),
            PolarityInversion(p=1.0),
        ],
        p=0.0,
    )
    processed_samples = augment(
        samples=torch.from_numpy(samples), sample_rate=sample_rate
    ).numpy()
    assert_array_equal(samples, processed_samples)
def test_splice_out_multichannel(self):
    audio_samples = torch.rand(size=(8, 2, 32000), dtype=torch.float32)
    augment = Compose(
        [SpliceOut(num_time_intervals=10, max_width=400, output_type="dict")],
        output_type="dict",
    )
    splice_out_samples = augment(
        samples=audio_samples, sample_rate=16000
    ).samples.numpy()

    assert splice_out_samples.dtype == np.float32
    self.assertLess(splice_out_samples.sum(), audio_samples.numpy().sum())
    self.assertEqual(splice_out_samples.shape, audio_samples.shape)
def test_compose_with_torchaudio_transform(self):
    samples = np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32)
    sample_rate = 16000

    augment = Compose([Vol(gain=-6, gain_type="db"), PolarityInversion(p=1.0)])
    processed_samples = augment(
        samples=torch.from_numpy(samples), sample_rate=sample_rate
    ).numpy()
    expected_factor = -convert_decibels_to_amplitude_ratio(-6)
    assert_almost_equal(
        processed_samples,
        expected_factor
        * np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32),
        decimal=6,
    )
    self.assertEqual(processed_samples.dtype, np.float32)
def test_splice_out_cuda(self):
    audio_samples = (
        torch.rand(
            size=(8, 1, 32000), dtype=torch.float32, device=torch.device("cuda")
        )
        - 0.5
    )
    augment = Compose(
        [SpliceOut(num_time_intervals=10, max_width=400, output_type="dict")],
        output_type="dict",
    )
    splice_out_samples = (
        augment(samples=audio_samples, sample_rate=16000).samples.cpu().numpy()
    )

    assert splice_out_samples.dtype == np.float32
    self.assertLess(splice_out_samples.sum(), audio_samples.cpu().numpy().sum())
def test_compose(self):
    samples = np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32)
    sample_rate = 16000

    augment = Compose(
        [
            Gain(min_gain_in_db=-6.000001, max_gain_in_db=-6, p=1.0),
            PolarityInversion(p=1.0),
        ]
    )
    processed_samples = augment(
        samples=torch.from_numpy(samples), sample_rate=sample_rate
    ).numpy()
    expected_factor = -convert_decibels_to_amplitude_ratio(-6)
    assert_almost_equal(
        processed_samples,
        expected_factor
        * np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32),
        decimal=6,
    )
    self.assertEqual(processed_samples.dtype, np.float32)
def training_step(self, batch, batch_nb):
    apply_augmentation = Compose(
        transforms=[
            Gain(
                min_gain_in_db=-15.0,
                max_gain_in_db=5.0,
                p=0.5,
                mode="per_channel",
            ),
            ShuffleChannels(mode="per_example"),
            PitchShift(
                min_transpose_semitones=-2,
                max_transpose_semitones=2,
                p=0.5,
                mode="per_example",
                sample_rate=44100,
            ),
        ]
    )
    batch = apply_augmentation(batch, sample_rate=44100)
    loss = self.common_step(batch, batch_nb, train=True)
    self.log("loss", loss, logger=True)
    return loss
def test_compose_without_specifying_output_type(self):
    samples = np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32)
    sample_rate = 16000

    augment = Compose(
        [
            Gain(min_gain_in_db=-6.000001, max_gain_in_db=-6, p=1.0),
            PolarityInversion(p=1.0),
        ]
    )
    processed_samples = augment(
        samples=torch.from_numpy(samples), sample_rate=sample_rate
    )
    # The return type should be torch.Tensor until we switch to ObjectDict as the default
    assert type(processed_samples) == torch.Tensor
    processed_samples = processed_samples.numpy()
    expected_factor = -convert_decibels_to_amplitude_ratio(-6)
    assert_almost_equal(
        processed_samples,
        expected_factor
        * np.array([[[1.0, 0.5, -0.25, -0.125, 0.0]]], dtype=np.float32),
        decimal=6,
    )
    self.assertEqual(processed_samples.dtype, np.float32)
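The tests above mix two call conventions: with output_type left at its current default, Compose returns the augmented tensor directly, while output_type="dict" returns an object whose .samples attribute holds the tensor. A minimal sketch of the two styles side by side, assuming only the behavior exercised by these tests:

import torch
from torch_audiomentations import Compose, Gain

samples = torch.rand(2, 1, 16000) - 0.5

# Current default: the augmented tensor is returned directly.
tensor_out = Compose([Gain(p=1.0)])(samples, sample_rate=16000)

# With output_type="dict": access the tensor via the .samples attribute.
dict_out = Compose([Gain(p=1.0)], output_type="dict")(samples, sample_rate=16000).samples

assert tensor_out.shape == dict_out.shape == samples.shape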
)

BG_NOISE_PATH = TEST_FIXTURES_DIR / "bg"
IR_PATH = TEST_FIXTURES_DIR / "ir"


@pytest.mark.parametrize(
    "augment",
    [
        # Differentiable transforms:
        AddBackgroundNoise(BG_NOISE_PATH, 20, p=1.0, output_type="dict"),
        ApplyImpulseResponse(IR_PATH, p=1.0, output_type="dict"),
        Compose(
            transforms=[
                Gain(min_gain_in_db=-15.0, max_gain_in_db=5.0, p=1.0),
                PolarityInversion(p=1.0),
            ],
            output_type="dict",
        ),
        Gain(min_gain_in_db=-6.000001, max_gain_in_db=-6, p=1.0, output_type="dict"),
        PolarityInversion(p=1.0, output_type="dict"),
        Shift(p=1.0, output_type="dict"),
        # Non-differentiable transforms:
        # RuntimeError: one of the variables needed for gradient computation has been
        # modified by an inplace operation: [torch.DoubleTensor [1, 1, 5]], which is
        # output 0 of IndexBackward, is at version 1; expected version 0 instead.
        # Hint: enable anomaly detection to find the operation that failed to compute
        # its gradient, with torch.autograd.set_detect_anomaly(True).
        pytest.param(
    LowPassFilter,
    HighPassFilter,
)

BG_NOISE_PATH = TEST_FIXTURES_DIR / "bg"
IR_PATH = TEST_FIXTURES_DIR / "ir"


@pytest.mark.parametrize(
    "augment",
    [
        # Differentiable transforms:
        AddBackgroundNoise(BG_NOISE_PATH, 20, p=1.0),
        ApplyImpulseResponse(IR_PATH, p=1.0),
        Compose(
            transforms=[
                Gain(min_gain_in_db=-15.0, max_gain_in_db=5.0, p=1.0),
                PolarityInversion(p=1.0),
            ]
        ),
        Gain(min_gain_in_db=-6.000001, max_gain_in_db=-6, p=1.0),
        PolarityInversion(p=1.0),
        Shift(p=1.0),
        # Non-differentiable transforms:
        # RuntimeError: one of the variables needed for gradient computation has been
        # modified by an inplace operation: [torch.DoubleTensor [1, 1, 5]], which is
        # output 0 of IndexBackward, is at version 1; expected version 0 instead.
        # Hint: enable anomaly detection to find the operation that failed to compute
        # its gradient, with torch.autograd.set_detect_anomaly(True).
        pytest.param(HighPassFilter(p=1.0), marks=pytest.mark.skip("Not differentiable")),
        pytest.param(LowPassFilter(p=1.0), marks=pytest.mark.skip("Not differentiable")),
        pytest.param(PeakNormalization(p=1.0), marks=pytest.mark.skip("Not differentiable")),
            background_paths=TEST_FIXTURES_DIR / "bg", mode=mode, p=1.0
        ),
        "num_runs": 5,
    },
    {
        "instance": ApplyImpulseResponse(
            ir_paths=TEST_FIXTURES_DIR / "ir", mode=mode, p=1.0
        ),
        "num_runs": 1,
    },
    {
        "instance": Compose(
            transforms=[
                Gain(min_gain_in_db=-18.0, max_gain_in_db=-16.0, mode=mode, p=1.0),
                PeakNormalization(mode=mode, p=1.0),
            ],
            shuffle=True,
        ),
        "name": "Shuffled Compose with Gain and PeakNormalization",
        "num_runs": 5,
    },
    {
        "instance": Compose(
            transforms=[
                Gain(min_gain_in_db=-18.0, max_gain_in_db=-16.0, mode=mode, p=0.5),
                PolarityInversion(mode=mode, p=0.5),
            ],
import torch
import torchaudio
from torch_audiomentations import Compose, Gain, PolarityInversion

# Initialize augmentation callable
apply_augmentation = Compose(
    transforms=[
        Gain(
            min_gain_in_db=-15.0,
            max_gain_in_db=40.0,
            p=1.0,
        ),
        PolarityInversion(p=0.0),
    ]
)

torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Make an example tensor with white noise.
# This tensor represents 8 audio snippets with 2 channels (stereo) and 2 s of 16 kHz audio.
# audio_samples = torch.rand(size=(8, 2, 32000), dtype=torch.float32, device=torch_device) - 0.5
audio_samples = torchaudio.load(
    "/Users/bdubel/Documents/ZHAW/BA/data/eth_ch_dialects/ag/ch_ag_0107.wav"
)

# Apply augmentation. This varies the gain and polarity of (some of)
# the audio snippets in the batch independently.
# torch-audiomentations expects a 3D tensor shaped (batch_size, num_channels, num_samples),
# so add a batch dimension to the loaded (num_channels, num_samples) waveform.
perturbed_audio_samples = apply_augmentation(
    audio_samples[0].unsqueeze(0), sample_rate=16000
)

# Drop the batch dimension again before saving the single augmented waveform.
torchaudio.save(
    "/Users/bdubel/Documents/ZHAW/BA/data/swiss_all/perturbation/test1.flac",
    perturbed_audio_samples.squeeze(0),
    sample_rate=16000,
)
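A short variant of the snippet above that reuses the sample rate reported by torchaudio.load instead of assuming 16 kHz; the file paths are the same placeholders used above, and apply_augmentation is the Compose defined there.

# Keep the file's own sample rate rather than hard-coding it.
waveform, file_sample_rate = torchaudio.load(
    "/Users/bdubel/Documents/ZHAW/BA/data/eth_ch_dialects/ag/ch_ag_0107.wav"
)
perturbed = apply_augmentation(waveform.unsqueeze(0), sample_rate=file_sample_rate)
torchaudio.save(
    "/Users/bdubel/Documents/ZHAW/BA/data/swiss_all/perturbation/test1.flac",
    perturbed.squeeze(0),
    sample_rate=file_sample_rate,
)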