Exemplo n.º 1
0
    def __call__(self, x, sr):
        """Randomly apply reverb, additive noise and a pitch shift to ``x``.

        Three independent Bernoulli draws with probability ``self.p`` decide
        which effects run. The selected effects are applied in the fixed
        order reverb -> noise -> pitch, each through its own
        ``augment.EffectChain``.

        :param x: audio tensor handed to ``EffectChain.apply``.
        :param sr: sampling rate, forwarded as ``src_info['rate']``.
        :return: the tuple ``(x, sr)``, where ``x`` may have been augmented.
        """
        use_reverb, use_noise, use_pitch = np.random.binomial(1,
                                                              p=self.p,
                                                              size=3)

        if use_reverb:
            # A single random value from randint(0, self.reverb) is reused
            # for all three reverb arguments.
            strength = np.random.randint(0, self.reverb)
            reverb_chain = augment.EffectChain().reverb(
                strength, strength, strength).channels(2)
            x = reverb_chain.apply(x, src_info={'rate': sr})

        if use_noise:
            # Uniform noise with the same shape as the current signal,
            # mixed in at the configured SNR.
            def uniform_noise():
                return torch.zeros_like(x).uniform_()

            noise_chain = augment.EffectChain().additive_noise(uniform_noise,
                                                               snr=self.snr)
            x = noise_chain.apply(x, src_info={'rate': sr})

        if use_pitch:
            # NOTE: the original author observed that the pitch effect can
            # change the shape of the output; the cause was not identified.
            shift = np.random.randint(-self.pitch, self.pitch)
            pitch_chain = augment.EffectChain().pitch(shift).rate(sr)
            x = pitch_chain.apply(x, src_info={'rate': sr})

        return x, sr
Exemplo n.º 2
0
def test_additive_noise():
    """Mixing a signal with itself as the "noise" must leave it unchanged.

    The noise generator returns the input tensor itself, so the output of
    the ``additive_noise`` effect is expected to be numerically close to
    the input.
    """
    x, _ = torchaudio.load(test_wav)

    src_info = {
        'channels': 1,
        'length': x.size(1),
        'precision': 32,
        'rate': 16000.0,
        'bits_per_sample': 32
    }

    # Same stream description as the source, except the length is given as 0.
    target_info = {**src_info, 'length': 0}

    chain = augment.EffectChain().additive_noise(noise_generator=lambda: x,
                                                 snr=10.0)
    y = chain.apply(x, src_info=src_info, target_info=target_info)

    assert torch.allclose(x, y)
Exemplo n.º 3
0
def test_non_empty_chain():
    """A chain with a ``bandreject`` effect keeps the size but alters samples."""
    x, _ = torchaudio.load(test_wav)

    def stream_info(length):
        # Mono, 32-bit, 16 kHz stream description with the given length.
        return {
            'channels': 1,
            'length': length,
            'precision': 32,
            'rate': 16000.0,
            'bits_per_sample': 32
        }

    src_info = stream_info(x.size(1))
    target_info = stream_info(0)

    chain = augment.EffectChain().bandreject(1, 20000)
    y = chain.apply(x, src_info=src_info, target_info=target_info)

    assert x.size() == y.size(), f'{y.size()}'
    assert not x.allclose(y)
Exemplo n.º 4
0
def augmentation_factory(description, sampling_rate, args):
    """Build an ``augment.EffectChain`` from a comma-separated effect list.

    Recognised effect names are ``bandreject``, ``pitch``, ``reverb``,
    ``time_drop``, ``clip`` and ``none`` (which adds nothing).

    :param description: comma-separated effect names, e.g. ``"pitch,reverb"``.
    :param sampling_rate: sampling rate passed to the band and rate effects.
    :param args: namespace-like object providing the per-effect settings
        read below (``band_scaler``, ``pitch_shift_max``, ``pitch_quick``,
        the reverb min/max values, ``t_ms``, ``clip_min``, ``clip_max``).
    :return: the configured effect chain.
    :raises RuntimeError: if an unknown effect name is encountered.
    """
    chain = augment.EffectChain()

    for effect in description.split(','):
        if effect == 'none':
            continue

        if effect == 'bandreject':
            band = SpecAugmentBand(sampling_rate, args.band_scaler)
            chain = chain.sinc('-a', '120', band)
        elif effect == 'pitch':
            shift = RandomPitchShift(args.pitch_shift_max)
            # With `pitch_quick`, both pitch and rate get sox's '-q' option.
            quick = ('-q', ) if args.pitch_quick else ()
            chain = chain.pitch(*quick, shift).rate(*quick, sampling_rate)
        elif effect == 'reverb':
            reverb_params = RandomReverb(
                args.reverberance_min, args.reverberance_max,
                args.damping_min, args.damping_max,
                args.room_scale_min, args.room_scale_max)
            chain = chain.reverb(reverb_params).channels()
        elif effect == 'time_drop':
            # `t_ms` is divided by 1000 to obtain `max_seconds`.
            chain = chain.time_dropout(max_seconds=args.t_ms / 1000.0)
        elif effect == 'clip':
            clip_factor = RandomClipFactor(args.clip_min, args.clip_max)
            chain = chain.clip(clip_factor)
        else:
            raise RuntimeError(f'Unknown augmentation type {effect}')

    return chain
Exemplo n.º 5
0
def test_stochastic_pitch():
    """A pitch effect driven by a random callable must change the signal."""
    x, sr = torchaudio.load(test_wav)
    assert sr == 16000

    shared = {'precision': 32, 'rate': 16000.0, 'bits_per_sample': 32}
    src_info = {'channels': x.size(0), 'length': x.size(1), **shared}
    target_info = {'channels': 1, 'length': 0, **shared}

    def draw_shift():
        # Random pitch-shift value from randint(100, 500), drawn on each call.
        return np.random.randint(100, 500)

    chain = augment.EffectChain().pitch(draw_shift).rate(16000)
    y = chain.apply(x, src_info=src_info, target_info=target_info)

    assert not torch.allclose(x, y, rtol=1e-3, atol=1e-3)
Exemplo n.º 6
0
def reverb(*args, **kwargs):
    """
    Returns a reverb effect chain for wav augmentation.

    Positional and keyword arguments are accepted but not used.
    """
    import augment
    chain = augment.EffectChain()
    # Reverb turns the signal into two channels; calling `channels` without
    # parameters combines them back into one.
    reverberated = chain.reverb(50, 50, _random_room_size)
    reverberated.channels()
    return chain
Exemplo n.º 7
0
def test_bandreject():
    """The ``sinc`` effect must match the output of the sox command line."""
    expected, _ = run_sox_command(test_wav,
                                  ["sinc", "-a", "120", "2000-1000"])

    sinc_chain = augment.EffectChain().sinc("-a", "120", "2000-1000")
    actual = apply_chain(test_wav, sinc_chain)

    assert expected.size() == actual.size()

    # NB: the tolerance is loosened because of the discretization that
    # happens on save/load.
    assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-4)
Exemplo n.º 8
0
def test_reverb():
    """The ``reverb`` effect must match the output of the sox command line."""
    expected, _ = run_sox_command(test_wav, ["reverb", "50", "50", "100"])

    reverb_chain = augment.EffectChain().reverb(50, 50, 100).channels()
    actual = apply_chain(test_wav, reverb_chain)

    assert expected.size() == actual.size()

    # NB: the tolerance is loosened because of the discretization that
    # happens on save/load.
    assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-4)
Exemplo n.º 9
0
def test_pitch():
    """The ``pitch`` effect must match the output of the sox command line."""
    expected, _ = run_sox_command(test_wav, ["pitch", "-100"])

    pitch_chain = augment.EffectChain().pitch(-100).rate(16000)
    actual = apply_chain(test_wav, pitch_chain)

    assert expected.size() == actual.size()

    # NB: the tolerance is loosened because of the discretization that
    # happens on save/load.
    assert torch.allclose(expected, actual, rtol=1e-4, atol=1e-4)
Exemplo n.º 10
0
def pitch(sampling_rate: int):
    """
    Returns a pitch modification effect chain for wav augmentation.

    :param sampling_rate: the sampling rate the chain resamples to after the
        pitch effect (resampling is needed for pitch).
    """
    import augment
    chain = augment.EffectChain()
    # The pitch effect changes the sampling rate, so a `rate` effect follows
    # to compensate. Both get the 'quick' option ("-q") to speed things up.
    pitched = chain.pitch("-q", _random_pitch_shift)
    pitched.rate("-q", sampling_rate)
    return chain
Exemplo n.º 11
0
def test_empty_chain():
    """An effect chain without effects must return the input unchanged."""
    x = torch.arange(8000).float()

    common = {'channels': 1,
              'precision': 32,
              'rate': 16000.0,
              'bits_per_sample': 32}
    src_info = {**common, 'length': x.size(0)}
    target_info = {**common, 'length': 0}

    empty_chain = augment.EffectChain()
    y = empty_chain.apply(x, src_info=src_info, target_info=target_info)

    # Compare flattened views so only the sample values matter.
    assert torch.allclose(x.view(-1), y.view(-1))
    def forward(self, audio):
        """Apply a reverb with randomly drawn parameters to ``audio``.

        Reverberance, damping factor and room size are each drawn with
        ``torch.randint`` from the corresponding ``self.*_min`` /
        ``self.*_max`` attributes (the upper bound is exclusive).

        :param audio: tensor passed to ``EffectChain.apply``.
        :return: the output of the effect chain.
        """

        def draw(low, high):
            # One integer sample from torch.randint(low, high) as a Python number.
            return torch.randint(low, high, size=(1, )).item()

        reverberance = draw(self.reverberance_min, self.reverberance_max)
        damping = draw(self.dumping_factor_min, self.dumping_factor_max)
        room_size = draw(self.room_size_min, self.room_size_max)

        chain = augment.EffectChain()
        chain = chain.reverb(reverberance, damping, room_size)
        chain = chain.channels(1)

        return chain.apply(audio,
                           src_info=self.src_info,
                           target_info=self.target_info)
Exemplo n.º 13
0
def pitch_reverb_tdrop(sampling_rate: int):
    """
    Returns an effect chain composed of pitch modification, reverberation and time dropout, as proposed in:

    * https://github.com/facebookresearch/WavAugment/blob/master/examples/python/librispeech_selfsupervised.py#L152
    * https://arxiv.org/abs/2007.00991

    :param sampling_rate: the sampling rate the chain resamples to after the
        pitch effect (resampling is needed for pitch).
    """
    import augment
    chain = augment.EffectChain()

    # The pitch effect changes the sampling rate, so a `rate` effect follows
    # to compensate. Both get the 'quick' option ("-q") to speed things up.
    pitched = chain.pitch("-q", _random_pitch_shift)
    pitched.rate("-q", sampling_rate)

    # Reverb turns the signal into two channels; calling `channels` without
    # parameters combines them back into one.
    reverberated = chain.reverb(50, 50, _random_room_size)
    reverberated.channels()

    # Finally, randomly drop one subsequence of up to 50 ms.
    max_dropout_seconds = 50 / 1000
    chain.time_dropout(max_seconds=max_dropout_seconds)
    return chain
Exemplo n.º 14
0
def convert_pitch_augment(test_wav):
    """Load ``test_wav`` and return it pitch-shifted by 100 with its sample rate.

    The file is asserted to be sampled at 16 kHz; after the pitch effect a
    ``rate(16000)`` effect is applied.

    :param test_wav: path handed to ``torchaudio.load``.
    :return: the tuple ``(y, sr)`` of the augmented tensor and the sample rate.
    """
    x, sr = torchaudio.load(test_wav)
    assert sr == 16000

    shared = {'precision': 32, 'rate': 16000.0, 'bits_per_sample': 32}
    src_info = {'channels': x.size(0), 'length': x.size(1), **shared}
    target_info = {'channels': 1, 'length': 0, **shared}

    chain = augment.EffectChain().pitch(100).rate(16000)
    y = chain.apply(x, src_info=src_info, target_info=target_info)
    return y, sr
Exemplo n.º 15
0
    def __call__(self, audio):
        """Apply a random pitch shift and return audio of the original length.

        A shift is drawn uniformly from ``[self.pitch_cents_min,
        self.pitch_cents_max]`` (inclusive, via ``random.randint``) and
        applied with a ``pitch`` effect followed by a ``rate`` effect at
        ``self.sample_rate``. The result is then truncated or zero-padded
        along dimension 1 so that it has as many samples as ``audio``.

        :param audio: 2-D tensor; dimension 1 is used as the sample axis.
        :return: the augmented tensor, or a clone of ``audio`` if the effect
            chain produced NaN/inf values.
        """
        n_steps = random.randint(self.pitch_cents_min, self.pitch_cents_max)
        effect_chain = augment.EffectChain().pitch(n_steps).rate(self.sample_rate)

        y = effect_chain.apply(
            audio, src_info=self.src_info, target_info=self.target_info
        )

        # sox might misbehave sometimes by giving nan/inf if sequences are too
        # short (or silent) and the effect chain includes e.g. `pitch`.
        if torch.isnan(y).any() or torch.isinf(y).any():
            return audio.clone()

        target_len = audio.shape[1]
        if y.shape[1] > target_len:
            # Slice rather than index: `y[:, target_len]` would pick a single
            # column (dropping a dimension) instead of truncating the output.
            y = y[:, :target_len]
        elif y.shape[1] < target_len:
            # Zero-pad on the right; the buffer takes its channel count,
            # dtype and device from the effect output.
            padded = y.new_zeros(y.shape[0], target_len)
            padded[:, :y.shape[1]] = y
            y = padded
        return y
Exemplo n.º 16
0
    def __init__(
        self,
        manifest_path,
        sample_rate,
        max_sample_size=None,
        min_sample_size=None,
        shuffle=True,
        min_length=0,
        pad=False,
        normalize=False,
    ):
        """Initialise the base dataset and set up the augmentation transforms.

        All constructor arguments are forwarded unchanged to the parent
        class. Two transforms are then attached to the instance:

        * ``self.pre_transform`` - a ``Compose`` of a frequency mask and a
          time mask, each applied with probability 0.5;
        * ``self.post_transform`` - an ``augment.EffectChain`` with random
          reverb, a mix-down to one channel and random clipping.
        """
        base_kwargs = dict(
            manifest_path=manifest_path,
            sample_rate=sample_rate,
            max_sample_size=max_sample_size,
            min_sample_size=min_sample_size,
            shuffle=shuffle,
            min_length=min_length,
            pad=pad,
            normalize=normalize,
        )
        super(AugmentedFileAudioDataset, self).__init__(**base_kwargs)

        # Only the two masking transforms are active; Gaussian noise, pitch
        # shift and clipping distortion were disabled by the original author.
        frequency_mask = FrequencyMask(min_frequency_band=0.0,
                                       max_frequency_band=0.05,
                                       p=0.5)
        time_mask = TimeMask(min_band_part=0.0, max_band_part=0.05, p=0.5)
        self.pre_transform = Compose([frequency_mask, time_mask])

        random_reverb = RandomReverb()
        random_clip = RandomClip()
        # Constructed but not wired into the chain: the time-dropout step is
        # currently disabled.
        random_time_dropout = RandomTimeDropout()

        post_chain = augment.EffectChain().reverb(random_reverb)
        post_chain = post_chain.channels(1)
        self.post_transform = post_chain.clip(random_clip)
Exemplo n.º 17
0
            "train-clean-360", "train-other-500"], default='dev-clean', help='Librispeech subset to use')
    parser.add_argument('--sequence_length_seconds', type=int, default=1, help='Sample sequence length')
    parser.add_argument('--batch_size', type=int, default=32, help="Batch size")
    parser.add_argument('--n_workers', type=int, default=8, help="Number of parallel workers to read/preprocess data")
    parser.add_argument('--n_epochs', type=int, default=3, help="Number of epochs to run")
    parser.add_argument('--dump', action="store_true", help="Dump examples of (non)augmented sequences."
                                    "They would be saved in 'original.wav' and 'augmented.wav'")


    args = parser.parse_args()
    return args

if __name__ == '__main__':
    # Script entry point: parse the command-line options, then build the
    # effect chain used for the augmented ("past") copy of each sequence.
    args = get_args()

    effect_chain_past = augment.EffectChain()
    # The pitch effect changes the sampling rate; we have to compensate for that.
    # Here, we specify the 'quick' option on both the pitch and rate effects to speed things up.
    effect_chain_past.pitch("-q", random_pitch_shift).rate("-q", 16_000)
    # The next effect we add is `reverb`; it makes the signal have two channels,
    # which we combine into one by running `channels` without parameters.
    effect_chain_past.reverb(50, 50, random_room_size).channels()
    # Further, we add an effect that randomly drops one 50 ms subsequence.
    effect_chain_past.time_dropout(max_seconds=50 / 1000)

    effect_chain_past_runner = ChainRunner(effect_chain_past)

    # The second, `future` copy is left non-augmented: no chain and no runner.
    effect_chain_future = None
    effect_chain_future_runner = None