Example #1
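This and the following examples use NVIDIA DALI's functional API. As a minimal sketch (not part of the original snippets), they are assumed to run against imports along these lines; `win_args`, `torch_windows`, `dali_reflect_pad_graph`, and `dali_frame_splicing_graph` in the later examples are project-local helpers, not DALI operators:

import math
import numpy as np
from nvidia.dali import Pipeline, fn, types
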
def mfcc_input_pipeline(source):
    data = fn.external_source(source=source)
    spectrum = fn.spectrogram(data, nfft=60, window_length=50, window_step=25)
    mel = fn.mel_filter_bank(spectrum)
    dec = fn.to_decibels(mel)

    return dec
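
A minimal usage sketch for the function above, assuming a hypothetical `my_source` callable or iterable that yields batches of 1D float32 audio arrays:

pipe = Pipeline(batch_size=8, num_threads=4, device_id=0)
with pipe:
    pipe.set_outputs(mfcc_input_pipeline(my_source))
pipe.build()
(log_mel,) = pipe.run()
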
Example #2
def pipe(max_batch_size, input_data, device):
    pipe = Pipeline(batch_size=max_batch_size, num_threads=4, device_id=0)
    with pipe:
        data = fn.external_source(source=input_data, cycle=False, device=device)
        spectrum = fn.spectrogram(data, nfft=60, window_length=50, window_step=25)
        processed = fn.mel_filter_bank(spectrum)
        pipe.set_outputs(processed)
    return pipe
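
A sketch of driving the pipeline above; `input_data` is assumed to be a list of batches, each batch a list of 1D numpy float32 waveforms:

input_data = [[np.random.ranf(1000).astype(np.float32) for _ in range(2)]
              for _ in range(4)]
p = pipe(max_batch_size=2, input_data=input_data, device="cpu")
p.build()
(mel,) = p.run()  # each call consumes one batch from input_data
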
Example #3
def test_mel_filter_bank_cpu():
    pipe = Pipeline(batch_size=batch_size, num_threads=4, device_id=None)
    with pipe:
        data = fn.external_source(source=get_audio_data)
        spectrum = fn.spectrogram(data, nfft=60, window_length=50, window_step=25)
        processed = fn.mel_filter_bank(spectrum)
        pipe.set_outputs(processed)
    pipe.build()
    for _ in range(3):
        pipe.run()
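
The test refers to module-level `batch_size` and `get_audio_data`; a minimal sketch of what they might look like:

batch_size = 4

def get_audio_data():
    # One batch of random 1D waveforms of varying length.
    return [np.random.ranf(np.random.randint(100, 1000)).astype(np.float32)
            for _ in range(batch_size)]
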
Example #4
def rnnt_train_pipe(files,
                    sample_rate,
                    pad_amount=0,
                    preemph_coeff=.97,
                    window_size=.02,
                    window_stride=.01,
                    window="hann",
                    nfeatures=64,
                    nfft=512,
                    frame_splicing_stack=1,
                    frame_splicing_subsample=1,
                    lowfreq=0.0,
                    highfreq=None,
                    normalize_type='per_feature',
                    speed_perturb=False,
                    silence_trim=False,
                    device='cpu'):
    assert normalize_type == 'per_feature' or normalize_type == 'all_features'
    norm_axes = [1] if normalize_type == 'per_feature' else [0, 1]
    win_len, win_hop = win_args(sample_rate, window_size, window_stride)
    window_fn = torch_windows.get(window, None)
    window_fn_arg = window_fn(
        win_len, periodic=False).numpy().tolist() if window_fn else None

    data, _ = fn.readers.file(files=files,
                              device="cpu",
                              random_shuffle=False,
                              shard_id=0,
                              num_shards=1)
    audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

    # splicing with subsampling doesn't work if audio_len is a GPU data node
    if device == 'gpu' and frame_splicing_subsample == 1:
        audio = audio.gpu()

    # Speed perturbation 0.85x - 1.15x
    if speed_perturb:
        target_sr_factor = fn.random.uniform(device="cpu",
                                             range=(1 / 1.15, 1 / 0.85))
        audio = fn.experimental.audio_resample(audio, scale=target_sr_factor)

    # Silence trimming
    if silence_trim:
        begin, length = fn.nonsilent_region(audio, cutoff_db=-80)
        audio = fn.slice(audio, begin, length, axes=[0])

    audio_shape = fn.shapes(audio, dtype=types.INT32)
    orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0, ))

    # If we couldn't move to GPU earlier, do it now
    if device == 'gpu' and frame_splicing_subsample > 1:
        audio = audio.gpu()

    if pad_amount > 0:
        audio_len = orig_audio_len + 2 * pad_amount
        padded_audio = dali_reflect_pad_graph(audio, orig_audio_len,
                                              pad_amount)
    else:
        audio_len = orig_audio_len
        padded_audio = audio

    # Preemphasis filter
    preemph_audio = fn.preemphasis_filter(padded_audio,
                                          preemph_coeff=preemph_coeff,
                                          border='zero')

    # Spectrogram
    spec_len = audio_len // win_hop + 1
    spec = fn.spectrogram(preemph_audio,
                          nfft=nfft,
                          window_fn=window_fn_arg,
                          window_length=win_len,
                          window_step=win_hop,
                          center_windows=True,
                          reflect_padding=True)
    # Mel spectrogram
    mel_spec = fn.mel_filter_bank(spec,
                                  sample_rate=sample_rate,
                                  nfilter=nfeatures,
                                  freq_low=lowfreq,
                                  freq_high=highfreq)

    # Log
    log_features = fn.to_decibels(mel_spec + 1e-20,
                                  multiplier=np.log(10),
                                  reference=1.0,
                                  cutoff_db=-80)

    # Frame splicing
    if frame_splicing_stack > 1 or frame_splicing_subsample > 1:
        log_features_spliced = dali_frame_splicing_graph(
            log_features,
            nfeatures,
            spec_len,
            stacking=frame_splicing_stack,
            subsampling=frame_splicing_subsample)
    else:
        log_features_spliced = log_features

    # Normalization
    if normalize_type:
        norm_log_features = fn.normalize(log_features_spliced,
                                         axes=norm_axes,
                                         device=device,
                                         epsilon=4e-5,
                                         ddof=1)
    else:
        norm_log_features = log_features_spliced

    return (norm_log_features, log_features_spliced, log_features, mel_spec,
            spec, preemph_audio, padded_audio, audio)
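
`win_args` and `dali_reflect_pad_graph` above come from the surrounding project. As an illustration only (assumptions, not the project's verbatim code), `win_args` plausibly converts the window size and stride from seconds to samples, and reflect padding can be assembled from DALI's slice/flip/cat operators:

def win_args(sample_rate, window_size_s, window_stride_s):
    # e.g. 16000 Hz * 0.02 s = 320-sample window, 160-sample hop
    win_len = int(sample_rate * window_size_s)
    win_hop = int(sample_rate * window_stride_s)
    return win_len, win_hop

def dali_reflect_pad_graph(x, x_len, pad_amount):
    # Mirror pad_amount samples (excluding the edge sample) at both ends.
    def flip_1d(x):
        x = fn.reshape(x, shape=(-1, 1, 1), layout="HWC")
        x = fn.flip(x, vertical=1)
        return fn.reshape(x, shape=(-1,), layout="t")

    pad_start = flip_1d(fn.slice(x, 1, pad_amount, axes=(0,)))
    pad_end = flip_1d(fn.slice(x, x_len - pad_amount - 1, pad_amount, axes=(0,)))
    return fn.cat(pad_start, x, pad_end, axis=0)

Similarly, `torch_windows` is presumably a dict mapping window names such as "hann" to torch window constructors (e.g. torch.hann_window).
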
Example #5
def mel_filter_pipeline(source):
    data = fn.external_source(source=source)
    spectrum = fn.spectrogram(data, nfft=60, window_length=50, window_step=25)
    processed = fn.mel_filter_bank(spectrum)
    return processed
Example #6
def dali_asr_pipeline(train_pipeline,  # True if training, False if validation
                      file_root,
                      file_list,
                      sample_rate,
                      silence_threshold,
                      resample_range,
                      discrete_resample_range,
                      window_size,
                      window_stride,
                      nfeatures,
                      nfft,
                      frame_splicing_factor,
                      dither_coeff,
                      pad_align,
                      preemph_coeff,
                      do_spectrogram_masking=False,
                      cutouts_generator=None,
                      shard_id=0,
                      n_shards=1,
                      preprocessing_device="gpu"):
    do_remove_silence = silence_threshold is not None

    def _div_ceil(dividend, divisor):
        return (dividend + (divisor - 1)) // divisor

    encoded, label = fn.readers.file(
        device="cpu", name="file_reader", file_root=file_root,
        file_list=file_list, shard_id=shard_id, num_shards=n_shards,
        shuffle_after_epoch=train_pipeline)

    speed_perturbation_coeffs = None
    if resample_range is not None:
        if discrete_resample_range:
            values = [resample_range[0], 1.0, resample_range[1]]
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          values=values)
        else:
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          range=resample_range)

    if train_pipeline and speed_perturbation_coeffs is not None:
        dec_sample_rate_arg = speed_perturbation_coeffs * sample_rate
    elif resample_range is None:
        dec_sample_rate_arg = sample_rate
    else:
        dec_sample_rate_arg = None

    audio, _ = fn.decoders.audio(encoded, sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT, downmix=True)
    if do_remove_silence:
        begin, length = fn.nonsilent_region(audio, cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at DataLayer stage

    if preprocessing_device == "gpu":
        audio = audio.gpu()

    if dither_coeff != 0.:
        audio = audio + fn.random.normal(audio) * dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio, nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)

    mel_spec = fn.mel_filter_bank(spec, sample_rate=sample_rate,
                                  nfilter=nfeatures, normalize=True)

    log_features = fn.to_decibels(mel_spec, multiplier=np.log(10),
                                  reference=1.0, cutoff_db=math.log(1e-20))

    log_features_len = fn.shapes(log_features)
    if frame_splicing_factor != 1:
        log_features_len = _div_ceil(log_features_len, frame_splicing_factor)

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features, axes=[1], fill_value=0, align=pad_align)

    if train_pipeline and do_spectrogram_masking:
        anchors, shapes = fn.external_source(source=cutouts_generator,
                                             num_outputs=2, cycle=True)
        log_features = fn.erase(log_features, anchor=anchors, shape=shapes,
                                axes=[0, 1], fill_value=0,
                                normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map`
    # in DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()
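
The closing comment refers to DALI's PyTorch plugin. A hedged sketch of the iterator invocation it has in mind, where `pipe` is a built pipeline created from this function and the names in `output_map` are illustrative:

from nvidia.dali.plugin.pytorch import DALIGenericIterator

data_iter = DALIGenericIterator(pipelines=[pipe],
                                output_map=["audio", "label", "audio_len"],
                                reader_name="file_reader")
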
Example #7
def rnnt_train_pipe(files,
                    sample_rate,
                    pad_amount=0,
                    preemph_coeff=.97,
                    window_size=.02,
                    window_stride=.01,
                    window="hann",
                    nfeatures=64,
                    nfft=512,
                    frame_splicing_stack=1,
                    frame_splicing_subsample=1,
                    lowfreq=0.0,
                    highfreq=None,
                    normalize_type='per_feature',
                    device='cpu'):
    assert normalize_type == 'per_feature' or normalize_type == 'all_features'
    norm_axes = [1] if normalize_type == 'per_feature' else [0, 1]
    win_len, win_hop = win_args(sample_rate, window_size, window_stride)
    window_fn = torch_windows.get(window, None)
    window_fn_arg = window_fn(
        win_len, periodic=False).numpy().tolist() if window_fn else None

    data, _ = fn.readers.file(files=files,
                              device="cpu",
                              random_shuffle=False,
                              shard_id=0,
                              num_shards=1)
    audio, _ = fn.decoders.audio(data, dtype=types.FLOAT, downmix=True)

    audio_shape = fn.shapes(audio, dtype=types.INT32)
    orig_audio_len = fn.slice(audio_shape, 0, 1, axes=(0, ))

    if pad_amount > 0:
        audio_len = orig_audio_len + 2 * pad_amount
    else:
        audio_len = orig_audio_len

    spec_len = audio_len // win_hop + 1

    if device == 'gpu':
        audio = audio.gpu()

    if pad_amount > 0:
        padded_audio = dali_reflect_pad_graph(audio, orig_audio_len,
                                              pad_amount)
    else:
        padded_audio = audio

    preemph_audio = fn.preemphasis_filter(padded_audio,
                                          preemph_coeff=preemph_coeff,
                                          border='zero')
    spec = fn.spectrogram(preemph_audio,
                          nfft=nfft,
                          window_fn=window_fn_arg,
                          window_length=win_len,
                          window_step=win_hop,
                          center_windows=True,
                          reflect_padding=True)
    mel_spec = fn.mel_filter_bank(spec,
                                  sample_rate=sample_rate,
                                  nfilter=nfeatures,
                                  freq_low=lowfreq,
                                  freq_high=highfreq)
    log_features = fn.to_decibels(mel_spec + 1e-20,
                                  multiplier=np.log(10),
                                  reference=1.0,
                                  cutoff_db=-80)

    if frame_splicing_stack > 1 or frame_splicing_subsample > 1:
        log_features_spliced = dali_frame_splicing_graph(
            log_features,
            nfeatures,
            spec_len,
            stacking=frame_splicing_stack,
            subsampling=frame_splicing_subsample)
    else:
        log_features_spliced = log_features

    if normalize_type:
        norm_log_features = fn.normalize(log_features_spliced,
                                         axes=norm_axes,
                                         device=device,
                                         epsilon=4e-5,
                                         ddof=1)
    else:
        norm_log_features = log_features_spliced

    return (norm_log_features, log_features_spliced, log_features, mel_spec,
            spec, preemph_audio, padded_audio, audio)
Example #8
# Nested inside a class method: `self` and the bare names (file_root,
# nfft, pad_align, ...) are captured from the enclosing scope.
def dali_jasper_pipe():
    if is_triton_pipeline:
        assert not self.train, "Pipeline for Triton must be a validation pipeline"
        if torch.distributed.is_initialized():
            raise RuntimeError(
                "You're creating a Triton pipeline in multi-process mode. "
                "Please use single-process mode.")
        encoded, label = fn.external_source(device="cpu",
                                            name="DALI_INPUT_0",
                                            no_copy=True)
    else:
        encoded, label = fn.readers.file(
            device="cpu",
            name="file_reader",
            file_root=file_root,
            file_list=file_list,
            shard_id=shard_id,
            num_shards=n_shards,
            shuffle_after_epoch=train_pipeline)

    speed_perturbation_coeffs = None
    if resample_range is not None:
        if discrete_resample_range:
            values = [self.resample_range[0], 1.0, self.resample_range[1]]
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          values=values)
        else:
            speed_perturbation_coeffs = fn.random.uniform(device="cpu",
                                                          range=resample_range)

    if self.train and speed_perturbation_coeffs is not None:
        dec_sample_rate_arg = speed_perturbation_coeffs * self.sample_rate
    elif resample_range is None:
        dec_sample_rate_arg = self.sample_rate
    else:
        dec_sample_rate_arg = None

    audio, _ = fn.decoders.audio(encoded,
                                 sample_rate=dec_sample_rate_arg,
                                 dtype=types.FLOAT,
                                 downmix=True)

    if self.do_remove_silence:
        begin, length = fn.nonsilent_region(audio,
                                            cutoff_db=silence_threshold)
        audio = fn.slice(audio, begin, length, axes=[0])

    # Max duration drop is performed at the DataLayer stage

    if self.preprocessing_device == "gpu":
        audio = audio.gpu()

    if self.dither_coeff != 0.:
        # Per-element dither, as in the dali_asr_pipeline example above
        audio = audio + fn.random.normal(audio) * self.dither_coeff

    audio = fn.preemphasis_filter(audio, preemph_coeff=preemph_coeff)

    spec = fn.spectrogram(audio,
                          nfft=nfft,
                          window_length=window_size * sample_rate,
                          window_step=window_stride * sample_rate)

    mel_spec = fn.mel_filter_bank(spec,
                                  sample_rate=sample_rate,
                                  nfilter=self.nfeatures,
                                  normalize=True)

    log_features = fn.to_decibels(mel_spec,
                                  multiplier=np.log(10),
                                  reference=1.0,
                                  cutoff_db=math.log(1e-20))

    log_features_len = fn.shapes(log_features)
    if self.frame_splicing_factor != 1:
        log_features_len = self._div_ceil(log_features_len,
                                          self.frame_splicing_factor)

    log_features = fn.normalize(log_features, axes=[1])
    log_features = fn.pad(log_features,
                          axes=[1],
                          fill_value=0,
                          align=pad_align)

    if self.train and self._do_spectrogram_masking():
        anchors, shapes = fn.external_source(
            source=self._cutouts_generator, num_outputs=2, cycle=True)
        log_features = fn.erase(log_features,
                                anchor=anchors,
                                shape=shapes,
                                axes=[0, 1],
                                fill_value=0,
                                normalized_anchor=True)

    # When modifying DALI pipeline returns, make sure you update `output_map`
    # in the DALIGenericIterator invocation
    return log_features.gpu(), label.gpu(), log_features_len.gpu()
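
`self._cutouts_generator` feeds the masking branch through `fn.external_source(..., num_outputs=2)`. A minimal sketch of such a generator (the batch size and cutout geometry are illustrative assumptions):

def make_cutouts_generator(batch_size, n_cutouts=2):
    # Yields (anchors, shapes): per sample, a flattened 1D array of
    # (time, frequency) pairs, one pair per cutout region; anchors are
    # normalized to [0, 1), shapes are in absolute bins.
    def gen():
        while True:
            anchors = [np.random.ranf(n_cutouts * 2).astype(np.float32)
                       for _ in range(batch_size)]
            shapes = [np.full(n_cutouts * 2, 10.0, dtype=np.float32)
                      for _ in range(batch_size)]
            yield anchors, shapes
    return gen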