예제 #1
0
def mfcc_sad(debugging):
  """Assemble the recipe that produces SAD-filtered, normalized 'mspec'.

  Steps are created in pipeline order: audio reading (sr=16000,
  sr_new=8000), pre-emphasis, conversion of 'path' into 'name' through
  `WAV_FILES`, STFT, power spectrum ('spec'), mel spectrogram ('mspec'),
  MFCCs ('mfcc'), two SAD steps, SAD application and normalization of
  'mspec', deletion of intermediate features and a final float16 cast.

  Args:
    debugging: forwarded unchanged as `debug` to `pp.make_pipeline`.

  Returns:
    Whatever `pp.make_pipeline` returns for these steps.
  """
  steps = []
  # ====== audio loading ====== #
  steps.extend([
      pp.speech.AudioReader(sr=16000, sr_new=8000),
      pp.speech.PreEmphasis(coeff=0.97),
      pp.base.Converter(converter=WAV_FILES,
                        input_name='path',
                        output_name='name'),
  ])
  # ====== STFT ====== #
  steps.append(
      pp.speech.STFTExtractor(frame_length=0.025,
                              step_length=0.01,
                              n_fft=512,
                              energy=False))
  # ====== spectrogram ====== #
  steps.extend([
      pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
      pp.speech.MelsSpecExtractor(n_mels=24,
                                  fmin=20,
                                  fmax=3700,
                                  output_name='mspec'),
      pp.speech.MFCCsExtractor(n_ceps=24,
                               remove_first_coef=True,
                               first_coef_energy=True,
                               output_name='mfcc'),
  ])
  # ====== SAD ====== #
  # NOTE(review): both SAD steps use output_name='sad'; presumably the GMM
  # result replaces the threshold-based one -- confirm.
  # NOTE(review): the STFT is built with energy=False while SADgmm reads
  # 'energy' -- confirm where that feature is produced.
  steps.extend([
      pp.speech.SADthreshold(energy_threshold=0.5,
                             smooth_window=5,
                             output_name='sad'),
      pp.speech.SADgmm(nb_mixture=3,
                       smooth_window=3,
                       input_name='energy',
                       output_name='sad'),
      pp.speech.ApplyingSAD(input_name=('mspec',)),
      pp.speech.AcousticNorm(input_name=('mspec',),
                             mean_var_norm=True,
                             windowed_mean_var_norm=True,
                             win_length=121),
  ])
  # ====== cleaning ====== #
  steps.extend([
      pp.base.DeleteFeatures(input_name=('stft', 'raw', 'spec', 'sad',
                                         'sad_threshold', 'energy')),
      pp.base.AsType(dtype='float16'),
  ])
  return pp.make_pipeline(steps=steps, debug=debugging)
예제 #2
0
def mfcc(augmentation=None):
    """Build the mel-spectrogram / MFCC extraction pipeline.

    Args:
        augmentation: when it is a string (`string_types`) the first step
            is `SREAugmentor(augmentation)`, otherwise `SREAudioReader()`.
            When it is None, 'sad' is computed by `SADthreshold` from
            'mfcc_energy'; for any other value `SADreader` is pointed at
            the 'mfcc' folder under `PATH_ACOUSTIC_FEATURES` and 'sad' is
            added to the features deleted at the end.

    Returns:
        The object returned by `pp.make_pipeline` for the assembled steps.
    """
    features_to_drop = ['stft', 'spec', 'raw', 'mfcc_energy', 'sad_threshold']
    if augmentation is not None:
        features_to_drop.append('sad')

    # ====== audio source ====== #
    if isinstance(augmentation, string_types):
        reader = SREAugmentor(augmentation)
    else:
        reader = SREAudioReader()
    steps = [
        reader,
        pp.speech.PreEmphasis(coeff=0.97, input_name='raw'),
        # ====== STFT ====== #
        pp.speech.STFTExtractor(frame_length=Config.FRAME_LENGTH,
                                step_length=Config.STEP_LENGTH,
                                n_fft=Config.NFFT, window=Config.WINDOW,
                                padding=False, energy=False),
        # ====== for x-vector ====== #
        pp.speech.PowerSpecExtractor(power=2.0,
                                     input_name='stft', output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=20, fmax=3700,
                                    input_name=('spec', 'sr'),
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=24,
                                 remove_first_coef=True,
                                 first_coef_energy=True,
                                 input_name='mspec', output_name='mfcc'),
    ]

    # ====== extract SAD ====== #
    if augmentation is None:
        sad_step = pp.speech.SADthreshold(energy_threshold=0.5,
                                          energy_mean_scale=0.5,
                                          frame_context=2,
                                          proportion_threshold=0.12,
                                          smooth_window=5,
                                          input_name='mfcc_energy',
                                          output_name='sad')
    else:
        sad_step = SADreader(
            ds_path=os.path.join(PATH_ACOUSTIC_FEATURES, 'mfcc'))
    steps.append(sad_step)

    # Unvoiced frames are dropped only in the EXTRACT_FEATURES state.
    steps.append(
        pp.speech.ApplyingSAD(
            input_name=('mspec', 'mfcc'),
            sad_name='sad',
            keep_unvoiced=not (CURRENT_STATE ==
                               SystemStates.EXTRACT_FEATURES)))

    steps += [
        # ====== normalization ====== #
        pp.speech.AcousticNorm(mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301,
                               input_name=('mspec', 'mfcc')),
        # ====== post processing ====== #
        pp.base.DeleteFeatures(input_name=features_to_drop),
        pp.base.AsType(dtype='float16'),
    ]
    return pp.make_pipeline(steps=steps)
예제 #3
0
def bnf(augmentation=None):
    """Placeholder for the bottleneck-feature recipe; always raises.

    The first statement raises `NotImplementedError`, so the draft recipe
    written below it is never executed and `augmentation` is never read.

    Args:
        augmentation: accepted but unused.

    Raises:
        NotImplementedError: on every call.
    """
    raise NotImplementedError
    # ------ unreachable draft of the recipe, kept for reference ------ #
    network = N.models.BNF_2048_MFCC40()
    steps = [
        SREAudioReader(),
        pp.speech.PreEmphasis(coeff=0.97),
        # ====== STFT ====== #
        pp.speech.STFTExtractor(frame_length=Config.FRAME_LENGTH,
                                step_length=Config.STEP_LENGTH,
                                n_fft=Config.NFFT, window=Config.WINDOW),
        # ====== SAD ====== #
        pp.base.RenameFeatures(input_name='stft_energy',
                               output_name='energy'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
        # ====== BNF ====== #
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=Config.NCEPS,
                                    fmin=Config.FMIN, fmax=Config.FMAX,
                                    input_name='spec', output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=Config.NCEPS,
                                 remove_first_coef=False,
                                 input_name='mspec', output_name='mfcc'),
        pp.base.AsType(dtype='float32', input_name='mfcc'),
        pp.speech.BNFExtractor(input_name='mfcc', output_name='bnf',
                               sad_name='sad', network=network,
                               remove_non_speech=True,
                               stack_context=10, pre_mvn=True,
                               batch_size=5218),
        # ====== normalization ====== #
        pp.speech.AcousticNorm(input_name=('bnf', ),
                               mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301),
        # ====== cleaning ====== #
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'energy',
                                           'sad_threshold', 'spec', 'mspec',
                                           'mfcc')),
        pp.base.AsType(dtype='float16'),
    ]
    return pp.make_pipeline(steps=steps)
예제 #4
0
def bnf_sad(debugging):
  """Assemble the recipe yielding SAD-filtered 'mspec', 'mfcc' and 'bnf'.

  A `N.models.BNF_2048_MFCC40` network is instantiated first and handed to
  the `BNFExtractor` step. 'sad' comes from `SADgmm` on the renamed STFT
  energy, is applied to 'mspec' and 'mfcc' by `ApplyingSAD`, and is also
  passed to `BNFExtractor` (remove_non_speech=True). The final steps
  normalize 'mspec', 'bnf' and 'mfcc', delete the intermediate features
  and cast to float16.

  Args:
    debugging: forwarded unchanged as `debug` to `pp.make_pipeline`.

  Returns:
    Whatever `pp.make_pipeline` returns for these steps.
  """
  network = N.models.BNF_2048_MFCC40()
  steps = []
  # ====== audio loading ====== #
  steps.extend([
      pp.speech.AudioReader(sr=16000,
                            sr_new=8000,
                            best_resample=True,
                            remove_dc=True),
      pp.speech.PreEmphasis(coeff=0.97),
      pp.base.Converter(converter=WAV_FILES,
                        input_name='path',
                        output_name='name'),
  ])
  # ====== STFT ====== #
  steps.append(
      pp.speech.STFTExtractor(frame_length=0.025,
                              step_length=0.010,
                              window='hamm',
                              n_fft=512))
  # ====== SAD ====== #
  steps.extend([
      pp.base.RenameFeatures(input_name='stft_energy',
                             output_name='energy'),
      pp.speech.SADgmm(nb_mixture=3,
                       smooth_window=3,
                       input_name='energy',
                       output_name='sad'),
  ])
  # ====== spectrogram ====== #
  steps.extend([
      pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
      pp.speech.MelsSpecExtractor(n_mels=24,
                                  fmin=100,
                                  fmax=4000,
                                  input_name='spec',
                                  output_name='mspec'),
      pp.speech.MFCCsExtractor(n_ceps=20,
                               remove_first_coef=True,
                               input_name='mspec',
                               output_name='mfcc'),
      pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
      pp.speech.ApplyingSAD(input_name=('mspec', 'mfcc')),
  ])
  # ====== BNF ====== #
  # A separate 40-band mel bank and 40 cepstra feed the network.
  steps.extend([
      pp.speech.MelsSpecExtractor(n_mels=40,
                                  fmin=100,
                                  fmax=4000,
                                  input_name='spec',
                                  output_name='mspec_bnf'),
      pp.speech.MFCCsExtractor(n_ceps=40,
                               remove_first_coef=False,
                               input_name='mspec_bnf',
                               output_name='mfcc_bnf'),
      pp.base.AsType(dtype='float32', input_name='mfcc_bnf'),
      pp.speech.BNFExtractor(input_name='mfcc_bnf',
                             output_name='bnf',
                             sad_name='sad',
                             network=network,
                             remove_non_speech=True,
                             stack_context=10,
                             pre_mvn=True,
                             batch_size=5218),
  ])
  # ====== normalization ====== #
  steps.append(
      pp.speech.AcousticNorm(input_name=('mspec', 'bnf', 'mfcc'),
                             mean_var_norm=True,
                             windowed_mean_var_norm=True,
                             win_length=301))
  # ====== cleaning ====== #
  steps.extend([
      pp.base.DeleteFeatures(input_name=('stft', 'raw', 'energy',
                                         'sad', 'sad_threshold',
                                         'spec', 'mspec_bnf', 'mfcc_bnf')),
      pp.base.AsType(dtype='float16'),
  ])
  return pp.make_pipeline(steps=steps, debug=debugging)
예제 #5
0
def mfcc_sad(debugging):
    """Create the pipeline that outputs SAD-filtered, normalized 'mspec'.

    The pipeline is the concatenation of four stage lists, built in this
    order: loading (reader, pre-emphasis, 'path' -> 'name' conversion via
    `WAV_FILES`), spectral analysis (STFT, 'spec', 'mspec', 'mfcc'), speech
    activity detection with normalization of 'mspec', and cleanup (feature
    deletion followed by a float16 cast).

    Args:
        debugging: passed through as the `debug` argument of
            `pp.make_pipeline`.

    Returns:
        The result of `pp.make_pipeline`.
    """
    # ====== audio loading ====== #
    loading = [
        pp.speech.AudioReader(sr=16000, sr_new=8000),
        pp.speech.PreEmphasis(coeff=0.97),
        pp.base.Converter(converter=WAV_FILES,
                          input_name='path', output_name='name'),
    ]
    # ====== STFT and spectrogram ====== #
    spectral = [
        pp.speech.STFTExtractor(frame_length=0.025, step_length=0.01,
                                n_fft=512, energy=False),
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=20, fmax=3700,
                                    output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=24, remove_first_coef=True,
                                 first_coef_energy=True,
                                 output_name='mfcc'),
    ]
    # ====== SAD ====== #
    # NOTE(review): SADthreshold and SADgmm both write to 'sad'; presumably
    # the later step wins -- confirm. SADgmm reads 'energy' although the
    # STFT step is created with energy=False -- confirm its origin.
    activity = [
        pp.speech.SADthreshold(energy_threshold=0.5, smooth_window=5,
                               output_name='sad'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
        pp.speech.ApplyingSAD(input_name=('mspec', )),
        pp.speech.AcousticNorm(input_name=('mspec', ), mean_var_norm=True,
                               windowed_mean_var_norm=True, win_length=121),
    ]
    # ====== cleaning ====== #
    cleanup = [
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'spec', 'sad',
                                           'sad_threshold', 'energy')),
        pp.base.AsType(dtype='float16'),
    ]
    return pp.make_pipeline(steps=loading + spectral + activity + cleanup,
                            debug=debugging)
예제 #6
0
def mfcc(augmentation=None):
  """Create the pipeline extracting normalized 'mspec' and 'mfcc'.

  Args:
    augmentation: a string (`string_types`) selects
      `SREAugmentor(augmentation)` as the first step; anything else selects
      `SREAudioReader()`. None means 'sad' is computed with `SADthreshold`
      from 'mfcc_energy'; any other value means 'sad' is loaded by
      `SADreader` from the 'mfcc' folder under `PATH_ACOUSTIC_FEATURES` and
      is deleted together with the other intermediate features.

  Returns:
    The object returned by `pp.make_pipeline`.
  """
  has_augmentation = augmentation is not None
  dropped = ['stft', 'spec', 'raw', 'mfcc_energy', 'sad_threshold']
  if has_augmentation:
    dropped.append('sad')

  # ====== audio source ====== #
  if isinstance(augmentation, string_types):
    source = SREAugmentor(augmentation)
  else:
    source = SREAudioReader()
  steps = [source]
  steps.append(pp.speech.PreEmphasis(coeff=0.97, input_name='raw'))
  # ====== STFT ====== #
  steps.append(
      pp.speech.STFTExtractor(frame_length=Config.FRAME_LENGTH,
                              step_length=Config.STEP_LENGTH,
                              n_fft=Config.NFFT,
                              window=Config.WINDOW,
                              padding=False,
                              energy=False))
  # ====== for x-vector ====== #
  steps += [
      pp.speech.PowerSpecExtractor(power=2.0,
                                   input_name='stft',
                                   output_name='spec'),
      pp.speech.MelsSpecExtractor(n_mels=24,
                                  fmin=20,
                                  fmax=3700,
                                  input_name=('spec', 'sr'),
                                  output_name='mspec'),
      pp.speech.MFCCsExtractor(n_ceps=24,
                               remove_first_coef=True,
                               first_coef_energy=True,
                               input_name='mspec',
                               output_name='mfcc'),
  ]
  # ====== extract SAD ====== #
  if has_augmentation:
    steps.append(
        SADreader(ds_path=os.path.join(PATH_ACOUSTIC_FEATURES, 'mfcc')))
  else:
    steps.append(
        pp.speech.SADthreshold(energy_threshold=0.5,
                               energy_mean_scale=0.5,
                               frame_context=2,
                               proportion_threshold=0.12,
                               smooth_window=5,
                               input_name='mfcc_energy',
                               output_name='sad'))
  # Unvoiced frames are removed only while in the EXTRACT_FEATURES state.
  keep_unvoiced = not (CURRENT_STATE == SystemStates.EXTRACT_FEATURES)
  steps.append(
      pp.speech.ApplyingSAD(input_name=('mspec', 'mfcc'),
                            sad_name='sad',
                            keep_unvoiced=keep_unvoiced))
  # ====== normalization ====== #
  steps.append(
      pp.speech.AcousticNorm(mean_var_norm=True,
                             windowed_mean_var_norm=True,
                             win_length=301,
                             input_name=('mspec', 'mfcc')))
  # ====== post processing ====== #
  steps += [
      pp.base.DeleteFeatures(input_name=dropped),
      pp.base.AsType(dtype='float16'),
  ]
  return pp.make_pipeline(steps=steps)
예제 #7
0
def bnf(augmentation=None):
  """Stub of the bottleneck-feature recipe: raises on every call.

  `NotImplementedError` is raised before anything else happens, which makes
  the recipe draft underneath unreachable; `augmentation` is never used.

  Args:
    augmentation: accepted but ignored.

  Raises:
    NotImplementedError: always.
  """
  raise NotImplementedError
  # ------ unreachable draft of the recipe, kept for reference ------ #
  network = N.models.BNF_2048_MFCC40()
  # ====== audio loading and STFT ====== #
  frontend = [
      SREAudioReader(),
      pp.speech.PreEmphasis(coeff=0.97),
      pp.speech.STFTExtractor(frame_length=Config.FRAME_LENGTH,
                              step_length=Config.STEP_LENGTH,
                              n_fft=Config.NFFT,
                              window=Config.WINDOW),
  ]
  # ====== SAD ====== #
  activity = [
      pp.base.RenameFeatures(input_name='stft_energy',
                             output_name='energy'),
      pp.speech.SADgmm(nb_mixture=3,
                       smooth_window=3,
                       input_name='energy',
                       output_name='sad'),
  ]
  # ====== BNF ====== #
  bottleneck = [
      pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
      pp.speech.MelsSpecExtractor(n_mels=Config.NCEPS,
                                  fmin=Config.FMIN,
                                  fmax=Config.FMAX,
                                  input_name='spec',
                                  output_name='mspec'),
      pp.speech.MFCCsExtractor(n_ceps=Config.NCEPS,
                               remove_first_coef=False,
                               input_name='mspec',
                               output_name='mfcc'),
      pp.base.AsType(dtype='float32', input_name='mfcc'),
      pp.speech.BNFExtractor(input_name='mfcc',
                             output_name='bnf',
                             sad_name='sad',
                             network=network,
                             remove_non_speech=True,
                             stack_context=10,
                             pre_mvn=True,
                             batch_size=5218),
  ]
  # ====== normalization and cleaning ====== #
  finishing = [
      pp.speech.AcousticNorm(input_name=('bnf',),
                             mean_var_norm=True,
                             windowed_mean_var_norm=True,
                             win_length=301),
      pp.base.DeleteFeatures(input_name=('stft', 'raw', 'energy',
                                         'sad_threshold',
                                         'spec', 'mspec', 'mfcc')),
      pp.base.AsType(dtype='float16'),
  ]
  return pp.make_pipeline(steps=frontend + activity + bottleneck + finishing)
예제 #8
0
 # Feature-extraction pipeline: reads audio (sr_new=8000), computes 'mspec'
 # and 'mfcc' (with deltas), derives 'sad' from the MFCC energy, normalizes
 # 'mspec' and 'mfcc', and finally casts the features to float16.
 extractors = pp.make_pipeline(
     steps=[
         pp.speech.AudioReader(sr_new=8000,
                               best_resample=True,
                               remove_dc=True),
         pp.speech.PreEmphasis(coeff=0.97),
         # 'name' is the file's base name up to its first '.'.
         pp.base.Converter(
             converter=lambda x: os.path.basename(x).split('.')[0],
             input_name='path',
             output_name='name'),
         # ====== STFT ====== #
         # energy=False: the energy used for SAD below is taken from the
         # MFCC step instead.
         pp.speech.STFTExtractor(frame_length=0.025,
                                 step_length=0.005,
                                 n_fft=512,
                                 window='hamm',
                                 energy=False),
         # ====== spectrogram ====== #
         pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
         pp.speech.MelsSpecExtractor(n_mels=24,
                                     fmin=64,
                                     fmax=4000,
                                     input_name=('spec', 'sr'),
                                     output_name='mspec'),
         # NOTE(review): first_coef_energy=True presumably makes this step
         # emit 'mfcc_energy', which is renamed below -- confirm.
         pp.speech.MFCCsExtractor(n_ceps=20,
                                  remove_first_coef=True,
                                  first_coef_energy=True,
                                  input_name='mspec',
                                  output_name='mfcc'),
         pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
         # ====== SAD ====== #
         pp.base.RenameFeatures(input_name='mfcc_energy',
                                output_name='energy'),
         pp.speech.SADthreshold(energy_threshold=0.55,
                                smooth_window=5,
                                input_name='energy',
                                output_name='sad'),
         # ====== normalization ====== #
         # Intermediate features are removed before normalizing.
         pp.base.DeleteFeatures(input_name=('stft', 'spec', 'sad_threshold')
                                ),
         pp.speech.AcousticNorm(mean_var_norm=True,
                                windowed_mean_var_norm=True,
                                input_name=('mspec', 'mfcc')),
         # ====== post processing ====== #
         pp.base.AsType(dtype='float16'),
     ],
     debug=False)
예제 #9
0
    mpi = MPI(jobs=cmds, func=mpi_fn, ncpu=cpu_count() - 1, batch=12)
    for i in mpi:
        prog.add(i)
# ===========================================================================
# Extract Acoustic features
# ===========================================================================
# Collect every path under `wav_path` whose last four characters are '.wav'.
jobs = get_all_files(wav_path, filter_func=lambda x: '.wav' == x[-4:])
# Sanity check: the number of files found must equal TOTAL_FILES.
assert len(jobs) == TOTAL_FILES
# ====== configuration ====== #
# Extraction runs when the output path does not exist yet or when `args.ds`
# is truthy.
if not os.path.exists(outpath) or args.ds:
    # Pipeline: read audio (sr_new=8000), derive 'name' from the file's base
    # name up to its first '.', and cast 'raw' to float16.
    extractors = pp.make_pipeline(steps=[
        pp.speech.AudioReader(sr=None,
                              sr_new=8000,
                              best_resample=True,
                              remove_dc=True),
        pp.base.Converter(
            converter=lambda x: os.path.basename(x).split('.')[0],
            input_name='path',
            output_name='name'),
        pp.base.AsType(dtype='float16', input_name='raw')
    ],
                                  debug=False)
    # Run the pipeline over all jobs, writing to `outpath` (override=True).
    processor = pp.FeatureProcessor(jobs=jobs,
                                    path=outpath,
                                    extractor=extractors,
                                    n_cache=0.08,
                                    ncpu=None,
                                    override=True)
    processor.run()
    pp.validate_features(processor,
                         path='/tmp/tidigits',
                         nb_samples=12,
예제 #10
0
# Feature pipeline producing 'mspec', 'mfcc' (with deltas), 'sdc', 'bnf',
# 'energy' and 'sad'. The intermediate 'stft', 'spec', 'mspec_bnf',
# 'mfcc_bnf' and 'sad_threshold' features are deleted before normalization,
# and all remaining features are cast to float16 at the end.
extractors = pp.make_pipeline(steps=[
    pp.speech.AudioReader(sr=FeatureConfigs.sr, dataset=audio),
    pp.speech.PreEmphasis(coeff=0.97),
    pp.speech.Dithering(),
    # ====== STFT ====== #
    pp.speech.STFTExtractor(frame_length=FeatureConfigs.frame_length,
                            step_length=FeatureConfigs.step_length,
                            n_fft=FeatureConfigs.n_fft,
                            window=FeatureConfigs.window),
    # ====== SAD ====== #
    pp.base.RenameFeatures(input_name='stft_energy', output_name='energy'),
    pp.speech.SADgmm(nb_mixture=3, nb_train_it=25,
                     input_name='energy', output_name='sad'),
    # ====== for x-vector ====== #
    pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
    pp.speech.MelsSpecExtractor(n_mels=24,
                                fmin=FeatureConfigs.fmin,
                                fmax=FeatureConfigs.fmax,
                                input_name=('spec', 'sr'),
                                output_name='mspec'),
    # ====== BNF ====== #
    pp.speech.MelsSpecExtractor(n_mels=FeatureConfigs.n_mels,
                                fmin=FeatureConfigs.fmin,
                                fmax=FeatureConfigs.fmax,
                                input_name=('spec', 'sr'),
                                output_name='mspec_bnf'),
    # The BNF cepstra are derived from the dedicated 'mspec_bnf' mel bank,
    # not from the 24-band 'mspec' used for the other features.
    pp.speech.MFCCsExtractor(n_ceps=FeatureConfigs.n_ceps,
                             remove_first_coef=False,
                             input_name='mspec_bnf',
                             output_name='mfcc_bnf'),
    # The network input is cast to float32 before BNF extraction.
    pp.base.AsType(dtype='float32', input_name='mfcc_bnf'),
    pp.speech.BNFExtractor(input_name='mfcc_bnf', output_name='bnf',
                           stack_context=10, pre_mvn=True,
                           sad_name='sad', remove_non_speech=False,
                           network=bnf_network,
                           batch_size=2048),
    # ====== MFCCs with deltas ====== #
    pp.speech.MFCCsExtractor(n_ceps=20, remove_first_coef=True,
                             input_name='mspec', output_name='mfcc'),
    pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
    # ====== SDC ====== #
    pp.speech.MFCCsExtractor(n_ceps=7, remove_first_coef=True,
                             input_name='mspec', output_name='sdc'),
    # RASTA filtering overwrites 'sdc' in place (same input/output name).
    pp.speech.RASTAfilter(rasta=True, input_name='sdc', output_name='sdc'),
    # ====== normalization ====== #
    pp.base.DeleteFeatures(input_name=('stft', 'spec',
                                       'mspec_bnf', 'mfcc_bnf',
                                       'sad_threshold')),
    pp.speech.AcousticNorm(mean_var_norm=True, windowed_mean_var_norm=True,
                           sad_name=None, ignore_sad_error=True,
                           input_name=('mspec', 'mfcc', 'sdc', 'bnf')),
    # ====== post processing ====== #
    pp.base.EqualizeShape0(input_name=('mspec', 'mfcc', 'sdc', 'bnf',
                                       'energy', 'sad')),
    pp.base.AsType(dtype='float16'),
], debug=args.debug)
예제 #11
0
def bnf_all(debugging):
    """Assemble the recipe yielding normalized 'mspec', 'mfcc' and 'bnf'.

    A `N.models.BNF_2048_MFCC40` network is created first and given to the
    `BNFExtractor` step, which is configured with remove_non_speech=False.
    'sad' is computed by `SADgmm` from the renamed STFT energy, passed to
    `BNFExtractor` as `sad_name`, and deleted with the other intermediate
    features before the final float16 cast.

    Args:
        debugging: forwarded unchanged as `debug` to `pp.make_pipeline`.

    Returns:
        Whatever `pp.make_pipeline` returns for these steps.
    """
    network = N.models.BNF_2048_MFCC40()
    # ====== audio loading and STFT ====== #
    frontend = [
        pp.speech.AudioReader(sr=16000, sr_new=8000,
                              best_resample=True, remove_dc=True),
        pp.speech.PreEmphasis(coeff=0.97),
        pp.base.Converter(converter=WAV_FILES,
                          input_name='path', output_name='name'),
        pp.speech.STFTExtractor(frame_length=0.025, step_length=0.010,
                                window='hamm', n_fft=512),
    ]
    # ====== SAD ====== #
    activity = [
        pp.base.RenameFeatures(input_name='stft_energy',
                               output_name='energy'),
        pp.speech.SADgmm(nb_mixture=3, smooth_window=3,
                         input_name='energy', output_name='sad'),
    ]
    # ====== spectrogram ====== #
    cepstral = [
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24, fmin=100, fmax=4000,
                                    input_name='spec', output_name='mspec'),
        pp.speech.MFCCsExtractor(n_ceps=20, remove_first_coef=True,
                                 input_name='mspec', output_name='mfcc'),
        pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
    ]
    # ====== BNF ====== #
    # A separate 40-band mel bank and 40 cepstra feed the network.
    bottleneck = [
        pp.speech.MelsSpecExtractor(n_mels=40, fmin=100, fmax=4000,
                                    input_name='spec',
                                    output_name='mspec_bnf'),
        pp.speech.MFCCsExtractor(n_ceps=40, remove_first_coef=False,
                                 input_name='mspec_bnf',
                                 output_name='mfcc_bnf'),
        pp.base.AsType(dtype='float32', input_name='mfcc_bnf'),
        pp.speech.BNFExtractor(input_name='mfcc_bnf', output_name='bnf',
                               sad_name='sad', network=network,
                               remove_non_speech=False,
                               stack_context=10, pre_mvn=True,
                               batch_size=1234),
    ]
    # ====== normalization and cleaning ====== #
    finishing = [
        pp.speech.AcousticNorm(input_name=('mspec', 'bnf', 'mfcc'),
                               mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               win_length=301),
        pp.base.DeleteFeatures(input_name=('stft', 'raw', 'energy', 'sad',
                                           'sad_threshold', 'spec',
                                           'mspec_bnf', 'mfcc_bnf')),
        pp.base.AsType(dtype='float16'),
    ]
    steps = frontend + activity + cepstral + bottleneck + finishing
    return pp.make_pipeline(steps=steps, debug=debugging)
예제 #12
0
        print(fmt % tuple(line))


# ===========================================================================
# More detail pipeline
# ===========================================================================
# Detailed pipeline: every extractor is listed explicitly, then the pipeline
# is applied to AUDIO_PATH and the resulting features are printed.
pp1 = make_pipeline(steps=[
    speech.AudioReader(),
    speech.STFTExtractor(frame_length=0.025, padding=False),
    # spectra analysis
    # Two spectra are kept: 'spec' (power=1.0) and 'pspec' (power=2.0);
    # 'db' is computed from 'pspec'.
    speech.PowerSpecExtractor(output_name='spec', power=1.0),
    speech.PowerSpecExtractor(output_name='pspec', power=2.0),
    speech.Power2Db(input_name='pspec', output_name='db'),
    # Cepstra analysis
    # NOTE(review): no output_name is given here; later steps read 'mspec'
    # and 'mfcc', so these are presumably the default output names -- confirm.
    speech.MelsSpecExtractor(n_mels=40, input_name=('pspec', 'sr')),
    speech.MFCCsExtractor(n_ceps=13, input_name='mspec'),
    # others
    speech.PitchExtractor(frame_length=0.025, f0=True),
    speech.SADgmm(input_name='stft_energy'),
    speech.RASTAfilter(input_name='mfcc', output_name='rasta'),
    base.EqualizeShape0(input_name=None),
    # Normalized copies are stored under new names ('*_norm').
    speech.AcousticNorm(input_name=('mfcc', 'mspec', 'spec'),
                        output_name=('mfcc_norm', 'mspec_norm', 'spec_norm')),
    speech.ApplyingSAD(input_name='mfcc', output_name='mfcc_sad'),
    base.StackFeatures(n_context=4, input_name='mfcc')
])
# Run the pipeline on AUDIO_PATH and print the extracted features.
formatted_printer(feats=pp1.transform(AUDIO_PATH))
print("///////////////////////////")
# ===========================================================================
# Fast pipeline
# ===========================================================================
pp2 = make_pipeline(steps=[
예제 #13
0
# Feature pipeline producing 'mspec', 'mfcc' (with deltas), 'sdc', 'bnf',
# 'energy' and 'sad'. The intermediate 'stft', 'spec', 'mspec_bnf',
# 'mfcc_bnf' and 'sad_threshold' features are deleted before normalization,
# and all remaining features are cast to float16 at the end.
extractors = pp.make_pipeline(
    steps=[
        pp.speech.AudioReader(sr=FeatureConfigs.sr, dataset=audio),
        pp.speech.PreEmphasis(coeff=0.97),
        pp.speech.Dithering(),
        # ====== STFT ====== #
        pp.speech.STFTExtractor(frame_length=FeatureConfigs.frame_length,
                                step_length=FeatureConfigs.step_length,
                                n_fft=FeatureConfigs.n_fft,
                                window=FeatureConfigs.window),
        # ====== SAD ====== #
        pp.base.RenameFeatures(input_name='stft_energy', output_name='energy'),
        pp.speech.SADgmm(nb_mixture=3,
                         nb_train_it=25,
                         input_name='energy',
                         output_name='sad'),
        # ====== for x-vector ====== #
        pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
        pp.speech.MelsSpecExtractor(n_mels=24,
                                    fmin=FeatureConfigs.fmin,
                                    fmax=FeatureConfigs.fmax,
                                    input_name=('spec', 'sr'),
                                    output_name='mspec'),
        # ====== BNF ====== #
        pp.speech.MelsSpecExtractor(n_mels=FeatureConfigs.n_mels,
                                    fmin=FeatureConfigs.fmin,
                                    fmax=FeatureConfigs.fmax,
                                    input_name=('spec', 'sr'),
                                    output_name='mspec_bnf'),
        # The BNF cepstra are derived from the dedicated 'mspec_bnf' mel
        # bank, not from the 24-band 'mspec' used for the other features.
        pp.speech.MFCCsExtractor(n_ceps=FeatureConfigs.n_ceps,
                                 remove_first_coef=False,
                                 input_name='mspec_bnf',
                                 output_name='mfcc_bnf'),
        # The network input is cast to float32 before BNF extraction.
        pp.base.AsType(dtype='float32', input_name='mfcc_bnf'),
        pp.speech.BNFExtractor(input_name='mfcc_bnf',
                               output_name='bnf',
                               stack_context=10,
                               pre_mvn=True,
                               sad_name='sad',
                               remove_non_speech=False,
                               network=bnf_network,
                               batch_size=2048),
        # ====== MFCCs with deltas ====== #
        pp.speech.MFCCsExtractor(n_ceps=20,
                                 remove_first_coef=True,
                                 input_name='mspec',
                                 output_name='mfcc'),
        pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
        # ====== SDC ====== #
        pp.speech.MFCCsExtractor(n_ceps=7,
                                 remove_first_coef=True,
                                 input_name='mspec',
                                 output_name='sdc'),
        # RASTA filtering overwrites 'sdc' in place (same input/output name).
        pp.speech.RASTAfilter(rasta=True, input_name='sdc', output_name='sdc'),
        # ====== normalization ====== #
        pp.base.DeleteFeatures(input_name=('stft', 'spec', 'mspec_bnf',
                                           'mfcc_bnf', 'sad_threshold')),
        pp.speech.AcousticNorm(mean_var_norm=True,
                               windowed_mean_var_norm=True,
                               sad_name=None,
                               ignore_sad_error=True,
                               input_name=('mspec', 'mfcc', 'sdc', 'bnf')),
        # ====== post processing ====== #
        pp.base.EqualizeShape0(input_name=('mspec', 'mfcc', 'sdc', 'bnf',
                                           'energy', 'sad')),
        pp.base.AsType(dtype='float16'),
    ],
    debug=args.debug)
예제 #14
0
  mpi = MPI(jobs=cmds, func=mpi_fn,
            ncpu=cpu_count() - 1, batch=12)
  for i in mpi:
    prog.add(i)
# ===========================================================================
# Extract Acoustic features
# ===========================================================================
# Collect every path under `wav_path` that ends with '.wav'.
jobs = get_all_files(wav_path,
                     filter_func=lambda x: x.endswith('.wav'))
# Sanity check: the number of files found must equal TOTAL_FILES.
assert len(jobs) == TOTAL_FILES
# ====== configuration ====== #
# Extraction runs when the output path does not exist yet or when `args.ds`
# is truthy.
if not os.path.exists(outpath) or args.ds:
  # Pipeline: read audio (sr_new=8000), derive 'name' from the file's base
  # name up to its first '.', and cast 'raw' to float16.
  extractors = pp.make_pipeline(steps=[
      pp.speech.AudioReader(sr=None, sr_new=8000, best_resample=True,
                            remove_dc=True),
      pp.base.Converter(converter=lambda x: os.path.basename(x).split('.')[0],
                        input_name='path', output_name='name'),
      pp.base.AsType(dtype='float16', input_name='raw')
  ], debug=False)
  processor = pp.FeatureProcessor(jobs=jobs, path=outpath, extractor=extractors,
                                  n_cache=0.08, ncpu=None, override=True)
  processor.run()
  pp.validate_features(processor, path='/tmp/tidigits', nb_samples=12,
                       override=True)
  # Store the README text next to the extracted features.
  with open(os.path.join(outpath, 'README'), 'w') as f:
    f.write(README)
# ====== check the preprocessed dataset ====== #
ds = F.Dataset(outpath, read_only=True)
try:
  print(ds)
  print(ctext(ds.md5, 'yellow'))
finally:
  # Close the dataset even when printing it or computing the MD5 fails.
  ds.close()
예제 #15
0
파일: fsdd_ivec.py 프로젝트: imito/odin
len(os.listdir(PATH_ACOUSTIC_FEATURES)) != 14 or \
bool(args.acous):
  # Feature-extraction pipeline: reads audio (sr_new=8000), computes 'mspec'
  # and 'mfcc' (with deltas), derives 'sad' from the MFCC energy, normalizes
  # 'mspec' and 'mfcc', and finally casts the features to float16.
  extractors = pp.make_pipeline(steps=[
      pp.speech.AudioReader(sr_new=8000, best_resample=True, remove_dc=True),
      pp.speech.PreEmphasis(coeff=0.97),
      # 'name' is the file's base name up to its first '.'.
      pp.base.Converter(converter=lambda x: os.path.basename(x).split('.')[0],
                        input_name='path', output_name='name'),
      # ====== STFT ====== #
      # energy=False: the energy used for SAD below is taken from the MFCC
      # step instead.
      pp.speech.STFTExtractor(frame_length=0.025, step_length=0.005,
                              n_fft=512, window='hamm', energy=False),
      # ====== spectrogram ====== #
      pp.speech.PowerSpecExtractor(power=2.0, output_name='spec'),
      pp.speech.MelsSpecExtractor(n_mels=24, fmin=64, fmax=4000,
                                  input_name=('spec', 'sr'), output_name='mspec'),
      # NOTE(review): first_coef_energy=True presumably makes this step emit
      # 'mfcc_energy', which is renamed below -- confirm.
      pp.speech.MFCCsExtractor(n_ceps=20,
                               remove_first_coef=True, first_coef_energy=True,
                               input_name='mspec', output_name='mfcc'),
      pp.base.DeltaExtractor(input_name='mfcc', order=(0, 1, 2)),
      # ====== SAD ====== #
      pp.base.RenameFeatures(input_name='mfcc_energy', output_name='energy'),
      pp.speech.SADthreshold(energy_threshold=0.55, smooth_window=5,
                             input_name='energy', output_name='sad'),
      # ====== normalization ====== #
      # Intermediate features are removed before normalizing.
      pp.base.DeleteFeatures(input_name=('stft', 'spec', 'sad_threshold')),
      pp.speech.AcousticNorm(mean_var_norm=True, windowed_mean_var_norm=True,
                             input_name=('mspec', 'mfcc')),
      # ====== post processing ====== #
      pp.base.AsType(dtype='float16'),
  ], debug=False)
  with np.warnings.catch_warnings():
    np.warnings.filterwarnings('ignore')