def synthesizef(fs, shift=0.005, dftlen=4096, ff0=None, flf0=None, fspec=None, fmcep=None, fpdd=None, fnm=None, fbndnm=None, fsyn=None, verbose=1):
    '''
    Call the synthesis from python using file inputs and outputs

    Every input file is read as raw float32 values. Except for the f0 files,
    the content is reshaped to one row per f0 value.

    fs      : Sampling frequency, passed to the synthesis and the wav writer.
    shift   : Time step between two f0 values; the time instants are built
              as shift*[0, 1, 2, ...].
    dftlen  : DFT length passed to sp.mcep2spec and sp.fwbnd2linbnd.
    ff0     : File of f0 values.
    flf0    : File of log f0 values (takes over ff0 if both are given).
              Only the strictly positive values are exponentiated, the other
              ones are left as read.
    fspec   : File of spectral values, used as is.
    fmcep   : File of mel-cepstral coefficients, converted using
              sp.mcep2spec (takes over fspec if both are given).
    fpdd    : File of PDD values, thresholded at 0.75 to build the noise mask.
    fnm     : File of noise mask values, used as is (takes over fpdd).
    fbndnm  : File of noise mask in bands, converted using sp.fwbnd2linbnd
              and thresholded at 0.5 (takes over fnm and fpdd).
    fsyn    : If given, the synthesized waveform is written in this file.
    verbose : Verbosity level, passed to the synthesis and the wav writer.

    Returns the waveform returned by synthesize(.).
    Raises a ValueError if no f0 input or no spectral input is given.
    '''

    # Check the mandatory inputs first, instead of failing later on with an
    # unbound local variable.
    if not (ff0 or flf0):
        raise ValueError('An f0 input file is necessary (ff0 or flf0)')
    if not (fspec or fmcep):
        raise ValueError('A spectral input file is necessary (fspec or fmcep)')

    if ff0:
        f0 = np.fromfile(ff0, dtype=np.float32)
    if flf0:
        f0 = np.fromfile(flf0, dtype=np.float32)
        voiced = f0 > 0
        f0[voiced] = np.exp(f0[voiced])
    nbframes = len(f0)
    ts = shift * np.arange(nbframes)
    f0s = np.vstack((ts, f0)).T

    def read_frames(fpath):
        # Raw float32 file, one row per f0 value
        return np.fromfile(fpath, dtype=np.float32).reshape((nbframes, -1))

    if fspec:
        SPEC = read_frames(fspec)
    if fmcep:
        SPEC = sp.mcep2spec(read_frames(fmcep), sp.bark_alpha(fs), dftlen)

    # The noise mask is optional: without any input for it, None is given to
    # the synthesis.
    NM = None
    if fpdd:
        PDD = read_frames(fpdd)
        thresh = 0.75 # DegottexG2015jhmpd
        NM = PDD.copy()
        NM[PDD < thresh] = 0.0
        # Use >= so that the values exactly at the threshold are also forced
        # to a binary value.
        NM[PDD >= thresh] = 1.0
    if fnm:
        NM = read_frames(fnm)
    if fbndnm:
        BNDNM = read_frames(fbndnm)
        NM = sp.fwbnd2linbnd(BNDNM, fs, dftlen)
        NM[NM <= 0.5] = 0.0
        NM[NM > 0.5] = 1.0

    syn = synthesize(fs, f0s, SPEC, NM=NM, verbose=verbose)

    if fsyn:
        sp.wavwrite(fsyn, syn, fs, norm_abs=True, verbose=verbose)

    return syn
def analysisf(fwav,
              shift=0.005,
              dftlen=4096,
              inf0txt_file=None, f0_min=60, f0_max=600, f0_file=None, f0_log=False,
              inf0bin_file=None,  # input f0 file in binary
              spec_file=None,
              spec_order=None,  # Mel-cepstral order for compressing the
                                # spectrum (typically 59; None: no compression)
              pdd_file=None,
              pdd_order=None,   # Mel-cepstral order for compressing PDD
                                # spectrum (typically 59; None: no compression)
              nm_file=None,
              nm_nbbnds=None,   # Number of mel-bands in the compressed mask
                                # (None: no compression)
              verbose=1):
    '''
    Run the analysis of a wav file and write the requested features in files.

    The features are computed only if the corresponding output file is given
    (the PDD is also computed if only the noise mask is asked). All the outputs
    are written as raw float32.

    fwav         : Input wav file, read with sp.wavread.
    shift        : Passed to analysis_f0postproc and analysis_spec.
    dftlen       : DFT length passed to the spectrum, PDD and noise mask steps.
    inf0txt_file : Optional input f0 file, read with np.loadtxt.
    inf0bin_file : Optional input f0 file in raw float32
                   (takes over inf0txt_file if both are given).
    f0_min, f0_max : Passed to analysis_f0postproc.
    f0_file      : Output file for the second column of the f0 matrix.
    f0_log       : If True, the log of the f0 values is written.
    verbose      : Verbosity level; >0 prints progress, >2 calls plot_features.

    Returns nothing.
    '''

    wav, fs, _ = sp.wavread(fwav)

    if verbose > 0:
        print('PM Analysis (dur={:.3f}s, fs={}Hz, f0 in [{},{}]Hz, shift={}s, dftlen={})'.format(len(wav) / float(fs), fs, f0_min, f0_max, shift, dftlen))

    f0s = None
    if inf0txt_file:
        f0s = np.loadtxt(inf0txt_file)
    # read input f0 file in float32 (ljuvela)
    if inf0bin_file:
        f0s = np.fromfile(inf0bin_file, dtype=np.float32)
    f0s = analysis_f0postproc(wav, fs, f0s, f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose)

    if f0_file:
        f0_values = f0s[:, 1]
        if verbose > 0:
            print(' Output F0 {} in: {}'.format(f0_values.shape, f0_file))
        if f0_log:
            f0_values = np.log(f0_values)
        f0_values.astype(np.float32).tofile(f0_file)

    SPEC = None
    if spec_file:
        SPEC = analysis_spec(wav, fs, f0s, shift=shift, dftlen=dftlen, verbose=verbose)
        if spec_order is not None:
            SPEC = sp.spec2mcep(SPEC, sp.bark_alpha(fs), order=spec_order)
        if verbose > 0:
            print(' Output Spectrogram size={} in: {}'.format(SPEC.shape, spec_file))
        SPEC.astype(np.float32).tofile(spec_file)

    PDD = None
    # The noise mask has to be computed from the uncompressed PDD, so keep a
    # reference to it that is not touched by the compression below.
    PDDlin = None
    if pdd_file or nm_file:
        PDD = analysis_pdd(wav, fs, f0s, dftlen=dftlen, verbose=verbose)
        PDDlin = PDD

    if pdd_file:
        if pdd_order is not None:
            # If asked, compress PDD
            # (work on a copy so that PDDlin is neither floored nor compressed)
            PDD = PDD.copy()
            PDD[PDD < 0.001] = 0.001  # From COVAREP
            PDD = sp.spec2mcep(PDD, sp.bark_alpha(fs), pdd_order)
        if verbose > 0:
            print(' Output PDD size={} in: {}'.format(PDD.shape, pdd_file))
        PDD.astype(np.float32).tofile(pdd_file)

    NM = None
    if nm_file:
        NM = analysis_nm(wav, fs, f0s, PDDlin, verbose=verbose)
        # If asked, compress NM
        if nm_nbbnds:
            # If asked, compress the noise mask using a number of mel bands
            NM = sp.linbnd2fwbnd(NM, fs, dftlen, nm_nbbnds)
        if verbose > 0:
            print(' Output Noise Mask size={} in: {}'.format(NM.shape, nm_file))
        NM.astype(np.float32).tofile(nm_file)

    if verbose > 2:
        plot_features(wav=wav, fs=fs, f0s=f0s, SPEC=SPEC, PDD=PDD, NM=NM)
def analysisf(fwav,
              shift=0.005,
              dftlen=4096,
              finf0txt=None, f0estimator='REAPER', f0_min=60, f0_max=600, ff0=None, f0_log=False,
              finf0bin=None,           # input f0 file in binary
              fspec=None,
              spec_mceporder=None,     # Mel-cepstral order for compressing the spectrogram (typically 59; None: no compression)
              spec_fwceporder=None,    # Frequency warped cepstral order (very similar to above, just faster and less precise) (typically 59; None: no compression)
              spec_nbfwbnds=None,      # Number of mel-bands in the compressed half log spectrogram (None: no compression)
              spec_nblinlogbnds=None,  # Number of linear-bands in the compressed half log spectrogram (None: no compression)
              fpdd=None,
              pdd_mceporder=None,      # Mel-cepstral order for compressing PDD spectrogram (typically 59; None: no compression)
              fnm=None,
              nm_nbfwbnds=None,        # Number of mel-bands in the compressed noise mask (None: no compression)
              preproc_fs=None,         # Resample the waveform
              preproc_hp=None,         # Cut-off of high-pass filter (e.g. 20Hz)
              verbose=1):
    '''
    Run the analysis of a wav file and write the requested features in files.

    The waveform is first optionally resampled (preproc_fs) and high-pass
    filtered (preproc_hp). The features are then computed only if the
    corresponding output file is given (the PDD is also computed if only the
    noise mask is asked). All the outputs are written as raw float32 and the
    directory of each output file is created if it does not exist.

    fwav        : Input wav file, read with sp.wavread.
    shift       : Passed to analysis_f0postproc and analysis_spec.
    dftlen      : DFT length passed to the spectrum, PDD and noise mask steps.
    finf0txt    : Optional input f0 file, read with np.loadtxt.
    finf0bin    : Optional input f0 file in raw float32
                  (takes over finf0txt if both are given).
    f0estimator, f0_min, f0_max : Passed to analysis_f0postproc.
    ff0         : Output file for the second column of the f0 matrix.
    f0_log      : If True, the log of the f0 values is written.
    fspec, fpdd, fnm : Output files for the spectrogram, the PDD and the
                  noise mask.
    verbose     : Verbosity level; >0 prints progress, >2 calls plot_features
                  with the features as they were before any compression.

    Returns nothing.
    '''

    wav, fs, _ = sp.wavread(fwav)

    if verbose > 0:
        print('PML Analysis (dur={:.3f}s, fs={}Hz, f0 in [{},{}]Hz, shift={}s, dftlen={})'.format(len(wav) / float(fs), fs, f0_min, f0_max, shift, dftlen))

    if (preproc_fs is not None) and (preproc_fs != fs):
        if verbose > 0:
            print(' Resampling the waveform (new fs={}Hz)'.format(preproc_fs))
        wav = sp.resample(wav, fs, preproc_fs, method=2, deterministic=True)
        fs = preproc_fs

    if preproc_hp is not None:
        if verbose > 0:
            print(' High-pass filter the waveform (cutt-off={}Hz)'.format(preproc_hp))
        # The cut-off given to butter(.) is normalized by the Nyquist
        # frequency, i.e. fs/2 (and not fs/0.5, which puts the cut-off 4 times
        # lower than the one asked).
        b, a = sig.butter(4, preproc_hp / (0.5 * fs), btype='high')
        wav = sig.filtfilt(b, a, wav)

    # Features before any compression, kept only for the plot at the end.
    # All of them are initialised so that the plot can be called whatever the
    # outputs asked.
    keep_ori = verbose > 2
    f0sori = None
    SPECori = None
    PDDori = None
    NMori = None

    f0s = None
    if finf0txt:
        f0s = np.loadtxt(finf0txt)
    # read input f0 file in float32 (ljuvela)
    if finf0bin:
        f0s = np.fromfile(finf0bin, dtype=np.float32)
    f0s = analysis_f0postproc(wav, fs, f0s, f0_min=f0_min, f0_max=f0_max, shift=shift, f0estimator=f0estimator, verbose=verbose)
    if keep_ori:
        f0sori = f0s.copy()

    if ff0:
        f0_values = f0s[:, 1]
        if verbose > 0:
            print(' Output F0 {} in: {}'.format(f0_values.shape, ff0))
        if f0_log:
            f0_values = np.log(f0_values)
        _analysisf_make_parent_dir(ff0)
        f0_values.astype(np.float32).tofile(ff0)

    SPEC = None
    if fspec:
        SPEC = analysis_spec(wav, fs, f0s, shift=shift, dftlen=dftlen, verbose=verbose)
        if keep_ori:
            SPECori = SPEC.copy()

        if spec_mceporder is not None:  # pragma: no cover
            # Cannot test this because it needs SPTK
            SPEC = sp.spec2mcep(SPEC, sp.bark_alpha(fs), order=spec_mceporder)
        if spec_fwceporder is not None:
            SPEC = sp.loghspec2fwcep(np.log(abs(SPEC)), fs, order=spec_fwceporder)
        if spec_nbfwbnds is not None:
            SPEC = sp.linbnd2fwbnd(np.log(abs(SPEC)), fs, dftlen, spec_nbfwbnds)
        if spec_nblinlogbnds is not None:
            SPEC = np.log(abs(SPEC))

        if verbose > 0:
            print(' Output Spectrogram size={} in: {}'.format(SPEC.shape, fspec))
        _analysisf_make_parent_dir(fspec)
        SPEC.astype(np.float32).tofile(fspec)

    # PDD stays uncompressed all along, because the noise mask below has to be
    # computed from it; the compressed version goes in a separate variable.
    PDD = None
    if fpdd or fnm:
        PDD = analysis_pdd(wav, fs, f0s, dftlen=dftlen, verbose=verbose)
        if keep_ori:
            PDDori = PDD.copy()

    if fpdd:
        PDDout = PDD
        if pdd_mceporder is not None:  # pragma: no cover
            # Cannot test this because it needs SPTK
            # If asked, compress PDD
            PDDout = PDD.copy()
            PDDout[PDDout < 0.001] = 0.001  # From COVAREP
            PDDout = sp.spec2mcep(PDDout, sp.bark_alpha(fs), pdd_mceporder)
        if verbose > 0:
            print(' Output PDD size={} in: {}'.format(PDDout.shape, fpdd))
        _analysisf_make_parent_dir(fpdd)
        PDDout.astype(np.float32).tofile(fpdd)

    NM = None
    if fnm:
        NM = analysis_nm(wav, fs, f0s, PDD, verbose=verbose)
        if keep_ori:
            NMori = NM.copy()
        # If asked, compress NM
        if nm_nbfwbnds:
            # If asked, compress the noise mask using a number of mel bands
            NM = sp.linbnd2fwbnd(NM, fs, dftlen, nm_nbfwbnds)
        if verbose > 0:
            print(' Output Noise Mask size={} in: {}'.format(NM.shape, fnm))
        _analysisf_make_parent_dir(fnm)
        NM.astype(np.float32).tofile(fnm)

    if verbose > 2:
        plot_features(wav=wav, fs=fs, f0s=f0sori, SPEC=SPECori, PDD=PDDori, NM=NMori)  # pragma: no cover


def _analysisf_make_parent_dir(fpath):
    '''
    Create the directory containing fpath (with its missing parents), if
    fpath has a directory part and this directory does not exist yet.
    '''
    dirpath = os.path.dirname(fpath)
    if dirpath == '' or os.path.isdir(dirpath):
        return
    try:
        os.makedirs(dirpath)
    except OSError:
        # The directory might have been created in the meantime
        # (e.g. by another process); fail only if it is still missing.
        if not os.path.isdir(dirpath):
            raise
def analysisf(fwav,
              shift=0.005,
              dftlen=4096,
              inf0txt_file=None, f0_min=60, f0_max=600, f0_file=None,
              spec_file=None,
              spec_order=None,  # Mel-cepstral order for compressing the
                                # spectrum (typically 59; None: no compression)
              pdd_file=None,
              pdd_order=None,   # Mel-cepstral order for compressing PDD
                                # spectrum (typically 59; None: no compression)
              nm_file=None,
              nm_nbbnds=None,   # Number of mel-bands in the compressed mask
                                # (None: no compression)
              verbose=1):
    '''
    Run the analysis of a wav file and write the requested features in files.

    The features are computed only if the corresponding output file is given
    (the PDD is also computed if only the noise mask is asked). All the outputs
    are written as raw float32.

    fwav         : Input wav file, read with sp.wavread.
    shift        : Passed to analysis_f0postproc and analysis_spec.
    dftlen       : DFT length passed to the spectrum, PDD and noise mask steps.
    inf0txt_file : Optional input f0 file, read with np.loadtxt.
    f0_min, f0_max : Passed to analysis_f0postproc.
    f0_file      : Output file for the second column of the f0 matrix.
    verbose      : Verbosity level; >0 prints progress, >2 calls plot_features.

    Returns nothing.
    '''

    wav, fs, _ = sp.wavread(fwav)

    if verbose > 0:
        print('PM Analysis (dur={:.3f}s, fs={}Hz, f0 in [{},{}]Hz, shift={}s, dftlen={})'.format(len(wav)/float(fs), fs, f0_min, f0_max, shift, dftlen))

    f0s = None
    if inf0txt_file:
        f0s = np.loadtxt(inf0txt_file)
    f0s = analysis_f0postproc(wav, fs, f0s, f0_min=f0_min, f0_max=f0_max, shift=shift, verbose=verbose)

    if f0_file:
        f0_values = f0s[:, 1]
        if verbose > 0:
            print(' Output F0 {} in: {}'.format(f0_values.shape, f0_file))
        f0_values.astype(np.float32).tofile(f0_file)

    SPEC = None
    if spec_file:
        SPEC = analysis_spec(wav, fs, f0s, shift=shift, dftlen=dftlen, verbose=verbose)
        if spec_order is not None:
            SPEC = sp.spec2mcep(SPEC, sp.bark_alpha(fs), order=spec_order)
        if verbose > 0:
            print(' Output Spectrogram size={} in: {}'.format(SPEC.shape, spec_file))
        SPEC.astype(np.float32).tofile(spec_file)

    PDD = None
    # The noise mask has to be computed from the uncompressed PDD, so keep a
    # reference to it that is not touched by the compression below.
    PDDlin = None
    if pdd_file or nm_file:
        PDD = analysis_pdd(wav, fs, f0s, dftlen=dftlen, verbose=verbose)
        PDDlin = PDD

    if pdd_file:
        if pdd_order is not None:
            # If asked, compress PDD
            # (work on a copy so that PDDlin is neither floored nor compressed)
            PDD = PDD.copy()
            PDD[PDD < 0.001] = 0.001  # From COVAREP
            PDD = sp.spec2mcep(PDD, sp.bark_alpha(fs), pdd_order)
        if verbose > 0:
            print(' Output PDD size={} in: {}'.format(PDD.shape, pdd_file))
        PDD.astype(np.float32).tofile(pdd_file)

    NM = None
    if nm_file:
        NM = analysis_nm(wav, fs, f0s, PDDlin, verbose=verbose)
        # If asked, compress NM
        if nm_nbbnds:
            # If asked, compress the noise mask using a number of mel bands
            NM = sp.linbnd2fwbnd(NM, fs, dftlen, nm_nbbnds)
            # Need to force to binary values because we don't use ambiguous values,
            # we use the binary version at synthesis time.
            NM[NM >= 0.5] = 1.0
            NM[NM < 0.5] = 0.0
        if verbose > 0:
            print(' Output Noise Mask size={} in: {}'.format(NM.shape, nm_file))
        NM.astype(np.float32).tofile(nm_file)

    if verbose > 2:
        plot_features(wav=wav, fs=fs, f0s=f0s, SPEC=SPEC, PDD=PDD, NM=NM)
def synthesizef(fs, shift=0.005, dftlen=4096, ff0=None, flf0=None,
                fspec=None, flspec=None, ffwlspec=None, ffwcep=None, fmcep=None,
                fpdd=None, fmpdd=None, fnm=None, ffwnm=None, nm_cont=False,
                fsyn=None, verbose=1):
    '''
    Call the synthesis from python using file inputs and outputs

    Every input file is read as raw float32 values. Except for the f0 files,
    the content is reshaped to one row per f0 value. When several files are
    given for the same feature, the last one in the lists below is used.

    fs       : Sampling frequency, passed to the synthesis and the wav writer.
    shift    : Time step between two f0 values; the time instants are built
               as shift*[0, 1, 2, ...].
    dftlen   : DFT length passed to the decompression functions.

    f0 inputs (at least one is necessary):
    ff0      : File of f0 values.
    flf0     : File of log f0 values. Only the strictly positive values are
               exponentiated, the other ones are left as read.

    Spectral inputs (at least one is necessary):
    fspec    : File of spectral values, used as is.
    flspec   : File of log spectral values (exponentiated).
    ffwlspec : File decompressed using sp.fwbnd2linbnd, then exponentiated.
    ffwcep   : File decompressed using sp.fwcep2loghspec, then exponentiated.
    fmcep    : File of mel-cepstral coefficients, converted using
               sp.mcep2spec.

    Noise mask inputs (optional; None is given to the synthesis otherwise):
    fpdd     : File of PDD values, thresholded at 0.75.
    fmpdd    : File of mel-cepstral coefficients of the PDD, converted using
               sp.mcep2spec and thresholded at 0.75.
    fnm      : File of noise mask values, used as is.
    ffwnm    : File decompressed using sp.fwbnd2linbnd.

    nm_cont  : Passed to synthesize(.).
    fsyn     : If given, the synthesized waveform is written in this file.
    verbose  : Verbosity level, passed to the synthesis and the wav writer.

    Returns the waveform returned by synthesize(.).
    Raises a ValueError if no f0 input or no spectral input is given.
    '''

    # Check the mandatory inputs first, instead of failing later on with an
    # unbound local variable.
    if not (ff0 or flf0):
        raise ValueError('An f0 input file is necessary (ff0 or flf0)')
    if not (fspec or flspec or ffwlspec or ffwcep or fmcep):
        raise ValueError('A spectral input file is necessary (fspec, flspec, ffwlspec, ffwcep or fmcep)')

    if ff0:
        f0 = np.fromfile(ff0, dtype=np.float32)
    if flf0:
        f0 = np.fromfile(flf0, dtype=np.float32)
        voiced = f0 > 0
        f0[voiced] = np.exp(f0[voiced])
    nbframes = len(f0)
    ts = shift * np.arange(nbframes)
    f0s = np.vstack((ts, f0)).T

    def read_frames(fpath):
        # Raw float32 file, one row per f0 value
        return np.fromfile(fpath, dtype=np.float32).reshape((nbframes, -1))

    if fspec:
        SPEC = read_frames(fspec)
    if flspec:
        SPEC = np.exp(read_frames(flspec))
    if ffwlspec:
        FWLSPEC = read_frames(ffwlspec)
        SPEC = np.exp(sp.fwbnd2linbnd(FWLSPEC, fs, dftlen, smooth=True))
    if ffwcep:
        FWCEP = read_frames(ffwcep)
        SPEC = np.exp(sp.fwcep2loghspec(FWCEP, fs, dftlen))
    if fmcep:  # pragma: no cover
        # Cannot test this because it needs SPTK
        MCEP = read_frames(fmcep)
        SPEC = sp.mcep2spec(MCEP, sp.bark_alpha(fs), dftlen)

    NM = None
    pdd_thresh = 0.75  # For this value, see:
    # G. Degottex and D. Erro, "A uniform phase representation for the harmonic model in speech synthesis applications," EURASIP, Journal on Audio, Speech, and Music Processing - Special Issue: Models of Speech - In Search of Better Representations, vol. 2014, iss. 1, p. 38, 2014.
    PDD = None
    if fpdd:
        PDD = read_frames(fpdd)
    if fmpdd:  # pragma: no cover
        # Cannot test this because it needs SPTK
        MPDD = read_frames(fmpdd)
        PDD = sp.mcep2spec(MPDD, sp.bark_alpha(fs), dftlen)
    if PDD is not None:
        NM = PDD.copy()
        NM[PDD < pdd_thresh] = 0.0
        # Use >= so that the values exactly at the threshold are also forced
        # to a binary value.
        NM[PDD >= pdd_thresh] = 1.0

    if fnm:
        NM = read_frames(fnm)
    if ffwnm:
        FWNM = read_frames(ffwnm)
        NM = sp.fwbnd2linbnd(FWNM, fs, dftlen)

    syn = synthesize(fs, f0s, SPEC, NM=NM, nm_cont=nm_cont, verbose=verbose)

    if fsyn:
        sp.wavwrite(fsyn, syn, fs, norm_max_ifneeded=True, verbose=verbose)

    return syn