def encode(a, pcm):
    """
    Encode a speech waveform.

    The encoding framers (frames and pitch) pad the frames so that the
    first frame is centered on sample zero.  This is consistent with
    STRAIGHT and SPTK (I hope!).  At least, it means the pitch can have
    longer frame lengths and still align with the OLA'd frames.

    Parameters:
        a: the waveform to encode.
        pcm: object providing seconds_to_period(); also handed to
            ssp.ACPitch.

    Returns:
        The tuple (ar, g, pitch, hnr): the two results of the selected
        ssp.AR* routine followed by the two results of ssp.ACPitch.

    Raises:
        ValueError: if the 'AR' parameter does not name a known method.

    Reads the module-level opt, framePeriod, lpOrder and r, and rebinds
    the module-level f to the framed (optionally pre-emphasised) signal.
    """
    if opt.ola:
        frameSize = pcm.seconds_to_period(0.025, 'atleast')  # 25ms frame size
    else:
        frameSize = framePeriod
    pitchSize = pcm.seconds_to_period(0.1, 'atmost')
    # A single pre-formatted string prints the same text on Python 2 and 3.
    print("Encoding with period %s size %s and pitch window %s"
          % (framePeriod, frameSize, pitchSize))

    # First the pitch as it's on the unaltered waveform.  The frame
    # should be long with no window.  1024 at 16 kHz is 64 ms.
    pf = ssp.Frame(a, size=pitchSize, period=framePeriod)
    pitch, hnr = ssp.ACPitch(pf, pcm)

    # Pre-emphasis
    pre = ssp.parameter("Pre", None)
    if pre is not None:
        a = ssp.PoleFilter(a, pre) / 5

    # Keep f around after the function so the decoder can do a
    # reference decoding on the real excitation.
    global f
    f = ssp.Frame(a, size=frameSize, period=framePeriod)

    # Build the window one sample too long and drop the last sample.
    aw = ssp.nuttall(frameSize+1)
    aw = np.delete(aw, -1)
    w = ssp.Window(f, aw)
    ac = ssp.Autocorrelation(w)

    lp = ssp.parameter('AR', 'levinson')
    if lp == 'levinson':
        ar, g = ssp.ARLevinson(ac, lpOrder[r])
    elif lp == 'ridge':
        ar, g = ssp.ARRidge(ac, lpOrder[r], 0.03)
    elif lp == 'lasso':
        ar, g = ssp.ARLasso(ac, lpOrder[r], 5)
    elif lp == 'sparse':
        # sparse and student work on the windowed frames, not on the
        # autocorrelation.
        ar, g = ssp.ARSparse(w, lpOrder[r], ssp.parameter('Gamma', 1.414))
    elif lp == 'student':
        ar, g = ssp.ARStudent(w, lpOrder[r], ssp.parameter('DoF', 50.0))
    else:
        # Without this the function would only fail later, at the return,
        # with an unhelpful UnboundLocalError.
        raise ValueError("Unknown AR method: %r" % (lp,))

    return (ar, g, pitch, hnr)
def get_pitch(gen_path, basefilename):
    """
    Estimate the pitch track of a wav file and store its log as .lf0.

    The file gen_path + basefilename + '.wav' is read and must be sampled
    at 16 kHz (asserted).  The pitch from ssp.ACPitch is extended with
    copies of its last value until it has as many entries as the
    autocorrelation of the windowed analysis frames.  Its natural log is
    written as raw float32 to gen_path + basefilename + '.lf0'.

    Parameters:
        gen_path: path prefix, concatenated directly with basefilename.
        basefilename: file name without extension.

    Returns:
        The (possibly extended) pitch array, before taking the log.

    Reads the module-level framePeriod, loPitch and hiPitch.
    """
    stem = gen_path + basefilename
    rate, samples = io_wav.read(stem + '.wav')
    assert rate == 16000

    pcm = ssp.PulseCodeModulation(rate)
    analysis_len = pcm.seconds_to_period(0.025, 'atleast')  # 25ms frames
    pitch_len = pcm.seconds_to_period(0.1, 'atmost')        # 100ms pitch window

    # Initial pitch estimate, taken on the unfiltered signal.
    pitch_frames = ssp.Frame(samples, size=pitch_len, period=framePeriod)
    pitch, _ = ssp.ACPitch(pitch_frames, pcm, loPitch, hiPitch)

    # Optional pre-emphasis.
    pre = ssp.parameter("Pre", None)
    if pre is not None:
        samples = ssp.PoleFilter(samples, pre) / 5

    # Frame, window and autocorrelate; only the number of resulting rows
    # is used below.
    frames = ssp.Frame(samples, size=analysis_len, period=framePeriod)
    taper = np.delete(ssp.nuttall(analysis_len + 1), -1)
    ac = ssp.Autocorrelation(ssp.Window(frames, taper))

    # Extend the pitch track with its final value when it is the shorter.
    shortfall = len(ac) - len(pitch)
    if shortfall > 0:
        pitch = np.hstack((pitch, np.ones(shortfall) * pitch[-1]))

    # Save log-pitch as binary float32.
    np.log(pitch).astype('float32').tofile(stem + '.lf0')
    return pitch
global ti now = time.clock() elapsed = now-ti ti = now print(func, elapsed) import ssp import numpy as np import matplotlib.pyplot as plt lap("Import") # Load and do basic AR to reconstruct the spectrum pcm = ssp.PulseCodeModulation() wav = pcm.WavSource(file) print("File:", file, "rate:", pcm.rate, "size:", wav.size) if ssp.parameter("ZF", 0) == 1: wav = ssp.ZeroFilter(wav) f = ssp.Frame(wav, size=256, period=128) f = ssp.Window(f, np.hanning(256)) print("frame:", f.shape[0], "x", f.shape[1]) lap("Frame") e = ssp.Energy(f) p = ssp.Periodogram(f) lap("Periodogram") order = pcm.speech_ar_order() a = ssp.Autocorrelation(f) a, g = ssp.ARLevinson(a, order) lap("Levinson") ls = ssp.ARSpectrum(a, g, nSpec=128) lap("Spectrum")
global ti now = time.clock() elapsed = now-ti ti = now print func, elapsed import ssp import numpy as np import matplotlib.pyplot as plt lap("Import") # Load and do basic AR to reconstruct the spectrum pcm = ssp.PulseCodeModulation() wav = pcm.WavSource(file) print "File:", file, "rate:", pcm.rate, "size:", wav.size if ssp.parameter("ZF", 0) == 1: wav = ssp.ZeroFilter(wav) f = ssp.Frame(wav, size=256, period=128) f = ssp.Window(f, np.hanning(256)) print "frame:", f.shape[0], "x", f.shape[1] lap("Frame") e = ssp.Energy(f) p = ssp.Periodogram(f) lap("Periodogram") order = pcm.speech_ar_order() a = ssp.Autocorrelation(f) a, g = ssp.ARLevinson(a, order) lap("Levinson") ls = ssp.ARSpectrum(a, g, nSpec=128) lap("Spectrum")
def decode(tuple):
    """
    Decode a speech waveform.

    An excitation is built frame by frame according to opt.glottal.
    Unless opt.excitation is set, it is then passed through
    ssp.ARResynthesis with the supplied AR parameters and either
    overlap-added (opt.ola) or flattened.

    Parameters:
        tuple: the 4-sequence (ark, g, pitch, hnr); all four must have
            one entry per frame (asserted).  When opt.glottal is 'cepgm'
            and any of opt.encode, opt.decode or opt.pitch is set, the
            last two columns of ark are read as the glottal pole angle
            and the log of its magnitude, and the remaining columns as
            the AR coefficients.

    Returns:
        The synthesised signal multiplied by the "Gain" parameter.

    Raises:
        ValueError: if opt.glottal does not name a known excitation.

    Reads the module-level opt, framePeriod, r, pcm and lpOrder, plus f
    for the 'ar', 'ceplf' and (when the poles are not in ark) 'cepgm'
    excitations.
    """
    (ark, g, pitch, hnr) = tuple
    print("Frame padding:", opt.padding)

    nFrames = len(ark)
    assert(len(g) == nFrames)
    assert(len(pitch) == nFrames)
    assert(len(hnr) == nFrames)

    # The original framer padded the ends so the number of samples to
    # synthesise is a bit less than you might think
    if opt.ola:
        frameSize = framePeriod * 2
        nSamples = framePeriod * (nFrames-1)
    else:
        frameSize = framePeriod
        nSamples = frameSize * (nFrames-1)

    ex = opt.glottal

    if opt.glottal == 'cepgm' and (opt.encode or opt.decode or opt.pitch):
        # The glottal pole pair travels in the last two columns of ark.
        order = ark.shape[-1] - 2
        ar = ark[:, 0:order]
        theta = ark[:, -2]
        magni = np.exp(ark[:, -1])
    else:
        ar = ark

    # Use the original AR residual; it should be a very good reconstruction.
    if ex == 'ar':
        e = ssp.ARExcitation(f, ar, g)

    # Just noise.  This is effectively a whisper synthesis.
    elif ex == 'noise':
        e = np.random.normal(size=(nFrames, frameSize))

    # Just harmonics, and with a fixed F0.  This is the classic robot
    # synthesis.
    elif ex == 'robot':
        ew = np.zeros(nSamples)
        period = int(1.0 / 200 * r)
        for i in range(0, len(ew), period):
            ew[i] = period
        e = ssp.Frame(ew, size=frameSize, period=framePeriod)

    # Synthesise harmonics plus noise in the ratio suggested by the HNR.
    elif ex == 'synth':
        # Harmonic part
        mperiod = int(1.0 / np.mean(pitch) * r)
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        pr, pg = ssp.pulse_response(gm, pcm, period=mperiod, order=lpOrder[r])
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        # Lay pulses end to end; each period comes from the pitch of the
        # frame the write position currently falls in.
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+period] = gm.pulse(period, pcm) * weight
            i += period
            frame = i // framePeriod
        h = ssp.ARExcitation(h, pr, 1.0)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod, pad=opt.padding)

        # Noise part
        n = np.random.normal(size=nSamples)
        n = ssp.ZeroFilter(n, 1.0)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain

    # Like harmonics plus noise, but with explicit sinusoids instead of time
    # domain impulses.
    elif ex == 'sine':
        order = 20
        sine = ssp.Harmonics(r, order)
        h = np.zeros(nSamples)
        for i in range(0, len(h)-framePeriod, framePeriod):
            frame = i // framePeriod
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+framePeriod] = (
                sine.sample(pitch[frame], framePeriod) * weight
            )
        fh = ssp.Frame(h, size=frameSize, period=framePeriod, pad=opt.padding)
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))
        e = fn + fh*10

    # High order linear prediction.  Synthesise the harmonics using noise to
    # excite a high order polynomial with roots resembling harmonics.
    elif ex == 'holp':
        # Some noise
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)

        # Use the noise to excite a high order AR model
        fh = np.ndarray(fn.shape)
        for i in range(len(fn)):
            hoar = ssp.ARHarmonicPoly(pitch[i], r, 0.7)
            fh[i] = ssp.ARResynthesis(fn[i], hoar, 1.0 / linalg.norm(hoar)**2)
            print(i, pitch[i], linalg.norm(hoar), np.min(fh[i]), np.max(fh[i]))
            print(' ', np.min(hoar), np.max(hoar))

        # Weight the noise as for the other methods
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))
        # Only the harmonic part is used; the weighted noise is not mixed in.
        e = fh

    # Shaped excitation.  The pulses are shaped by a filter to have a
    # rolloff, then added to the noise.  The resulting signal is
    # flattened using AR.
    elif ex == 'shaped':
        # Harmonic part
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        gm.angle = pcm.hertz_to_radians(np.mean(pitch)*0.5)
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+period] = gm.pulse(period, pcm) * weight
            i += period
            frame = i // framePeriod

        # Filter to mimic the glottal pulse
        hfilt = ssp.parameter("HFilt", None)
        hpole1 = ssp.parameter("HPole1", 0.98)
        hpole2 = ssp.parameter("HPole2", 0.8)  # read, but no filter here uses it
        angle = pcm.hertz_to_radians(np.mean(pitch)) * ssp.parameter("Angle", 1.0)
        if hfilt == 'pp':
            h = ssp.ZeroFilter(h, 1.0)
            h = ssp.PolePairFilter(h, hpole1, angle)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        npole = ssp.parameter("NPole", None)
        nf = ssp.parameter("NoiseFreq", 4000)
        if npole is not None:
            n = ssp.PolePairFilter(n, npole, pcm.hertz_to_radians(nf))
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        # Combination
        assert(len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain
        # Flatten each frame with its own AR fit: the fit is made on the
        # windowed frame, the inverse filter is applied to the raw one.
        hnw = np.hanning(frameSize)
        for i in range(len(e)):
            ep = ssp.Window(e[i], hnw)
            eac = ssp.Autocorrelation(ep)
            ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
            e[i] = ssp.ARExcitation(e[i], ea, eg)

    elif ex == 'ceplf':
        omega, alpha = ssp.glottal_pole_lf(
            f, pcm, pitch, hnr, visual=(opt.graphic == "ceplf"))
        epsilon = ssp.parameter("Epsilon", 5000.0)
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            pu = np.zeros((period))
            T0 = pcm.period_to_seconds(period)
            print(T0)
            Te = ssp.lf_te(T0, alpha[frame], omega[frame], epsilon)
            # A falsy Te leaves this period silent.
            if Te:
                pu = ssp.pulse_lf(pu, T0, Te, alpha[frame], omega[frame], epsilon)
            h[i:i+period] = pu * weight
            i += period
            frame = i // framePeriod
        fh = ssp.Frame(h, size=frameSize, period=framePeriod, pad=opt.padding)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        # Combination
        assert(len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain
        hnw = np.hanning(frameSize)
        for i in range(len(e)):
            ep = ssp.Window(e[i], hnw)
            eac = ssp.Autocorrelation(ep)
            ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
            e[i] = ssp.ARExcitation(e[i], ea, eg)

    elif ex == 'cepgm':
        # Infer the unstable poles via complex cepstrum, then build an explicit
        # glottal model.
        if not (opt.encode or opt.decode or opt.pitch):
            theta, magni = ssp.glottal_pole_gm(
                f, pcm, pitch, hnr, visual=(opt.graphic == "cepgm"))

        # Unit impulses at the pitch marks.
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            h[i] = 1
            i += period
            frame = i // framePeriod
        fh = ssp.Frame(h, size=frameSize, period=framePeriod, pad=opt.padding)

        gl = ssp.MinPhaseGlottis()
        for i in range(len(fh)):
            # This is minimum phase; the glotter will invert if required.
            # Index the pole pair with i, the frame being shaped; the
            # variable `frame` is a leftover from the impulse loop above
            # and would apply one fixed pole pair to every frame.
            gl.setpolepair(np.abs(magni[i]), theta[i])
            fh[i] = gl.glotter(fh[i])
            # Normalise to a norm of sqrt(frame length), skipping frames
            # that are effectively silent.
            if linalg.norm(fh[i]) > 1e-6:
                fh[i] *= np.sqrt(len(fh[i])) / linalg.norm(fh[i])
            weight = np.sqrt(hnr[i] / (hnr[i] + 1))
            fh[i] *= weight

        if (opt.graphic == "h"):
            fig = ssp.Figure(1, 1)
            hPlot = fig.subplot()
            hPlot.plot(h, 'r')
            fig.show()

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        # Combination
        assert(len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain
        hnw = np.hanning(frameSize)
        for i in range(len(e)):
            ep = ssp.Window(e[i], hnw)
            eac = ssp.Autocorrelation(ep)
            ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
            e[i] = ssp.ARExcitation(e[i], ea, eg)

    else:
        # Fail here: with no excitation there is nothing to synthesise.
        raise ValueError("Unknown synthesis method: %r" % (ex,))

    if opt.excitation:
        # Output the excitation itself rather than the resynthesis.
        s = e.flatten('C')/frameSize
    else:
        s = ssp.ARResynthesis(e, ar, g)
        if opt.ola:
            # Asymmetric window for OLA
            sw = np.hanning(frameSize+1)
            sw = np.delete(sw, -1)
            s = ssp.Window(s, sw)
            s = ssp.OverlapAdd(s)
        else:
            s = s.flatten('C')

    gain = ssp.parameter("Gain", 1.0)
    return s * gain
def decode(tuple):
    """
    Decode a speech waveform.

    An excitation is built frame by frame according to opt.glottal.
    Unless opt.excitation is set, it is then passed through
    ssp.ARResynthesis with the supplied AR parameters and either
    overlap-added (opt.ola) or flattened.

    Parameters:
        tuple: the 4-sequence (ark, g, pitch, hnr); all four must have
            one entry per frame (asserted).  When opt.glottal is 'cepgm'
            and any of opt.encode, opt.decode or opt.pitch is set, the
            last two columns of ark are read as the glottal pole angle
            and the log of its magnitude, and the remaining columns as
            the AR coefficients.

    Returns:
        The synthesised signal multiplied by the "Gain" parameter.

    Raises:
        ValueError: if opt.glottal does not name a known excitation.

    Reads the module-level opt, framePeriod, r, pcm and lpOrder, plus f
    for the 'ar', 'ceplf' and (when the poles are not in ark) 'cepgm'
    excitations.
    """
    (ark, g, pitch, hnr) = tuple
    print("Frame padding:", opt.padding)

    nFrames = len(ark)
    assert (len(g) == nFrames)
    assert (len(pitch) == nFrames)
    assert (len(hnr) == nFrames)

    # The original framer padded the ends so the number of samples to
    # synthesise is a bit less than you might think
    if opt.ola:
        frameSize = framePeriod * 2
        nSamples = framePeriod * (nFrames - 1)
    else:
        frameSize = framePeriod
        nSamples = frameSize * (nFrames - 1)

    ex = opt.glottal

    if opt.glottal == 'cepgm' and (opt.encode or opt.decode or opt.pitch):
        # The glottal pole pair travels in the last two columns of ark.
        order = ark.shape[-1] - 2
        ar = ark[:, 0:order]
        theta = ark[:, -2]
        magni = np.exp(ark[:, -1])
    else:
        ar = ark

    # Use the original AR residual; it should be a very good reconstruction.
    if ex == 'ar':
        e = ssp.ARExcitation(f, ar, g)

    # Just noise.  This is effectively a whisper synthesis.
    elif ex == 'noise':
        e = np.random.normal(size=(nFrames, frameSize))

    # Just harmonics, and with a fixed F0.  This is the classic robot
    # synthesis.
    elif ex == 'robot':
        ew = np.zeros(nSamples)
        period = int(1.0 / 200 * r)
        for i in range(0, len(ew), period):
            ew[i] = period
        e = ssp.Frame(ew, size=frameSize, period=framePeriod)

    # Synthesise harmonics plus noise in the ratio suggested by the HNR.
    elif ex == 'synth':
        # Harmonic part
        mperiod = int(1.0 / np.mean(pitch) * r)
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        pr, pg = ssp.pulse_response(gm, pcm, period=mperiod, order=lpOrder[r])
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        # Lay pulses end to end; each period comes from the pitch of the
        # frame the write position currently falls in.
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i + period] = gm.pulse(period, pcm) * weight
            i += period
            frame = i // framePeriod
        h = ssp.ARExcitation(h, pr, 1.0)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod, pad=opt.padding)

        # Noise part
        n = np.random.normal(size=nSamples)
        n = ssp.ZeroFilter(n, 1.0)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain

    # Like harmonics plus noise, but with explicit sinusoids instead of time
    # domain impulses.
    elif ex == 'sine':
        order = 20
        sine = ssp.Harmonics(r, order)
        h = np.zeros(nSamples)
        for i in range(0, len(h) - framePeriod, framePeriod):
            frame = i // framePeriod
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i + framePeriod] = (sine.sample(pitch[frame], framePeriod) *
                                    weight)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod, pad=opt.padding)
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))
        e = fn + fh * 10

    # High order linear prediction.  Synthesise the harmonics using noise to
    # excite a high order polynomial with roots resembling harmonics.
    elif ex == 'holp':
        # Some noise
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)

        # Use the noise to excite a high order AR model
        fh = np.ndarray(fn.shape)
        for i in range(len(fn)):
            hoar = ssp.ARHarmonicPoly(pitch[i], r, 0.7)
            fh[i] = ssp.ARResynthesis(fn[i], hoar,
                                      1.0 / linalg.norm(hoar)**2)
            print(i, pitch[i], linalg.norm(hoar), np.min(fh[i]),
                  np.max(fh[i]))
            print(' ', np.min(hoar), np.max(hoar))

        # Weight the noise as for the other methods
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))
        # Only the harmonic part is used; the weighted noise is not mixed in.
        e = fh

    # Shaped excitation.  The pulses are shaped by a filter to have a
    # rolloff, then added to the noise.  The resulting signal is
    # flattened using AR.
    elif ex == 'shaped':
        # Harmonic part
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        gm.angle = pcm.hertz_to_radians(np.mean(pitch) * 0.5)
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i + period] = gm.pulse(period, pcm) * weight
            i += period
            frame = i // framePeriod

        # Filter to mimic the glottal pulse
        hfilt = ssp.parameter("HFilt", None)
        hpole1 = ssp.parameter("HPole1", 0.98)
        hpole2 = ssp.parameter("HPole2", 0.8)  # read, but no filter here uses it
        angle = pcm.hertz_to_radians(np.mean(pitch)) * ssp.parameter(
            "Angle", 1.0)
        if hfilt == 'pp':
            h = ssp.ZeroFilter(h, 1.0)
            h = ssp.PolePairFilter(h, hpole1, angle)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        npole = ssp.parameter("NPole", None)
        nf = ssp.parameter("NoiseFreq", 4000)
        if npole is not None:
            n = ssp.PolePairFilter(n, npole, pcm.hertz_to_radians(nf))
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        # Combination
        assert (len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain
        # Flatten each frame with its own AR fit: the fit is made on the
        # windowed frame, the inverse filter is applied to the raw one.
        hnw = np.hanning(frameSize)
        for i in range(len(e)):
            ep = ssp.Window(e[i], hnw)
            eac = ssp.Autocorrelation(ep)
            ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
            e[i] = ssp.ARExcitation(e[i], ea, eg)

    elif ex == 'ceplf':
        omega, alpha = ssp.glottal_pole_lf(f,
                                           pcm,
                                           pitch,
                                           hnr,
                                           visual=(opt.graphic == "ceplf"))
        epsilon = ssp.parameter("Epsilon", 5000.0)
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            pu = np.zeros((period))
            T0 = pcm.period_to_seconds(period)
            print(T0)
            Te = ssp.lf_te(T0, alpha[frame], omega[frame], epsilon)
            # A falsy Te leaves this period silent.
            if Te:
                pu = ssp.pulse_lf(pu, T0, Te, alpha[frame], omega[frame],
                                  epsilon)
            h[i:i + period] = pu * weight
            i += period
            frame = i // framePeriod
        fh = ssp.Frame(h, size=frameSize, period=framePeriod, pad=opt.padding)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        # Combination
        assert (len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain
        hnw = np.hanning(frameSize)
        for i in range(len(e)):
            ep = ssp.Window(e[i], hnw)
            eac = ssp.Autocorrelation(ep)
            ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
            e[i] = ssp.ARExcitation(e[i], ea, eg)

    elif ex == 'cepgm':
        # Infer the unstable poles via complex cepstrum, then build an explicit
        # glottal model.
        if not (opt.encode or opt.decode or opt.pitch):
            theta, magni = ssp.glottal_pole_gm(f,
                                               pcm,
                                               pitch,
                                               hnr,
                                               visual=(opt.graphic == "cepgm"))

        # Unit impulses at the pitch marks.
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            h[i] = 1
            i += period
            frame = i // framePeriod
        fh = ssp.Frame(h, size=frameSize, period=framePeriod, pad=opt.padding)

        gl = ssp.MinPhaseGlottis()
        for i in range(len(fh)):
            # This is minimum phase; the glotter will invert if required.
            # Index the pole pair with i, the frame being shaped; the
            # variable `frame` is a leftover from the impulse loop above
            # and would apply one fixed pole pair to every frame.
            gl.setpolepair(np.abs(magni[i]), theta[i])
            fh[i] = gl.glotter(fh[i])
            # Normalise to a norm of sqrt(frame length), skipping frames
            # that are effectively silent.
            if linalg.norm(fh[i]) > 1e-6:
                fh[i] *= np.sqrt(len(fh[i])) / linalg.norm(fh[i])
            weight = np.sqrt(hnr[i] / (hnr[i] + 1))
            fh[i] *= weight

        if (opt.graphic == "h"):
            fig = ssp.Figure(1, 1)
            hPlot = fig.subplot()
            hPlot.plot(h, 'r')
            fig.show()

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod, pad=opt.padding)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        # Combination
        assert (len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain
        hnw = np.hanning(frameSize)
        for i in range(len(e)):
            ep = ssp.Window(e[i], hnw)
            eac = ssp.Autocorrelation(ep)
            ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
            e[i] = ssp.ARExcitation(e[i], ea, eg)

    else:
        # Fail here: with no excitation there is nothing to synthesise.
        raise ValueError("Unknown synthesis method: %r" % (ex,))

    if opt.excitation:
        # Output the excitation itself rather than the resynthesis.
        s = e.flatten('C') / frameSize
    else:
        s = ssp.ARResynthesis(e, ar, g)
        if opt.ola:
            # Asymmetric window for OLA
            sw = np.hanning(frameSize + 1)
            sw = np.delete(sw, -1)
            s = ssp.Window(s, sw)
            s = ssp.OverlapAdd(s)
        else:
            s = s.flatten('C')

    gain = ssp.parameter("Gain", 1.0)
    return s * gain
#
# Compute a frame-wise spectrum of the wav file named on the command line.
# The 'Type' parameter selects a periodogram ('psd', the default) or an
# AR-model spectrum ('ar').
from optparse import OptionParser
op = OptionParser()
(option, arg) = op.parse_args()
if (len(arg) < 1):
    # Parenthesised single string: prints the same on Python 2 and 3.
    print("Need one arg")
    # SystemExit does not rely on the site-provided exit() helper.
    raise SystemExit(1)
file = arg[0]

import ssp
import numpy as np

# Load and process
pcm = ssp.PulseCodeModulation()
a = pcm.WavSource(file)
if (ssp.parameter('Pre', None)):
    a = ssp.ZeroFilter(a)
framePeriod = pcm.seconds_to_period(0.01)
frameSize = pcm.seconds_to_period(0.02, 'atleast')
f = ssp.Frame(a, size=frameSize, period=framePeriod)
# Build the window one sample too long and drop the last sample.
w = ssp.nuttall(frameSize+1)
w = np.delete(w, -1)
wf = ssp.Window(f, w)
type = ssp.parameter('Type', 'psd')
if type == 'psd':
    p = ssp.Periodogram(wf)
    # Keep the first half of the columns plus one.  Floor division keeps
    # the slice bound an integer on Python 3 as well as Python 2.
    p = p[:, :p.shape[1]//2+1]
elif type == 'ar':
    a = ssp.Autocorrelation(wf)
    a, g = ssp.ARLevinson(a, pcm.speech_ar_order())
    p = ssp.ARSpectrum(a, g, nSpec=128)
# Framing and LP order: these values apply unless the rate is 16 kHz, in
# which case the larger settings below replace them.
# NOTE(review): frameSize for the non-16 kHz case is not assigned in this
# fragment -- presumably set on an earlier line; confirm.
framePeriod = 80
lpOrder = 10
if pcm.rate == 16000:
    frameSize = 400
    framePeriod = 160
    lpOrder = 12

# Basic preprocessing
g = np.ndarray((0))  # empty placeholder; replaced by the ARLevinson result below
a = ssp.ZeroFilter(a)
f = ssp.Frame(a, size=frameSize, period=framePeriod, pad=False)
f = ssp.Window(f, ssp.nuttall(frameSize))

# Next part depends on user
frontend = ssp.parameter("FrontEnd", "ar")
if frontend == "ar":
    # Autocorrelation of the windowed frames, all-pass warped with the
    # per-rate coefficient from ssp.mel, then Levinson recursion.
    a = ssp.Autocorrelation(f)
    a = ssp.AutocorrelationAllPassWarp(a, alpha=ssp.mel[pcm.rate], size=lpOrder + 1)
    a, g = ssp.ARLevinson(a, lpOrder)
    # ridge = Parameter('Ridge', 0.1)
    # a, g = ARRidge(a, lpOrder, ridge)
    # a, g = ARLasso(a, lpOrder, ridge)
elif frontend == "snr":
    # Periodogram -> noise estimate -> SNR spectrum, then back to an
    # autocorrelation (input='psd') for the Levinson recursion.
    a = ssp.Periodogram(f)
    n = ssp.Noise(a)
    a = ssp.SNRSpectrum(a, n * 0.1)
    a = ssp.Autocorrelation(a, input='psd')
    a, g = ssp.ARLevinson(a, lpOrder)
def decode(params):
    """
    Decode a speech waveform.

    An excitation is built frame by frame according to the 'Excitation'
    parameter (default 'synth').  Unless opt.excitation is set, it is
    then passed through ssp.ARResynthesis and either overlap-added
    (opt.ola) or flattened.

    Parameters:
        params: the 4-sequence (ar, g, pitch, hnr); all four must have
            one entry per frame (asserted).  The tuple is unpacked in the
            body because tuple parameters in the signature are a syntax
            error on Python 3; callers still pass a single tuple.

    Returns:
        The synthesised signal multiplied by the "Gain" parameter.

    Raises:
        ValueError: if 'Excitation' does not name a known method.

    Reads the module-level opt, framePeriod, r, pcm and lpOrder, plus f
    for the 'ar' and 'noise' excitations.
    """
    (ar, g, pitch, hnr) = params

    nFrames = len(ar)
    assert(len(g) == nFrames)
    assert(len(pitch) == nFrames)
    assert(len(hnr) == nFrames)

    # The original framer padded the ends so the number of samples to
    # synthesise is a bit less than you might think
    if opt.ola:
        frameSize = framePeriod * 2
        nSamples = framePeriod * (nFrames-1)
    else:
        frameSize = framePeriod
        nSamples = frameSize * (nFrames-1)

    ex = ssp.parameter('Excitation', 'synth')

    # Use the original AR residual; it should be a very good
    # reconstruction.
    if ex == 'ar':
        e = ssp.ARExcitation(f, ar, g)

    # Just noise.  This is effectively a whisper synthesis.
    elif ex == 'noise':
        e = np.random.normal(size=f.shape)

    # Just harmonics, and with a fixed F0.  This is the classic robot
    # synthesis.
    elif ex == 'robot':
        ew = np.zeros(nSamples)
        period = int(1.0 / 200 * r)
        for i in range(0, len(ew), period):
            ew[i] = period
        e = ssp.Frame(ew, size=frameSize, period=framePeriod)

    # Synthesise harmonics plus noise in the ratio suggested by the
    # HNR.
    elif ex == 'synth':
        # Harmonic part
        mperiod = int(1.0 / np.mean(pitch) * r)
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        pr, pg = ssp.pulse_response(gm, pcm, period=mperiod, order=lpOrder[r])
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        # Lay pulses end to end; each period comes from the pitch of the
        # frame the write position currently falls in.
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+period] = gm.pulse(period, pcm) * weight
            i += period
            frame = i // framePeriod
        h = ssp.ARExcitation(h, pr, 1.0)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)

        # Noise part
        n = np.random.normal(size=nSamples)
        n = ssp.ZeroFilter(n, 1.0)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain

    # Like harmonics plus noise, but with explicit sinusoids instead
    # of time domain impulses.
    elif ex == 'sine':
        order = 20
        sine = ssp.Harmonics(r, order)
        h = np.zeros(nSamples)
        for i in range(0, len(h)-framePeriod, framePeriod):
            frame = i // framePeriod
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+framePeriod] = (
                sine.sample(pitch[frame], framePeriod) * weight
            )
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))
        e = fn + fh*10

    # High order linear prediction.  Synthesise the harmonics using
    # noise to excite a high order polynomial with roots resembling
    # harmonics.
    elif ex == 'holp':
        # Some noise
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)

        # Use the noise to excite a high order AR model
        fh = np.ndarray(fn.shape)
        for i in range(len(fn)):
            hoar = ssp.ARHarmonicPoly(pitch[i], r, 0.7)
            fh[i] = ssp.ARResynthesis(fn[i], hoar, 1.0 / linalg.norm(hoar)**2)
            # Pre-formatted strings print the same text on Python 2 and 3.
            print("%s %s %s %s %s" % (i, pitch[i], linalg.norm(hoar),
                                      np.min(fh[i]), np.max(fh[i])))
            print("  %s %s" % (np.min(hoar), np.max(hoar)))

        # Weight the noise as for the other methods
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))
        # Only the harmonic part is used; the weighted noise is not mixed in.
        e = fh

    # Shaped excitation.  The pulses are shaped by a filter to have a
    # rolloff, then added to the noise.  The resulting signal is
    # flattened using AR.
    elif ex == 'shaped':
        # Harmonic part
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        gm.angle = pcm.hertz_to_radians(np.mean(pitch)*0.5)
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+period] = gm.pulse(period, pcm) * weight
            i += period
            frame = i // framePeriod

        # Filter to mimic the glottal pulse
        hfilt = ssp.parameter("HFilt", None)
        hpole1 = ssp.parameter("HPole1", 0.98)
        hpole2 = ssp.parameter("HPole2", 0.8)  # read, but no filter here uses it
        angle = pcm.hertz_to_radians(np.mean(pitch)) * ssp.parameter("Angle", 1.0)
        if hfilt == 'pp':
            h = ssp.ZeroFilter(h, 1.0)
            h = ssp.PolePairFilter(h, hpole1, angle)
        # Frame the harmonic part; the combination below needs fh.
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        npole = ssp.parameter("NPole", None)
        nf = ssp.parameter("NoiseFreq", 4000)
        if npole is not None:
            n = ssp.PolePairFilter(n, npole, pcm.hertz_to_radians(nf))
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        # Combination
        assert(len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain
        # Flatten each frame with its own AR fit: the fit is made on the
        # windowed frame, the inverse filter is applied to the raw one.
        hnw = np.hanning(frameSize)
        for i in range(len(e)):
            ep = ssp.Window(e[i], hnw)
            eac = ssp.Autocorrelation(ep)
            ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
            e[i] = ssp.ARExcitation(e[i], ea, eg)

    else:
        # Fail here: with no excitation there is nothing to synthesise.
        raise ValueError("Unknown synthesis method: %r" % (ex,))

    if opt.excitation:
        # Output the excitation itself rather than the resynthesis.
        s = e.flatten('C')/frameSize
    else:
        s = ssp.ARResynthesis(e, ar, g)
        if opt.ola:
            # Asymmetric window for OLA
            sw = np.hanning(frameSize+1)
            sw = np.delete(sw, -1)
            s = ssp.Window(s, sw)
            s = ssp.OverlapAdd(s)
        else:
            s = s.flatten('C')

    gain = ssp.parameter("Gain", 1.0)
    return s * gain
framePeriod = 80 lpOrder = 10 if pcm.rate == 16000: frameSize = 400 framePeriod = 160 lpOrder = 12 # Basic preprocessing g = np.ndarray((0)) a = ssp.ZeroFilter(a) f = ssp.Frame(a, size=frameSize, period=framePeriod, pad=False) f = ssp.Window(f, ssp.nuttall(frameSize)) # Next part depends on user frontend = ssp.parameter("FrontEnd", "ar") if frontend == "ar": a = ssp.Autocorrelation(f) a = ssp.AutocorrelationAllPassWarp(a, alpha=ssp.mel[pcm.rate], size=lpOrder+1) a, g = ssp.ARLevinson(a, lpOrder) # ridge = Parameter('Ridge', 0.1) # a, g = ARRidge(a, lpOrder, ridge) # a, g = ARLasso(a, lpOrder, ridge) elif frontend == "snr": a = ssp.Periodogram(f) n = ssp.Noise(a) a = ssp.SNRSpectrum(a, n * 0.1) a = ssp.Autocorrelation(a, input='psd') a, g = ssp.ARLevinson(a, lpOrder) a = ssp.AutocorrelationAllPassWarp(a, alpha=ssp.mel[pcm.rate],
def decode(params):
    """
    Decode a speech waveform.

    An excitation is built frame by frame according to the 'Excitation'
    parameter (default 'synth').  Unless opt.excitation is set, it is
    then passed through ssp.ARResynthesis and either overlap-added
    (opt.ola) or flattened.

    Parameters:
        params: the 4-sequence (ar, g, pitch, hnr); all four must have
            one entry per frame (asserted).  The tuple is unpacked in the
            body because tuple parameters in the signature are a syntax
            error on Python 3; callers still pass a single tuple.

    Returns:
        The synthesised signal multiplied by the "Gain" parameter.

    Raises:
        ValueError: if 'Excitation' does not name a known method.

    Reads the module-level opt, framePeriod, r, pcm and lpOrder, plus f
    for the 'ar' and 'noise' excitations.
    """
    (ar, g, pitch, hnr) = params

    nFrames = len(ar)
    assert(len(g) == nFrames)
    assert(len(pitch) == nFrames)
    assert(len(hnr) == nFrames)

    # The original framer padded the ends so the number of samples to
    # synthesise is a bit less than you might think
    if opt.ola:
        frameSize = framePeriod * 2
        nSamples = framePeriod * (nFrames-1)
    else:
        frameSize = framePeriod
        nSamples = frameSize * (nFrames-1)

    ex = ssp.parameter('Excitation', 'synth')

    # Use the original AR residual; it should be a very good
    # reconstruction.
    if ex == 'ar':
        e = ssp.ARExcitation(f, ar, g)

    # Just noise.  This is effectively a whisper synthesis.
    elif ex == 'noise':
        e = np.random.normal(size=f.shape)

    # Just harmonics, and with a fixed F0.  This is the classic robot
    # synthesis.
    elif ex == 'robot':
        ew = np.zeros(nSamples)
        period = int(1.0 / 200 * r)
        for i in range(0, len(ew), period):
            ew[i] = period
        e = ssp.Frame(ew, size=frameSize, period=framePeriod)

    # Synthesise harmonics plus noise in the ratio suggested by the
    # HNR.
    elif ex == 'synth':
        # Harmonic part
        mperiod = int(1.0 / np.mean(pitch) * r)
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        pr, pg = ssp.pulse_response(gm, pcm, period=mperiod, order=lpOrder[r])
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        # Lay pulses end to end; each period comes from the pitch of the
        # frame the write position currently falls in.
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+period] = gm.pulse(period, pcm) * weight
            i += period
            frame = i // framePeriod
        h = ssp.ARExcitation(h, pr, 1.0)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)

        # Noise part
        n = np.random.normal(size=nSamples)
        n = ssp.ZeroFilter(n, 1.0)  # Include the radiation impedance
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain

    # Like harmonics plus noise, but with explicit sinusoids instead
    # of time domain impulses.
    elif ex == 'sine':
        order = 20
        sine = ssp.Harmonics(r, order)
        h = np.zeros(nSamples)
        for i in range(0, len(h)-framePeriod, framePeriod):
            frame = i // framePeriod
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+framePeriod] = (
                sine.sample(pitch[frame], framePeriod) * weight
            )
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))
        e = fn + fh*10

    # High order linear prediction.  Synthesise the harmonics using
    # noise to excite a high order polynomial with roots resembling
    # harmonics.
    elif ex == 'holp':
        # Some noise
        n = np.random.normal(size=nSamples)
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)

        # Use the noise to excite a high order AR model
        fh = np.ndarray(fn.shape)
        for i in range(len(fn)):
            hoar = ssp.ARHarmonicPoly(pitch[i], r, 0.7)
            fh[i] = ssp.ARResynthesis(fn[i], hoar, 1.0 / linalg.norm(hoar)**2)
            # Pre-formatted strings print the same text on Python 2 and 3.
            print("%s %s %s %s %s" % (i, pitch[i], linalg.norm(hoar),
                                      np.min(fh[i]), np.max(fh[i])))
            print("  %s %s" % (np.min(hoar), np.max(hoar)))

        # Weight the noise as for the other methods
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))
        # Only the harmonic part is used; the weighted noise is not mixed in.
        e = fh

    # Shaped excitation.  The pulses are shaped by a filter to have a
    # rolloff, then added to the noise.  The resulting signal is
    # flattened using AR.
    elif ex == 'shaped':
        # Harmonic part
        gm = ssp.GlottalModel(ssp.parameter('Pulse', 'impulse'))
        gm.angle = pcm.hertz_to_radians(np.mean(pitch)*0.5)
        h = np.zeros(nSamples)
        i = 0
        frame = 0
        while i < nSamples and frame < len(pitch):
            period = int(1.0 / pitch[frame] * r)
            if i + period > nSamples:
                break
            weight = np.sqrt(hnr[frame] / (hnr[frame] + 1))
            h[i:i+period] = gm.pulse(period, pcm) * weight
            i += period
            frame = i // framePeriod

        # Filter to mimic the glottal pulse; 'HFilt' picks the filter and
        # leaving it unset skips filtering.
        hfilt = ssp.parameter("HFilt", None)
        hpole1 = ssp.parameter("HPole1", 0.98)
        hpole2 = ssp.parameter("HPole2", 0.8)
        angle = pcm.hertz_to_radians(np.mean(pitch)) * ssp.parameter("Angle", 1.0)
        if hfilt == 'pp':
            h = ssp.ZeroFilter(h, 1.0)
            h = ssp.PolePairFilter(h, hpole1, angle)
        if hfilt == 'g':
            h = ssp.GFilter(h, hpole1, angle, hpole2)
        if hfilt == 'p':
            h = ssp.PFilter(h, hpole1, angle, hpole2)
        fh = ssp.Frame(h, size=frameSize, period=framePeriod)

        # Noise part
        n = np.random.normal(size=nSamples)
        zero = ssp.parameter("NoiseZero", 1.0)
        n = ssp.ZeroFilter(n, zero)  # Include the radiation impedance
        npole = ssp.parameter("NPole", None)
        nf = ssp.parameter("NoiseFreq", 4000)
        if npole is not None:
            n = ssp.PolePairFilter(n, npole, pcm.hertz_to_radians(nf))
        fn = ssp.Frame(n, size=frameSize, period=framePeriod)
        for i in range(len(fn)):
            fn[i] *= np.sqrt(1.0 / (hnr[i] + 1))

        # Combination
        assert(len(fh) == len(fn))
        hgain = ssp.parameter("HGain", 1.0)
        e = fn + fh * hgain
        # Flatten each frame with its own AR fit: the fit is made on the
        # windowed frame, the inverse filter is applied to the raw one.
        hnw = np.hanning(frameSize)
        for i in range(len(e)):
            ep = ssp.Window(e[i], hnw)
            eac = ssp.Autocorrelation(ep)
            ea, eg = ssp.ARLevinson(eac, order=lpOrder[r])
            e[i] = ssp.ARExcitation(e[i], ea, eg)

    else:
        # Fail here: with no excitation there is nothing to synthesise.
        raise ValueError("Unknown synthesis method: %r" % (ex,))

    if opt.excitation:
        # Output the excitation itself rather than the resynthesis.
        s = e.flatten('C')/frameSize
    else:
        s = ssp.ARResynthesis(e, ar, g)
        if opt.ola:
            # Asymmetric window for OLA
            sw = np.hanning(frameSize+1)
            sw = np.delete(sw, -1)
            s = ssp.Window(s, sw)
            s = ssp.OverlapAdd(s)
        else:
            s = s.flatten('C')

    gain = ssp.parameter("Gain", 1.0)
    return s * gain