def computeSNR(inputFile, window, M, N, H):
    """
    Measure how well an STFT analysis/synthesis round trip reproduces a sound.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats, in dB.
        SNR1 uses the residual over the whole signal, SNR2 the residual with
        one window length removed from each end. All energies are measured on
        STFT magnitude spectra.
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # Round trip: analysis followed by resynthesis.
    mX, pX = STFT.stftAnal(x, fs, w, N, H)
    y = STFT.stftSynth(mX, pX, M, H)

    # The synthesized signal is cropped to the input length before subtracting.
    y_cropped = y[:x.size]
    edge = w.size
    residuals = (x - y_cropped, x[edge:-edge] - y_cropped[edge:-edge])

    def db_to_energy(mag_db):
        # Magnitudes are treated as dB: back to linear, then sum of squares.
        return np.sum((10 ** (mag_db / 20)) ** 2)

    # The residuals are analysed with the same STFT settings as the input.
    noise_energies = [
        db_to_energy(STFT.stftAnal(residual, fs, w, N, H)[0])
        for residual in residuals
    ]
    signal_energy = db_to_energy(mX)

    snr1, snr2 = (10 * np.log10(signal_energy / e) for e in noise_energies)
    return snr1, snr2
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two signal-to-noise ratios for an STFT analysis/synthesis round trip.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        The function returns a python tuple of both SNR values (SNR1, SNR2),
        both floats in dB. The noise of SNR2 excludes w.size samples at each
        end of the signal; the noise of SNR1 covers the full signal.
    """
    (fs, x) = UF.wavread(inputFile)
    w = get_window(window, M)

    mX, pX = STFT.stftAnal(x, fs, w, N, H)
    # Keep only as many synthesized samples as the input has.
    y = STFT.stftSynth(mX, pX, M, H)[:x.size]

    interior = slice(w.size, -w.size)
    noise_all = x - y
    noise_interior = x[interior] - y[interior]

    # Energies are taken from the dB magnitude spectrograms of the noises.
    mag_noise_all, _ = STFT.stftAnal(noise_all, fs, w, N, H)
    mag_noise_interior, _ = STFT.stftAnal(noise_interior, fs, w, N, H)

    def linear_energy(mag_db):
        linear = 10 ** (mag_db / 20)
        return np.sum(linear ** 2)

    signal_energy = linear_energy(mX)
    snr1 = 10 * np.log10(signal_energy / linear_energy(mag_noise_all))
    snr2 = 10 * np.log10(signal_energy / linear_energy(mag_noise_interior))
    return snr1, snr2
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames, containing
        the energy envelope of the signal in decibels (dB):
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    # Band edges in Hz, taken from the output description above.
    f_split = 3000.0
    f_top = 10000.0

    (fs, x) = UF.wavread(inputFile)
    w = get_window(window, M)
    (xmX, xpX) = stft.stftAnal(x, fs, w, N, H)

    # Bin k sits at k*fs/N Hz. Selecting bins by strict comparison replaces
    # the open-ended index searches, which could not terminate when a band
    # edge fell exactly on a bin (or below bin 1).
    bin_freqs = np.arange(xmX.shape[1]) * float(fs) / N
    low_band = (bin_freqs > 0) & (bin_freqs < f_split)
    high_band = (bin_freqs > f_split) & (bin_freqs < f_top)

    # The spectrogram is treated as dB magnitudes: convert to linear first.
    linear = 10 ** (xmX / 20.0)

    out = np.zeros((xmX.shape[0], 2))
    # An empty band yields zero energy and therefore -inf dB.
    out[:, 0] = 10.0 * np.log10(np.sum(linear[:, low_band] ** 2, axis=1))
    out[:, 1] = 10.0 * np.log10(np.sum(linear[:, high_band] ** 2, axis=1))
    return out
def computeEngEnv(inputFile, window, M, N, H):
    """
    Return the frame-wise energy envelope (dB) of a sound in two bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array with shape Kx2, K = number of frames:
        column 0: energy envelope in band 0 < f < 3000 Hz (in dB)
        column 1: energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    w = get_window(window, M, False)
    fs, x = UF.wavread(inputFile)
    mag_db, _phase = stft.stftAnal(x, w, N, H)

    # dB magnitudes back to linear amplitude.
    linear = 10 ** (mag_db / 20)

    # Exclusive end index of each band (one past the last bin below the edge).
    stop_low = int(np.floor(3000 * N / fs)) + 1
    stop_high = int(np.floor(10000 * N / fs)) + 1

    def band_energy_db(first, stop):
        band = linear[:, first:stop]
        return 10 * np.log10(np.sum(band * band, axis=1))

    # Bin 0 (DC) is left out of the low band.
    low_db = band_energy_db(1, stop_low)
    high_db = band_energy_db(stop_low, stop_high)
    return np.vstack((low_db, high_db)).T
def sineODF(file='../../../../../audioDSP_course/assignments/sms-tools/sounds/piano.wav'):
    """
    Track sinusoids in a sound and draw the tracks over its spectrogram.

    Input:
        file (string): wav file including the path
    Output:
        fTrackEst: the frequency tracks returned by SM.sineModelAnal

    The plot is drawn on the current matplotlib figure; plt.show() is not
    called here.
    """
    fs, x = UF.wavread(file)

    # Analysis parameters
    M = 1024                                # window size
    H = int(M / 3)                          # hop size
    t = -80.0                               # threshold (presumably dB - confirm)
    window = 'blackman'                     # window type
    N = int(pow(2, np.ceil(np.log2(M))))    # FFT size: power of two >= M
    maxnSines = 10                          # maximum simultaneous sines
    minSineDur = 0.1                        # minimal duration of sines
    freqDevOffset = 30                      # frequency deviation at 0Hz
    freqDevSlope = 0.001                    # slope increase of the frequency deviation

    w = get_window(window, M)
    tStamps = genTimeStamps(len(x), M, fs, H)
    fTrackEst, mTrackEst, pTrackEst = SM.sineModelAnal(
        x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)
    # Reference tracks are still generated but are not plotted.
    fTrackTrue = genTrueFreqTracks(tStamps)

    # Spectrogram up to maxplotfreq with the estimated tracks on top.
    mX, pX = stft.stftAnal(x, fs, w, N, H)
    maxplotfreq = 1500.0
    binFreq = fs * np.arange(N * maxplotfreq / fs) / N
    # Slice bounds must be integers; NumPy rejects a float bound.
    lastBin = int(N * maxplotfreq / fs + 1)
    plt.pcolormesh(tStamps, binFreq, np.transpose(mX[:, :lastBin]), cmap='hot_r')
    plt.plot(tStamps, fTrackEst, color='y', linewidth=2.0)
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.autoscale(tight=True)
    return fTrackEst
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the energy envelope (dB) per STFT frame in a low and a high band.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames:
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    fs, x = UF.wavread(inputFile)

    # Exclusive upper bin index of each band.
    low_stop = int(np.ceil(float(3000) * N / fs))
    high_stop = int(np.ceil(float(10000) * N / fs))

    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)

    def band_db(first, stop):
        # dB magnitudes -> linear -> summed squares per frame -> dB
        band = mX[:, first:stop]
        return 10 * np.log10(np.sum((10 ** (band / 20)) ** 2, axis=1))

    # Bin 0 is skipped in the low band.
    low_db = band_db(1, low_stop)
    high_db = band_db(low_stop, high_stop)
    return np.transpose(np.array([low_db, high_db]))
def plotSpectogramF0Segments(x, fs, w, N, H, f0, segments):
    """
    Plot the f0 contour, and the segments found in it, on top of the spectrogram.

    x, fs, w, N and H are passed to stft.stftAnal (same parameters as used for
    the analysis). f0 holds one value per frame; each row of segments gives the
    (start, end) frame indexes of one segment, end exclusive.
    """
    top_freq = 1000.0       # highest frequency shown, in Hz
    label_size = 16

    figure = plt.figure()
    axes = figure.add_subplot(111)

    # Spectrogram restricted to the bins up to top_freq, one column per frame.
    mX, pX = stft.stftAnal(x, fs, w, N, H)
    n_bins = int(N * (top_freq / fs)) + 1
    spec = np.transpose(mX[:, :n_bins])
    frame_times = np.arange(spec.shape[1]) * H / float(fs)
    bin_freqs = np.arange(spec.shape[0]) * fs / float(N)

    plt.pcolormesh(frame_times, bin_freqs, spec)
    plt.plot(frame_times, f0, color='k', linewidth=5)
    # Each segment is redrawn in a lighter colour over the f0 line.
    for start, end in segments[:, :2]:
        plt.plot(frame_times[start:end], f0[start:end],
                 color='#A9E2F3', linewidth=1.5)

    plt.autoscale(tight=True)
    plt.ylabel('Frequency (Hz)', fontsize=label_size)
    plt.xlabel('Time (s)', fontsize=label_size)
    plt.legend(('f0', 'segments'))

    # Fix the plot's aspect ratio from the current axis limits.
    x_min, x_max = axes.get_xlim()
    y_min, y_max = axes.get_ylim()
    axes.set_aspect((x_max - x_min) / (2.0 * (y_max - y_min)))
    plt.autoscale(tight=True)
    plt.show()
def plotSpectogramF0Segments(x, fs, w, N, H, f0, segments):
    """
    Draw the spectrogram of x with the f0 contour and its segments overlaid.

    x, w, N and H are passed to stft.stftAnal (same parameters as used for the
    analysis); fs converts frames and bins to seconds and Hz. f0 holds one
    value per frame and segments[ii] is a (start, end) pair of frame indexes.
    """
    maxplotfreq = 1000.0    # frequency range to plot
    fontSize = 16

    fig = plt.figure()
    ax = fig.add_subplot(111)

    mX, pX = stft.stftAnal(x, w, N, H)
    lastBin = int(N * (maxplotfreq / fs)) + 1
    # Rows are frequency bins and columns are frames after the transpose.
    spectrogram = mX[:, :lastBin].T
    numBins, numFrames = spectrogram.shape
    timeStamps = np.arange(numFrames) * H / float(fs)
    binFreqs = np.arange(numBins) * fs / float(N)

    plt.pcolormesh(timeStamps, binFreqs, spectrogram)
    plt.plot(timeStamps, f0, color='k', linewidth=5)
    for ii in range(segments.shape[0]):
        span = slice(segments[ii, 0], segments[ii, 1])
        plt.plot(timeStamps[span], f0[span], color='#A9E2F3', linewidth=1.5)

    plt.autoscale(tight=True)
    plt.ylabel('Frequency (Hz)', fontsize=fontSize)
    plt.xlabel('Time (s)', fontsize=fontSize)
    plt.legend(('f0', 'segments'))

    # Aspect ratio derived from the axis extents.
    xLim = ax.get_xlim()
    yLim = ax.get_ylim()
    width = xLim[1] - xLim[0]
    height = yLim[1] - yLim[0]
    ax.set_aspect(width / (2.0 * height))
    plt.autoscale(tight=True)
    plt.show()
def computeSNR(inputFile, window, M, N, H):
    """
    Compute time-domain SNR values for an STFT analysis/synthesis round trip.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats, in dB. The signal energy is that
        of the full synthesized output for both values; the noise of SNR2
        leaves out w.size samples at each end.
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M, False)

    mX, pX = stft.stftAnal(x, w, N, H)
    y = stft.stftSynth(mX, pX, M, H)

    # Residuals between the input and the (cropped) resynthesis.
    y_cropped = y[:x.size]
    skip = w.size
    noise_full = x - y_cropped
    noise_inner = x[skip:-skip] - y_cropped[skip:-skip]

    def sum_of_squares(samples):
        return np.sum(np.abs(samples) ** 2)

    # Note: the reference energy uses y as returned, not the cropped copy.
    reference = sum_of_squares(y)
    SNR1 = 10 * np.log10(reference / sum_of_squares(noise_full))
    SNR2 = 10 * np.log10(reference / sum_of_squares(noise_inner))
    return SNR1, SNR2
def computeODF(inputFile, window, M, N, H):
    """
    Compute an onset detection function (ODF) in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        A numpy array with two columns:
        ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
        ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # Only the magnitude spectrogram is needed; convert it from dB to linear.
    mag_db = stft.stftAnal(x, fs, w, N, H)[0]
    linear = 10 ** (mag_db / 20.0)

    # Last bin index inside each band.
    last_low = int(N * 3000.0 / fs)
    last_high = int(N * 10000.0 / fs)

    low_band = linear[:, 1:last_low + 1]                # bin 0 is excluded
    high_band = linear[:, last_low + 1:last_high + 1]

    # odf() is defined elsewhere in the file; it receives linear magnitudes.
    odf_low = odf(low_band)
    odf_high = odf(high_band)
    return np.column_stack((odf_low, odf_high))
def computeSNR(inputFile, window, M, N, H):
    """
    Compute the SNR of an STFT round trip over the full and the trimmed signal.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats, in dB. SNR1 uses all samples;
        SNR2 drops M samples at each end of both the signal and the noise.
    """
    w = get_window(window, M, False)
    (fs, x) = UF.wavread(inputFile)
    (mX, pX) = stft.stftAnal(x, w, N, H)
    y = stft.stftSynth(mX, pX, M, H)

    # Reconstruction error, with y cropped to the input length.
    noise = x - y[:x.size]

    def energy(samples):
        magnitude = abs(samples)
        return np.sum(magnitude * magnitude)

    inner = slice(M, -M)
    SNR1 = 10 * np.log10(energy(x) / energy(noise))
    SNR2 = 10 * np.log10(energy(x[inner]) / energy(noise[inner]))
    return (SNR1, SNR2)
def main(inputFile , window='blackman', M=601, N=1024, t=-100, minSineDur=0.1,
         nH=100, minf0=350, maxf0=700, f0et=5, harmDevSlope=0.01):
    """
    Run the harmonic plus residual (HPR) model on a sound and write the results.

    inputFile: input sound file
    window, M, N: analysis window type, window size and fft size
    t, minSineDur, nH, minf0, maxf0, f0et, harmDevSlope: passed unchanged to
        HPR.hprModelAnal

    Three wav files are written under 'output_sounds/': the harmonic part, the
    residual and their sum. Nothing is returned.
    """
    Ns = 512    # size of fft used in synthesis
    H = 128     # hop size (has to be 1/4 of Ns)

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # Harmonic analysis; xr is the residual signal.
    hfreq, hmag, hphase, xr = HPR.hprModelAnal(
        x, fs, w, N, H, t, minSineDur, nH, minf0, maxf0, f0et, harmDevSlope)

    # Spectrogram of the residual (not used further in this function).
    mXr, pXr = STFT.stftAnal(xr, fs, w, N, H)

    y, yh = HPR.hprModelSynth(hfreq, hmag, hphase, xr, Ns, H, fs)

    # Output names reuse the input file name without its last four characters.
    prefix = 'output_sounds/' + os.path.basename(inputFile)[:-4]
    outputs = (
        (yh, prefix + '_hprModel_sines.wav'),
        (xr, prefix + '_hprModel_residual.wav'),
        (y, prefix + '_hprModel.wav'),
    )
    for samples, path in outputs:
        UF.wavwrite(samples, fs, path)
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute, frame by frame, the energy (dB) of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames:
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M, False)
    xmX, xpX = stft.stftAnal(x, w, N, H)

    def frame_energies_db(frame_db):
        # dB magnitudes of one frame -> linear amplitude
        linear = pow(10, frame_db / 20)
        freq = np.arange(linear.size) * fs / N
        bands = (
            (freq > 0) & (freq < 3000),
            (freq > 3000) & (freq < 10000),
        )
        # Bins outside the band are zeroed rather than removed before summing.
        energies = [
            np.sum(np.square(abs(np.where(in_band, linear, 0))))
            for in_band in bands
        ]
        return [10 * np.log10(energy) for energy in energies]

    return np.asarray([frame_energies_db(frame) for frame in xmX])
def computeSNR(inputFile, window, M, N, H):
    """
    Compute two time-domain SNR values for an STFT analysis/synthesis round trip.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats, in dB. Both use the energy of
        the whole input as signal; the noise of SNR2 excludes M samples at
        each end.
    """
    def sum_squares(samples):
        return np.sum(samples ** 2)

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)

    # The resynthesis is cropped to the input length before comparing.
    y = stft.stftSynth(mX, pX, M, H)[:x.size]

    noise_full = x - y
    noise_inner = x[M:-M] - y[M:-M]

    signal_energy = sum_squares(x)
    SNR1 = 10 * np.log10(signal_energy / sum_squares(noise_full))
    SNR2 = 10 * np.log10(signal_energy / sum_squares(noise_inner))
    return (SNR1, SNR2)
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the energy envelope of a sound in two bands, one row per frame.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames:
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    low_cutoff = 3000
    high_cutoff = 10000

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)
    linear = 10 ** (mX / 20)

    # Frequency in Hz of each of the N FFT bins.
    frame_seconds = float(N) / fs
    bin_freqs = np.arange(N) / frame_seconds

    # argmin/argmax on a boolean array return the first False/True position.
    low_stop = np.argmin(bin_freqs < low_cutoff)      # first bin not below 3000 Hz
    high_start = np.argmax(bin_freqs > low_cutoff)    # first bin above 3000 Hz
    high_stop = np.argmin(bin_freqs < high_cutoff)    # first bin not below 10000 Hz

    # bandE is defined elsewhere in the file; presumably it returns the per-frame
    # band energy in dB for the given bin range - confirm against its definition.
    low_env = bandE(linear, 1, low_stop)
    high_env = bandE(linear, high_start, high_stop)
    return np.vstack((low_env, high_env)).T
def compute_eng_env(inputFile, window, M, N, H):
    """
    Compute and plot the energy envelope of a sound in two frequency bands.

    inputFile, window, M, N and H are the sound file name, the analysis window
    type and size, the FFT size and the hop size of the STFT.

    Returns (fs, mX, env): the sampling rate, the magnitude spectrogram from
    stft.stftAnal, and an array with one row per frame holding the band energy
    in dB for 0 < f < 3000 Hz (column 0) and 3000 < f < 10000 Hz (column 1).
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)
    mXlinear = 10.0 ** (mX / 20.0)

    # Bin indexes falling strictly inside each band.
    bins = np.arange(0, N) * fs / N
    in_low = (bins > 0) & (bins < 3000.0)
    in_high = (bins > 3000) & (bins < 10000.0)
    band_bins = (np.where(in_low)[0], np.where(in_high)[0])

    env = np.zeros(shape=(mX.shape[0], 2))
    for frame, spectrum in enumerate(mXlinear):
        for column, selected in enumerate(band_bins):
            env[frame, column] = 10.0 * np.log10(sum(spectrum[selected] ** 2))

    # Plotting helper defined elsewhere in the file.
    title = 'mX ({}), M={}, N={}, H={}'.format(inputFile, M, N, H)
    plot_spectrogram_with_energy_envelope(mX, env, M, N, H, fs, title)

    return fs, mX, env
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames, containing
        the energy envelope of the signal in decibels (dB):
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mx, px = stft.stftAnal(x, w, N, H)

    # dB magnitudes -> linear amplitude
    linear = 10 ** (mx / 20.)

    # Frequency of every bin; bands are selected with strict inequalities so a
    # bin lying exactly on 3000 Hz belongs to neither band, as documented.
    bin_freqs = fs * np.arange(linear.shape[1]) / (N + 0.0)
    low_band = (bin_freqs > 0) & (bin_freqs < 3000)
    high_band = (bin_freqs > 3000) & (bin_freqs < 10000)

    # One vectorized sum per band replaces the per-bin Python loops.
    low_db = 10 * np.log10(np.sum(linear[:, low_band] ** 2, axis=1))
    high_db = 10 * np.log10(np.sum(linear[:, high_band] ** 2, axis=1))
    return np.array([low_db, high_db]).T
def main(inputFile = '../../sounds/piano.wav', window = 'hamming', M = 1024, N = 1024, H = 512):
    """
    Analysis/synthesis of a sound using the STFT.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (choice of rectangular, hanning, hamming,
        blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    H: hop size (at least 1/2 of analysis window size to have good overlap-add)

    The resynthesized sound is written under 'output_sounds/'.
    Returns x, fs, mX, pX, y: the input samples, the sampling rate, the
    magnitude and phase spectrograms and the output of the inverse stft.
    """
    fs, x = UF.wavread(inputFile)
    analysis_window = get_window(window, M)

    # Forward and inverse STFT.
    mX, pX = STFT.stftAnal(x, fs, analysis_window, N, H)
    y = STFT.stftSynth(mX, pX, M, H)

    # Output name: input file name minus its last four characters, plus a suffix.
    stem = os.path.basename(inputFile)[:-4]
    outputFile = 'output_sounds/' + stem + '_stft.wav'
    UF.wavwrite(y, fs, outputFile)

    return x, fs, mX, pX, y
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames, containing
        the energy envelope of the signal in decibels (dB):
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)

    # dB magnitudes -> linear amplitude
    linear = pow(10, mX / 20.)

    # Bin frequencies use the sampling rate read from the file rather than a
    # fixed 44100; float division keeps them exact.
    bin_freqs = np.arange(linear.shape[1]) * float(fs) / N
    low_band = (bin_freqs > 0) & (bin_freqs < 3000)
    high_band = (bin_freqs > 3000) & (bin_freqs < 10000)

    band_energy = np.zeros((len(linear), 2))
    band_energy[:, 0] = np.sum(linear[:, low_band] ** 2, axis=1)
    band_energy[:, 1] = np.sum(linear[:, high_band] ** 2, axis=1)
    return 10.0 * np.log10(band_energy)
def computeODF(inputFile, window, M, N, H):
    """
    Compute an onset detection function (ODF) in two frequency bands.

    The ODF of a band is the frame-to-frame increase of its energy envelope in
    dB; decreases are set to zero. The first frame has no predecessor, so its
    ODF value is 0.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        A numpy array with two columns:
        ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
        ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
    """
    windowing = get_window(window, M)
    (fs, x) = UF.wavread(inputFile)
    mX, pX = stft.stftAnal(x, fs, windowing, N, H)

    # dB magnitudes -> linear amplitude
    linear = 10 ** (mX / 20)

    # Boolean band masks avoid float slice bounds (np.floor/np.ceil return
    # floats, which NumPy does not accept as indexes).
    bin_freqs = np.arange(mX.shape[1]) * float(fs) / N
    bands = (
        (bin_freqs > 0) & (bin_freqs < 3000.0),
        (bin_freqs > 3000.0) & (bin_freqs < 10000.0),
    )

    ODF = np.zeros((mX.shape[0], 2))
    for column, band in enumerate(bands):
        env_db = 10 * np.log10(np.sum(np.square(linear[:, band]), axis=1))
        # Difference against the previous frame, keeping only increases.
        rise = env_db[1:] - env_db[:-1]
        rise[rise <= 0.0] = 0.0
        ODF[1:, column] = rise
    return ODF
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the energy envelope of a sound in two bands and plot it.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames:
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)

    Besides returning the envelope, the function prints the band bin indexes
    and shows a figure with the spectrogram and both envelopes.
    """
    # The fftbins flag of get_window is set only when M is even.
    w = get_window(window, M, M % 2 == 0)
    fs, x = UF.wavread(inputFile)
    mX, pX = stft.stftAnal(x, w, N, H)

    # Last bin index of each band.
    last_low = int(3000 * N / fs)
    last_high = int(10000 * N / fs)
    print("low freq ", last_low)
    print("high freq ", last_high)

    linear = 10 ** (mX / 20)

    def energy_db(band):
        # keepdims leaves a Kx1 column so the two bands can be joined side by side
        return 10 * np.log10(np.sum(band ** 2, axis=1, keepdims=True))

    low_band = linear[:, 1:last_low + 1]              # bin 0 is excluded
    low_env = energy_db(low_band)
    high_band = linear[:, last_low + 1:last_high + 1]
    high_env = energy_db(high_band)
    print(high_band.shape)

    # Top: spectrogram. Bottom: the two envelopes over time.
    frame_times = H * np.arange(int(mX.shape[0])) / float(fs)
    bin_freqs = np.arange(N / 2 + 1) * float(fs) / N

    plt.figure(1, figsize=(9.5, 6))
    plt.subplot(211)
    plt.pcolormesh(frame_times, bin_freqs, np.transpose(mX))
    plt.title('mX, M={}, N={}, H={}'.format(M, N, H))
    plt.autoscale(tight=True)

    plt.subplot(212)
    plt.plot(frame_times, low_env, label="low freq")
    plt.plot(frame_times, high_env, label="high freq")
    plt.tight_layout()
    plt.legend()
    plt.show()

    return np.concatenate([low_env, high_env], axis=1)
def chirpTracker(inputFile='../../sounds/chirp-150-190-linear.wav'):
    """
    Track the components of a chirp sound with the sinusoidal model, print the
    mean estimation error and plot the estimated against the true tracks.

    Input:
        inputFile (string) = wav file including the path
    Output:
        M (int) = Window length
        H (int) = hop size in samples
        tStamps (numpy array) = A Kx1 numpy array of time stamps at which the
            frequency components were estimated
        fTrackEst (numpy array) = A Kx2 numpy array of estimated frequency
            values, one row per time frame, one column per component
        fTrackTrue (numpy array) = A Kx2 numpy array of true frequency values,
            one row per time frame, one column per component
        K is the number of frames
    """
    # Analysis parameters: Modify values of the parameters marked XX
    M = 3300                                    # Window size in samples

    ### Go through the code below and understand it, do not modify anything ###
    H = 128                                     # Hop size in samples
    N = int(pow(2, np.ceil(np.log2(M))))        # FFT Size, power of 2 larger than M
    t = -80.0                                   # threshold
    window = 'blackman'                         # Window type
    maxnSines = 2                               # Maximum number of sinusoids at any time frame
    minSineDur = 0.0                            # minimum duration set to zero to not do tracking
    freqDevOffset = 30                          # minimum frequency deviation at 0Hz
    freqDevSlope = 0.001                        # slope increase of minimum frequency deviation

    fs, x = UF.wavread(inputFile)               # read input sound
    w = get_window(window, M)                   # Compute analysis window
    tStamps = genTimeStamps(x.size, M, fs, H)   # Generate the tStamps to return

    # analyze the sound with the sinusoidal model
    fTrackEst, mTrackEst, pTrackEst = SM.sineModelAnal(
        x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)
    fTrackTrue = genTrueFreqTracks(tStamps)     # Generate the true frequency tracks

    # Compute mean estimation error. 20 frames at the beginning and end are
    # not used to compute the error.
    tailF = 20
    meanErr = np.mean(np.abs(fTrackTrue[tailF:-tailF, :] - fTrackEst[tailF:-tailF, :]), axis=0)
    # print() with one argument produces the same text on Python 2 and 3.
    print("Mean estimation error = " + str(meanErr) + ' Hz')

    # Plot the estimated and true frequency tracks
    mX, pX = stft.stftAnal(x, w, N, H)
    maxplotfreq = 1500.0
    binFreq = fs * np.arange(N * maxplotfreq / fs) / N
    # Slice bounds must be integers; NumPy rejects a float bound.
    lastBin = int(N * maxplotfreq / fs + 1)
    plt.pcolormesh(tStamps, binFreq, np.transpose(mX[:, :lastBin]), cmap='hot_r')
    plt.plot(tStamps, fTrackTrue, 'o-', color='c', linewidth=3.0)
    plt.plot(tStamps, fTrackEst, color='y', linewidth=2.0)
    plt.legend(('True f1', 'True f2', 'Estimated f1', 'Estimated f2'))
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.autoscale(tight=True)
    plt.show()
    return M, H, tStamps, fTrackEst, fTrackTrue     # Output returned
def computeSNR(inputFile, window, M, N, H):
    """
    Calculate the SNR of an STFT analysis followed by synthesis.

    SNR (signal to noise ratio) = 10log10(Energy of signal / Energy of noise)

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        A python tuple (SNR1, SNR2) of floats. The noise of SNR1 spans the whole
        signal; the noise of SNR2 leaves out w.size samples at each end.
    """
    w = get_window(window, M)
    (fs, x) = UF.wavread(inputFile)

    # Analysis and synthesis; the output is cropped to the input length.
    mX, pX = stft.stftAnal(x, w, N, H)
    y = stft.stftSynth(mX, pX, M, H)[:x.size]

    inner = slice(w.size, -w.size)
    noises = (x - y, x[inner] - y[inner])

    # Each noise signal is analysed with the same STFT settings as the input.
    noise_spectra = [stft.stftAnal(noise, w, N, H)[0] for noise in noises]

    # energy_computation is defined elsewhere in the file; it receives the
    # magnitude spectrograms returned by stftAnal.
    energyInput = energy_computation(mX)
    energyNoise1, energyNoise2 = [energy_computation(m) for m in noise_spectra]

    SNR1 = 10 * np.log10(energyInput / energyNoise1)
    SNR2 = 10 * np.log10(energyInput / energyNoise2)
    return SNR1, SNR2
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the energy envelope of a sound in two bands and plot it.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames:
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)

    A figure with the spectrogram and both envelopes is shown as a side effect.
    """
    def energy(mag):
        # dB magnitudes -> linear -> summed squares per frame -> dB
        return 10 * np.log10(np.sum((10 ** (mag / 20)) ** 2, axis=1))

    (fs, x) = UF.wavread(inputFile)

    # Exclusive upper bin index of each band.
    border_bin = int(np.ceil(float(3000) * N / fs))
    max_bin = int(np.ceil(float(10000) * N / fs))

    w = get_window(window, M)
    mX, pX = STFT.stftAnal(x, fs, w, N, H)

    # Bin 0 is excluded from the low band.
    e_low = energy(mX[:, 1:border_bin])
    e_high = energy(mX[:, border_bin:max_bin])
    envs = np.transpose(np.array([e_low, e_high]))

    # draw graph: spectrogram on top, envelopes below
    numFrames = mX.shape[0]
    frmTime = H * np.arange(numFrames) / float(fs)
    binFreq = np.arange(mX.shape[1]) * float(fs) / N

    plt.figure(1, figsize=(9.5, 6))
    plt.subplot(211)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX))
    plt.title('mX ({0}), M={1}, N={2}, H={3}'.format(inputFile, M, N, H))
    plt.autoscale(tight=True)

    plt.subplot(212)
    # The low-band curve was labelled "row"; a legend makes the labels visible.
    plt.plot(frmTime, e_low, color="blue", label="low")
    plt.plot(frmTime, e_high, color="red", label="high")
    plt.title('Energy of Envelopes')
    plt.legend()
    plt.autoscale(tight=True)
    plt.tight_layout()
    plt.show()
    return envs
def mainlobeTracker(inputFile='../../sounds/sines-440-602-hRange.wav'):
    """
    Track two sinusoids with the sinusoidal model, print the mean estimation
    error and plot the estimated against the true frequency tracks.

    Input:
        inputFile (string): wav file including the path
    Output:
        window (string): The window type used for analysis
        t (float) = peak picking threshold (negative dB)
        tStamps (numpy array) = A Kx1 numpy array of time stamps at which the
            frequency components were estimated
        fTrackEst = A Kx2 numpy array of estimated frequency values, one row
            per time frame, one column per component
        fTrackTrue = A Kx2 numpy array of true frequency values, one row per
            time frame, one column per component
    """
    # Analysis parameters: Modify values of the parameters marked XX
    window = 'blackman'                         # Window type
    t = -80                                     # threshold (negative dB)

    ### Go through the code below and understand it, do not modify anything ###
    M = 2047                                    # Window size
    N = 4096                                    # FFT Size
    H = 128                                     # Hop size in samples
    maxnSines = 2
    minSineDur = 0.02
    freqDevOffset = 10
    freqDevSlope = 0.001

    # read input sound
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)                   # Compute analysis window
    tStamps = genTimeStamps(x.size, M, fs, H)   # Generate the tStamps to return

    # analyze the sound with the sinusoidal model
    fTrackEst, mTrackEst, pTrackEst = SM.sineModelAnal(
        x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)
    fTrackTrue = genTrueFreqTracks(tStamps)     # Generate the true frequency tracks

    # Compute mean estimation error. 20 frames at the beginning and end are
    # not used to compute the error.
    tailF = 20
    meanErr = np.mean(np.abs(fTrackTrue[tailF:-tailF, :] - fTrackEst[tailF:-tailF, :]), axis=0)
    # print() with one argument produces the same text on Python 2 and 3.
    print("Mean estimation error = " + str(meanErr) + ' Hz')

    # Plot the estimated and true frequency tracks
    mX, pX = stft.stftAnal(x, w, N, H)
    maxplotfreq = 900.0
    binFreq = fs * np.arange(N * maxplotfreq / fs) / N
    # Slice bounds must be integers; NumPy rejects a float bound.
    lastBin = int(N * maxplotfreq / fs + 1)
    plt.pcolormesh(tStamps, binFreq, np.transpose(mX[:, :lastBin]), cmap='hot_r')
    plt.plot(tStamps, fTrackTrue, 'o-', color='c', linewidth=3.0)
    plt.plot(tStamps, fTrackEst, color='y', linewidth=2.0)
    plt.legend(('True f1', 'True f2', 'Estimated f1', 'Estimated f2'))
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.autoscale(tight=True)
    plt.show()
    return window, float(t), tStamps, fTrackEst, fTrackTrue     # Output returned
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the two-band energy envelope of a sound and display it.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        A numpy array engEnv with shape Kx2, K = number of frames:
        engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)

    As a side effect the spectrogram and both envelopes are plotted and shown.
    """
    (fs, x) = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = STFT.stftAnal(x, fs, w, N, H)

    # Bands are half-open bin ranges: [1, border_bin) and [border_bin, max_bin).
    border_bin = int(np.ceil(float(3000) * N / fs))
    max_bin = int(np.ceil(float(10000) * N / fs))

    def band_energy_db(first, stop):
        # dB magnitudes -> linear amplitude -> energy per frame -> dB
        linear = 10 ** (mX[:, first:stop] / 20)
        return 10 * np.log10(np.sum(linear ** 2, axis=1))

    e_low = band_energy_db(1, border_bin)
    e_high = band_energy_db(border_bin, max_bin)
    envs = np.transpose(np.array([e_low, e_high]))

    # Time and frequency axes for the plots.
    frmTime = H * np.arange(mX.shape[0]) / float(fs)
    binFreq = np.arange(mX.shape[1]) * float(fs) / N

    plt.figure(1, figsize=(9.5, 6))

    plt.subplot(211)
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX))
    plt.title('mX ({0}), M={1}, N={2}, H={3}'.format(inputFile, M, N, H))
    plt.autoscale(tight=True)

    plt.subplot(212)
    # The low-band label read "row"; the legend call makes both labels visible.
    plt.plot(frmTime, e_low, color="blue", label="low")
    plt.plot(frmTime, e_high, color="red", label="high")
    plt.title('Energy of Envelopes')
    plt.legend()
    plt.autoscale(tight=True)

    plt.tight_layout()
    plt.show()
    return envs
def main(inputFile='../../sounds/bendir.wav', window='hamming', M=2001, N=2048, t=-80,
         minSineDur=0.02, maxnSines=150, freqDevOffset=10, freqDevSlope=0.001):
    """
    Analyse a sound with the sinusoidal plus residual model and write the results.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks
    minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation

    Writes three wav files under 'output_sounds/' (sinusoids, residual, sum)
    and returns (x, fs, mXr, tfreq, y).
    """
    Ns = 512   # size of fft used in synthesis
    H = 128    # hop size (has to be 1/4 of Ns)

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # sinusoidal plus residual analysis, then the spectrogram of the residual
    tfreq, tmag, tphase, xr = SPR.sprModelAnal(
        x, fs, w, N, H, t, minSineDur, maxnSines, freqDevOffset, freqDevSlope)
    mXr, pXr = STFT.stftAnal(xr, fs, w, N, H)

    # sum sinusoids and residual
    y, ys = SPR.sprModelSynth(tfreq, tmag, tphase, xr, Ns, H, fs)

    # output names share the input's base name with its last 4 characters removed
    stem = 'output_sounds/' + os.path.basename(inputFile)[:-4]
    outputs = (
        (ys, '_sprModel_sines.wav'),
        (xr, '_sprModel_residual.wav'),
        (y, '_sprModel.wav'),
    )
    for samples, suffix in outputs:
        UF.wavwrite(samples, fs, stem + suffix)

    return x, fs, mXr, tfreq, y
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute, plot and return the energy envelopes of a sound in two bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames (dB scale)
        engEnv[:,0]: energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: energy envelope in band 3000 < f < 10000 Hz (in dB)
    Side effects:
        saves the figure to 'engEnv.png' and shows it.
    """
    def band_energy_db(band):
        # dB magnitudes -> linear -> squared -> summed per frame -> dB
        return 10 * np.log10(np.sum((10 ** (band / 20)) ** 2, axis=1))

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)

    # exclusive upper bin of each band
    first_high_bin = int(np.ceil(float(3000) * N / fs))
    stop_bin = int(np.ceil(float(10000) * N / fs))

    # bin 0 is excluded from the low band
    eDB_low = band_energy_db(mX[:, 1:first_high_bin])
    eDB_high = band_energy_db(mX[:, first_high_bin:stop_bin])
    engEnv = np.vstack((eDB_low, eDB_high)).T

    frame_count = mX.shape[0]
    frmTime = H * np.arange(frame_count) / float(fs)
    binFreq = np.arange(N / 2 + 1) * float(fs) / N

    # top panel: spectrogram
    plt.subplot(211)
    plt.pcolormesh(frmTime, binFreq, mX.T)
    plt.title('Spectrogram')
    plt.ylabel('frequency (Hz)')
    plt.autoscale(tight=True)

    # bottom panel: both envelopes
    plt.subplot(212)
    plt.plot(frmTime, eDB_low, color="blue", label="low")
    plt.plot(frmTime, eDB_high, color="green", label="high")
    plt.title('Energy Envelopes')
    plt.ylabel('Energy (dB)')
    plt.autoscale(tight=True)

    plt.tight_layout()
    plt.savefig('engEnv.png')
    plt.show()
    return engEnv
def computeEngEnv(inputFile, window, M, N, H):
    """
    Return the per-frame energy envelopes of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames (dB scale)
        engEnv[:,0]: energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    fs, signal = UF.wavread(inputFile)
    w = get_window(window, M)
    magnitude_frames, _ = stft.stftAnal(signal, fs, w, N, H)

    # band edges expressed as (fractional) bin indices
    k_3000 = 3000 * float(N) / fs
    k_10000 = 10000 * float(N) / fs

    rows = []
    for frame in magnitude_frames:
        totals = [0, 0]  # [low band, high band], linear energy
        # bin 0 is skipped; a bin exactly on an edge belongs to neither band
        for k in range(1, len(frame)):
            if k < k_3000:
                band = 0
            elif k_3000 < k < k_10000:
                band = 1
            else:
                continue
            # magnitudes are treated as dB: convert to linear, then square
            totals[band] += (10 ** (frame[k] / 20)) ** 2
        rows.append([10 * np.log10(totals[0]), 10 * np.log10(totals[1])])

    return np.array(rows)
def computeAndPlotF0(inputFile = '../../sounds/piano.wav'):
    """
    Estimate the fundamental frequency (f0) of a sound and plot it over the spectrogram.

    Input:
        inputFile (string): wav file including the path
    Output:
        None; the f0 contour is drawn on top of the spectrogram and shown.
    """
    # analysis settings
    window = 'hamming'
    M = N = 2048
    H = 256
    f0et = 5.0
    t = -80
    minf0, maxf0 = 100, 300

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    f0 = f0Detection(x, fs, w, N, H, t, minf0, maxf0, f0et)

    # plot settings
    maxplotfreq = 500.0   # highest frequency shown
    fontSize = 16
    plot = 1              # 1: show the figure, 2: save it to disk

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # spectrogram computed with the same parameters as the analysis,
    # cropped to the bins up to maxplotfreq and transposed for plotting
    mX, pX = stft.stftAnal(x, fs, w, N, H)
    lastBin = int(N * (maxplotfreq / fs)) + 1
    mX = np.transpose(mX[:, :lastBin])

    timeStamps = np.arange(mX.shape[1]) * H / float(fs)
    binFreqs = np.arange(mX.shape[0]) * fs / float(N)

    plt.pcolormesh(timeStamps, binFreqs, mX)
    plt.plot(timeStamps, f0, color = 'k', linewidth=1.5)
    plt.autoscale(tight=True)
    plt.ylabel('Frequency (Hz)', fontsize = fontSize)
    plt.xlabel('Time (s)', fontsize = fontSize)
    plt.legend(('f0',))

    # aspect ratio: x extent over twice the y extent
    xMin, xMax = ax.get_xlim()[0], ax.get_xlim()[1]
    yMin, yMax = ax.get_ylim()[0], ax.get_ylim()[1]
    ax.set_aspect((xMax - xMin) / (2.0 * (yMax - yMin)))

    if plot == 1:
        plt.autoscale(tight=True)
        plt.show()
    elif plot == 2:
        fig.tight_layout()
        fig.savefig('f0_over_Spectrogram.png', dpi=150, bbox_inches='tight')
def computeAndPlotF0(inputFile = '../sms-tools/sounds/piano.wav'):
    """
    Estimate the fundamental frequency (f0) of a sound and plot it over the spectrogram.

    Input:
        inputFile (string): wav file including the path
    Output:
        None; the f0 contour is drawn on top of the spectrogram and shown.
    """
    # analysis settings
    window = 'hamming'
    M = N = 2048
    H = 256
    f0et = 5.0
    t = -80
    minf0, maxf0 = 100, 300

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    f0 = f0Detection(x, fs, w, N, H, t, minf0, maxf0, f0et)

    # plot settings
    maxplotfreq = 500.0   # highest frequency shown
    fontSize = 16
    plot = 1              # 1: show the figure, 2: save it to disk

    fig = plt.figure()
    ax = fig.add_subplot(111)

    # spectrogram computed with the same window, FFT size and hop as the
    # analysis, cropped to the bins up to maxplotfreq and transposed
    mX, pX = stft.stftAnal(x, w, N, H)
    lastBin = int(N * (maxplotfreq / fs)) + 1
    mX = np.transpose(mX[:, :lastBin])

    timeStamps = np.arange(mX.shape[1]) * H / float(fs)
    binFreqs = np.arange(mX.shape[0]) * fs / float(N)

    plt.pcolormesh(timeStamps, binFreqs, mX)
    plt.plot(timeStamps, f0, color = 'k', linewidth=1.5)
    plt.autoscale(tight=True)
    plt.ylabel('Frequency (Hz)', fontsize = fontSize)
    plt.xlabel('Time (s)', fontsize = fontSize)
    plt.legend(('f0',))

    # aspect ratio: x extent over twice the y extent
    xMin, xMax = ax.get_xlim()[0], ax.get_xlim()[1]
    yMin, yMax = ax.get_ylim()[0], ax.get_ylim()[1]
    ax.set_aspect((xMax - xMin) / (2.0 * (yMax - yMin)))

    if plot == 1:
        plt.autoscale(tight=True)
        plt.show()
    elif plot == 2:
        fig.tight_layout()
        fig.savefig('f0_over_Spectrogram.png', dpi=150, bbox_inches='tight')
def computeEngEnv(inputFile, window, M, N, H):
    """
    Return the per-frame energy envelopes of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file
        window (string): analysis window type
        M (integer): analysis window size
        N (integer): FFT size
        H (integer): hop size for the stft computation
    Output:
        numpy array with one row per frame and two columns:
        column 0 for the band 0 < f < 3000 Hz, column 1 for 3000 < f < 10000 Hz.
        The per-band values are whatever energy_computation returns
        (presumably dB energies per frame; confirm against that helper).
    """
    w = get_window(window, M)
    (fs, x) = UF.wavread(inputFile)
    mX, pX = stft.stftAnal(x, w, N, H)

    # Centre frequency of every bin actually present in the spectrum.
    freq = np.arange(mX.shape[1]) * float(fs) / N

    # Strict inequalities: bin 0 and any bin exactly on a band edge are left
    # out. Index arrays are materialised once, so this works on Python 3,
    # where a zip() iterator cannot be scanned by both min() and max().
    low_bins = np.where((freq > 0) & (freq < 3000))[0]
    high_bins = np.where((freq > 3000) & (freq < 10000))[0]

    # One row per frame, one column per bin of the band.
    low = mX[:, low_bins]
    high = mX[:, high_bins]

    low_energy = energy_computation(low)
    high_energy = energy_computation(high)

    # Stack the two envelopes as columns.
    engEnvs = np.append([low_energy], [high_energy], axis=0)
    return np.transpose(engEnvs)
def computeEngEnv(inputFile, window, M, N, H):
    """
    Return the per-frame energy envelopes of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames (dB scale)
        engEnv[:,0]: energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, _ = stft.stftAnal(x, w, N, H)

    # magnitudes are treated as dB; convert to linear scale
    mX = 10 ** (mX / 20)

    # Band edges as fractional bin indices (float maths, so no truncation
    # when fs and N are both integers).
    edge_low = 3000.0 * N / fs
    edge_high = 10000.0 * N / fs

    # Exclusive stop = first bin at or above the edge, so the last bin below
    # the edge is kept. The high band starts at the first bin strictly above
    # 3000 Hz. Bin 0 is excluded from the low band.
    low_stop = int(np.ceil(edge_low))
    high_start = int(np.floor(edge_low)) + 1
    high_stop = int(np.ceil(edge_high))

    engEnv = np.column_stack((
        np.sum(np.square(mX[:, 1:low_stop]), axis=1),
        np.sum(np.square(mX[:, high_start:high_stop]), axis=1),
    ))

    # convert to dB
    return 10 * np.log10(engEnv)
def find_chirp_end_ms_odf(input_path):
    """
    Locate the frame with the highest energy between 1000 Hz and 1002 Hz.

    The position of that frame, as a fraction of all frames, is scaled by the
    audio length and returned in milliseconds.

    window, M, N and H are not defined in this function; they must be
    available from an enclosing scope.

    Input:
        input_path (string): wav file including the path
    Output:
        estimated end of the chirp, in milliseconds
    """
    (fs, x) = UF.wavread(input_path)
    w = get_window(window, M)
    xmX, xpX = stft.stftAnal(x, w, N, H)

    # frequency of each bin on the positive side of the spectrum
    binFreq = np.arange(N/2+1)*float(fs)/N

    # first bin above 1000 Hz and last bin below 1002 Hz
    # NOTE(review): when no bin falls inside (1000, 1002) Hz the slice below
    # is empty and every frame's energy is -inf; confirm the intended band.
    firstBin = np.where(binFreq > 1000)[0][0]
    lastBin = np.where(binFreq < 1002)[0][-1]

    # energy (in dB) of the band, frame by frame
    engEnv = np.zeros([int(xmX[:,0].size)])
    for frameIdx, frame in enumerate(xmX):
        linear = 10 ** (frame[firstBin:lastBin+1] / 20.)
        engEnv[frameIdx] = 10 * np.log10(np.sum(linear ** 2.))

    # relative position of the loudest frame, mapped onto the audio length
    peakFrame = np.argmax(engEnv)
    relativePos = peakFrame * 1.0 / engEnv.size
    audioLength = x.size / fs
    return audioLength * relativePos * 1000
def computeEngEnv(inputFile, window, M, N, H):
    """
    Return the per-frame energy envelopes of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames (dB scale)
        engEnv[:,0]: energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    (fs, x) = UF.wavread(inputFile)

    # window for the STFT (third argument False, as in the original call)
    w = get_window(window, M, False)

    xmX, _ = stft.stftAnal(x, w, N, H)

    # magnitudes are treated as dB; convert to linear scale
    xmX = 10 ** (xmX / 20)

    # Band edges as fractional bin indices; slice bounds must be integers,
    # so they are rounded explicitly instead of being left as floats.
    edge_low = 3000.0 * N / fs
    edge_high = 10000.0 * N / fs
    low_stop = int(np.ceil(edge_low))           # first bin at/above 3000 Hz
    high_start = int(np.floor(edge_low)) + 1    # first bin strictly above 3000 Hz
    high_stop = int(np.ceil(edge_high))         # first bin at/above 10000 Hz

    # Sum the squared magnitudes of each band for all frames at once; the
    # lower boundary of each band (0 Hz, 3000 Hz) is excluded.
    engEnvLow = 10 * np.log10(np.sum(xmX[:, 1:low_stop] ** 2, axis=1))
    engEnvHigh = 10 * np.log10(np.sum(xmX[:, high_start:high_stop] ** 2, axis=1))

    return np.transpose(np.array([engEnvLow, engEnvHigh]))
def mainlobeTracker(inputFile = '../sms-tools/sounds/sines-440-602-hRange.wav'):
    """
    Track two sinusoids with the sinusoidal model and compare with the true tracks.

    Input:
        inputFile (string): wav file including the path
    Output:
        window (string): the window type used for analysis
        t (float): peak picking threshold (negative dB)
        tStamps (numpy array): Kx1 array of time stamps at which the frequency
            components were estimated
        fTrackEst (numpy array): Kx2 array of estimated frequency values, one
            row per time frame, one column per component
        fTrackTrue (numpy array): Kx2 array of true frequency values, one row
            per time frame, one column per component
    Side effects:
        prints the mean estimation error and draws (but does not show) a plot.
    """
    # Analysis parameters
    window = 'blackman'  # Window type
    t = -67              # threshold (negative dB)
    # window = blackman && t >= -67: Mean estimation error = [ 0.01060268 1.58192485] Hz
    # window = blackman harris && t >= -61: Mean estimation error = [ 0.01060268 1.58192485] Hz
    # others failed

    M = 2047             # Window size
    N = 4096             # FFT Size
    H = 128              # Hop size in samples
    maxnSines = 2
    minSineDur = 0.02
    freqDevOffset = 10
    freqDevSlope = 0.001

    # read input sound
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)                  # Compute analysis window
    tStamps = genTimeStamps(x.size, M, fs, H)  # Generate the tStamps to return

    # analyze the sound with the sinusoidal model
    fTrackEst, mTrackEst, pTrackEst = SM.sineModelAnal(
        x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)
    fTrackTrue = genTrueFreqTracks(tStamps)    # Generate the true frequency tracks

    # Mean estimation error; tailF frames at the beginning and at the end are
    # not used to compute it.
    tailF = 20
    meanErr = np.mean(np.abs(fTrackTrue[tailF:-tailF,:] - fTrackEst[tailF:-tailF,:]), axis=0)
    print("Mean estimation error = " + str(meanErr) + ' Hz')

    # Plot the estimated and true frequency tracks
    mX, pX = stft.stftAnal(x, w, N, H)
    maxplotfreq = 900.0
    binFreq = fs * np.arange(N * maxplotfreq / fs) / N
    # built-in int: the np.int alias is no longer available in current NumPy
    lastBin = int(N * maxplotfreq / fs + 1)
    plt.pcolormesh(tStamps, binFreq, np.transpose(mX[:, :lastBin]), cmap='hot_r')
    plt.plot(tStamps, fTrackTrue, 'o-', color = 'c', linewidth=3.0)
    plt.plot(tStamps, fTrackEst, color = 'y', linewidth=2.0)
    plt.legend(('True f1', 'True f2', 'Estimated f1', 'Estimated f2'))
    plt.title('frequency detection: Window = ' + window + '& t = ' + str(t))
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.autoscale(tight=True)
    return window, float(t), tStamps, fTrackEst, fTrackTrue
def computeODF(inputFile, window, M, N, H):
    """
    Compute an onset detection function (ODF) in two frequency bands.

    The ODF is the frame-to-frame increase of the band energy (as returned by
    by_frame_energy), half-wave rectified; the first frame is always 0.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        numpy array with one row per frame and two columns
        ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
        ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    (mX, pX) = stft.stftAnal(x, fs, w, N, H)
    numFrames = int(mX[:,0].size)

    cutoff1 = 3000
    cutoff2 = 10000
    # np.ceil returns a float; slice bounds must be integers.
    cutoff_bucket1 = int(np.ceil(float(cutoff1) * N / fs))
    cutoff_bucket2 = int(np.ceil(float(cutoff2) * N / fs))

    # bin 0 is excluded from the low band
    low_band = mX[:, 1:cutoff_bucket1]
    high_band = mX[:, cutoff_bucket1:cutoff_bucket2]

    E = np.zeros((numFrames, 2))
    E[:,0] = by_frame_energy(low_band)
    E[:,1] = by_frame_energy(high_band)

    # energy difference with the previous frame; row 0 stays at zero
    O = np.zeros((numFrames, 2))
    O[1:,:] = E[1:,:] - E[:-1,:]

    # half wave rectification
    O[O<=0] = 0
    return O
def computeEngEnv(inputFile, window, M, N, H):
    """
    Return the per-frame energy envelopes of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames (dB scale)
        engEnv[:,0]: energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)

    # magnitudes are treated as dB; convert to linear amplitude
    linear = np.power(10, mX / 20.0)

    # frequency of each of the N FFT bins
    bin_freqs = np.arange(N) * fs / float(N)

    # bin indices strictly inside each band
    low_idx = np.flatnonzero((bin_freqs > 0) & (bin_freqs < 3000))
    high_idx = np.flatnonzero((bin_freqs > 3000) & (bin_freqs < 10000))

    # one row per frame, one column per band
    engEnv = np.zeros((np.shape(mX)[0], 2))
    for column, idx in enumerate((low_idx, high_idx)):
        engEnv[:, column] = 10 * np.log10(np.sum(linear[:, idx] ** 2, axis=1))

    return engEnv
def create_plot(self):
    """
    Save three plots of self.audio under 'plots/' and publish the spectrogram.

    Writes the time-domain waveform, the magnitude spectrogram and a line
    plot of the magnitude spectra. self.audio is replaced by its float32
    version divided by norm_fact for its dtype, and the magnitude
    spectrogram is stored in the module-level name mx.
    """
    global mx

    rate = float(self.rates)

    # time-domain plot of the audio as currently stored
    timeAxis = np.arange(self.audio.size) / rate
    plt_len = timeAxis[-1] * 0.5   # figure width: half the last time stamp
    plt.plot(timeAxis, self.audio)
    plt.axis([0, self.audio.size / rate, min(self.audio), max(self.audio)])
    plt.savefig("plots/time-domain.png")
    plt.close()

    N = 8192             # FFT size
    M = 8192             # analysis window size
    H = int(0.75 * M)    # passed to stftAnal as the hop size
    w = get_window("hamming", M)

    # scale the samples by the factor registered for their current dtype
    self.audio = np.float32(self.audio) / norm_fact[self.audio.dtype.name]

    maxplotfreq = self.rates / 8.82
    mX, pX = stft.stftAnal(self.audio, self.rates, w, N, H)

    frameCount = int(mX[:, 0].size)
    frmTime = H * np.arange(frameCount) / rate
    binFreq = self.rates * np.arange(N * maxplotfreq / self.rates) / N
    lastBin = int(N * maxplotfreq / self.rates + 1)

    # spectrogram without axes or margins
    plt.figure(figsize=(plt_len, 1))
    plt.pcolormesh(frmTime, binFreq, np.transpose(mX[:, :lastBin]))
    plt.axis("off")
    plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
    plt.savefig("plots/magnitude spectogram.png", dpi=300)
    plt.close()

    # line plot of the magnitude spectra, same borderless layout
    plt.plot(mX)
    plt.axis("off")
    plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0)
    plt.savefig("plots/magnitude plot.png")
    plt.close()

    mx = mX
def computeEngEnv(inputFile, window, M, N, H):
    """
    Return the per-frame energy envelopes of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames (dB scale)
        engEnv[:,0]: energy envelope in band 0 < f < 3000 Hz (in dB)
        engEnv[:,1]: energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    lowf_end = 3000
    highf_init = 3000
    highf_end = 10000

    w = get_window(window, M)
    fs, x = UF.wavread(inputFile)

    # Band edges as fractional bin indices, rounded explicitly: slice bounds
    # must be integers, and the last bin below each upper edge must be kept.
    lowb_end = int(np.ceil(float(lowf_end) * N / fs))           # exclusive stop
    highb_init = int(np.floor(float(highf_init) * N / fs)) + 1  # first bin above 3000 Hz
    highb_end = int(np.ceil(float(highf_end) * N / fs))         # exclusive stop

    xmX, pmX = stft.stftAnal(x, fs, w, N, H)

    # magnitudes are treated as dB; convert to linear scale
    xmX_linear = 10 ** (xmX / 20)

    # bin 0 is excluded from the low band
    result_low = 10 * np.log10(np.sum(xmX_linear[:, 1:lowb_end] ** 2, axis=1))
    result_high = 10 * np.log10(np.sum(xmX_linear[:, highb_init:highb_end] ** 2, axis=1))

    # always Kx2, also when there is a single frame
    return np.column_stack((result_low, result_high))
def chirpTracker(inputFile='../sms-tools/sounds/chirp-150-190-linear.wav'):
    """
    Track two chirps with the sinusoidal model and compare with the true tracks.

    Input:
        inputFile (string) = wav file including the path
    Output:
        M (int) = Window length
        H (int) = hop size in samples
        tStamps (numpy array) = A Kx1 numpy array of time stamps at which the
            frequency components were estimated
        fTrackEst (numpy array) = A Kx2 numpy array of estimated frequency values,
            one row per time frame, one column per component
        fTrackTrue (numpy array) = A Kx2 numpy array of true frequency values,
            one row per time frame, one column per component
        K is the number of frames
    Side effects:
        prints the mean estimation error and shows a plot of both sets of tracks.
    """
    M = 3298                                 # Window size in samples
    H = 128                                  # Hop size in samples
    N = int(2 ** np.ceil(np.log2(M)))        # FFT size: smallest power of 2 >= M
    t = -80.0                                # threshold
    window = 'blackman'                      # Window type
    maxnSines = 2                            # Maximum number of sinusoids at any time frame
    minSineDur = 0.0                         # minimum duration set to zero to not do tracking
    freqDevOffset = 30                       # minimum frequency deviation at 0Hz
    freqDevSlope = 0.001                     # slope increase of minimum frequency deviation

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    tStamps = genTimeStamps(x.size, M, fs, H)

    # sinusoidal analysis and the reference tracks at the same time stamps
    fTrackEst, mTrackEst, pTrackEst = SM.sineModelAnal(
        x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)
    fTrackTrue = genTrueFreqTracks(tStamps)

    # mean absolute error per component, ignoring tailF frames at each end
    tailF = 20
    inner = slice(tailF, -tailF)
    absErr = np.abs(fTrackTrue[inner, :] - fTrackEst[inner, :])
    meanErr = np.mean(absErr, axis=0)
    print("Mean estimation error = " + str(meanErr) + ' Hz')

    # spectrogram up to maxplotfreq with both sets of tracks on top
    mX, pX = stft.stftAnal(x, w, N, H)
    maxplotfreq = 1500.0
    binFreq = fs * np.arange(N * maxplotfreq / fs) / N
    lastBin = int(N * maxplotfreq / fs + 1)
    plt.pcolormesh(tStamps, binFreq, np.transpose(mX[:, :lastBin]), cmap='hot_r')
    plt.plot(tStamps, fTrackTrue, 'o-', color='c', linewidth=3.0)
    plt.plot(tStamps, fTrackEst, color='y', linewidth=2.0)
    plt.legend(('True f1', 'True f2', 'Estimated f1', 'Estimated f2'))
    plt.title('True and estimated frequency, windowsize = ' + str(M))
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.autoscale(tight=True)
    plt.show()

    return M, H, tStamps, fTrackEst, fTrackTrue
def spec_calc(audio_inp, params):
    """
    Compute the STFT of an audio signal with a Hann window.

    Parameters
    ----------
    audio_inp : np.array
        Numpy array containing the audio signal, in the time domain
    params : dict
        Parameter dictionary; the keys read here are
            - W : Window size
            - N : FFT size
            - H : Hop size

    Returns
    -------
    xmX : np.array
        Magnitude output of stftAnal divided by 20 (undoing the factor of 20
        of a dB conversion, so presumably log10 of the linear magnitude;
        confirm against stftAnal)
    xpX : np.array
        Phase output of stftAnal, unchanged
    """
    W = params['W']
    N = params['N']
    H = params['H']

    w = windows.hann(W)

    # Compute the STFT
    xmX, xpX = stftAnal(x=audio_inp, w=w, N=N, H=H)

    # Remove the factor of 20 applied by the dB scaling of the magnitudes
    xmX = xmX / 20

    return xmX, xpX
def computeODF(inputFile, window, M, N, H):
    """
    Compute an onset detection function (ODF) in two frequency bands.

    The ODF is the frame-to-frame increase of the band energy in dB,
    half-wave rectified; the first frame is always 0.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        numpy array with one row per frame and two columns
        ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
        ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
    """
    # read the file named by the inputFile argument
    fs, s = wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(s, w, N, H)

    # magnitudes are treated as dB; energies must be summed on a linear scale
    linear = 10 ** (np.asarray(mX) / 20.0)

    # Band edges as fractional bin indices. Bin 0 and bins exactly on an
    # edge are excluded, matching the strict inequalities of the band limits.
    edge3000 = 3000.0 * N / fs
    edge10000 = 10000.0 * N / fs
    low = linear[:, 1:int(np.ceil(edge3000))]
    high = linear[:, int(np.floor(edge3000)) + 1:int(np.ceil(edge10000))]

    # per-frame band energy in dB, one column per band
    energy = 10 * np.log10(np.column_stack((
        np.sum(low ** 2, axis=1),
        np.sum(high ** 2, axis=1),
    )))

    # difference with the previous frame; row 0 stays at zero
    odf = np.zeros_like(energy)
    odf[1:] = np.diff(energy, axis=0)

    # half-wave rectification on every frame
    odf[odf < 0] = 0
    return odf
def mainlobeTracker(inputFile="../../sounds/sines-440-602-hRange.wav"):
    """
    Track two sinusoids with the sinusoidal model and compare with the true tracks.

    Input:
        inputFile (string): wav file including the path
    Output:
        window (string): the window type used for analysis
        t (float): peak picking threshold (negative dB)
        tStamps (numpy array): Kx1 array of time stamps at which the frequency
            components were estimated
        fTrackEst (numpy array): Kx2 array of estimated frequency values, one
            row per time frame, one column per component
        fTrackTrue (numpy array): Kx2 array of true frequency values, one row
            per time frame, one column per component
    Side effects:
        prints the mean estimation error and draws (but does not show) a plot.
    """
    # Analysis parameters
    window = "blackmanharris"  # Window type
    t = -93.0                  # threshold (negative dB)

    M = 2047                   # Window size
    N = 4096                   # FFT Size
    H = 128                    # Hop size in samples
    maxnSines = 2
    minSineDur = 0.02
    freqDevOffset = 10
    freqDevSlope = 0.001

    # read input sound
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)                  # Compute analysis window
    tStamps = genTimeStamps(x.size, M, fs, H)  # Generate the tStamps to return

    # analyze the sound with the sinusoidal model
    fTrackEst, mTrackEst, pTrackEst = SM.sineModelAnal(
        x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope
    )
    fTrackTrue = genTrueFreqTracks(tStamps)    # Generate the true frequency tracks

    # Mean estimation error; tailF frames at the beginning and at the end are
    # not used to compute it.
    tailF = 10
    meanErr = np.mean(np.abs(fTrackTrue[tailF:-tailF, :] - fTrackEst[tailF:-tailF, :]), axis=0)
    # call form of print: valid on Python 3 and, with a single argument,
    # prints the same text on Python 2
    print("Mean estimation error = " + str(meanErr) + " Hz")

    # Plot the estimated and true frequency tracks
    mX, pX = stft.stftAnal(x, fs, w, N, H)
    maxplotfreq = 900.0
    binFreq = fs * np.arange(N * maxplotfreq / fs) / N
    # slice bounds must be integers
    lastBin = int(N * maxplotfreq / fs + 1)
    plt.pcolormesh(tStamps, binFreq, np.transpose(mX[:, :lastBin]), cmap="hot_r")
    plt.plot(tStamps, fTrackTrue, "o-", color="c", linewidth=3.0)
    plt.plot(tStamps, fTrackEst, color="y", linewidth=2.0)
    plt.legend(("True f1", "True f2", "Estimated f1", "Estimated f2"))
    plt.xlabel("Time (s)")
    plt.ylabel("Frequency (Hz)")
    plt.autoscale(tight=True)
    return window, t, tStamps, fTrackEst, fTrackTrue
def main():
    """
    Compute the STFT of a fixed flute sample and draw its magnitude spectrogram.

    Returns the function's local variables as a dict (via locals()), so the
    names assigned below are also the keys of the returned mapping.
    """
    # analysis settings
    inputFile = "../../sounds/flute-A4.wav"
    window = "hamming"
    M = 801
    N = 1024
    H = 400
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = STFT.stftAnal(x, w, N, H)
    # transposed before plotting; the figure is drawn but not shown here
    plt.pcolormesh(np.transpose(mX))
    return locals()
def computeODF(inputFile, window, M, N, H):
    """
    Compute an onset detection function (ODF) in two frequency bands.

    The ODF is the frame-to-frame increase of the band energy in dB,
    half-wave rectified; the first frame is always 0.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        numpy array with one row per frame and two columns
        ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
        ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, pX = stft.stftAnal(x, w, N, H)

    # magnitudes are treated as dB; convert to linear scale
    mX = 10 ** (mX / 20.)
    num_frames = len(mX)

    # Bin frequencies from the file's own sampling rate, in float arithmetic
    # so that no bin is misplaced by integer truncation.
    bin_freqs = np.arange(mX.shape[1]) * float(fs) / N
    low = (bin_freqs > 0) & (bin_freqs < 3000)
    high = (bin_freqs > 3000) & (bin_freqs < 10000)

    # per-frame band energy in dB, one column per band
    band_energy = 10.0 * np.log10(np.column_stack((
        np.sum(mX[:, low] ** 2, axis=1),
        np.sum(mX[:, high] ** 2, axis=1),
    )))

    # difference with the previous frame; row 0 stays at zero
    odf = np.zeros((num_frames, 2))
    odf[1:] = np.diff(band_energy, axis=0)

    # half-wave rectification
    odf[odf < 0] = 0
    return odf
def main(inputFile='../../sounds/bendir.wav', window='hamming', M=2001, N=2048, t=-80,
         minSineDur=0.02, maxnSines=150, freqDevOffset=10, freqDevSlope=0.001):
    """
    Analyse a sound with the sinusoidal plus residual model and write the results.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks
    minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation

    Writes three wav files under 'output_sounds/' (sinusoids, residual, sum)
    and returns (x, fs, mXr, tfreq, y).
    """
    Ns = 512   # size of fft used in synthesis
    H = 128    # hop size (has to be 1/4 of Ns)

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # sinusoidal plus residual analysis
    analysis = SPR.sprModelAnal(x, fs, w, N, H, t, minSineDur, maxnSines,
                                freqDevOffset, freqDevSlope)
    tfreq, tmag, tphase, xr = analysis

    # spectrogram of the residual
    mXr, pXr = STFT.stftAnal(xr, fs, w, N, H)

    # sum sinusoids and residual
    y, ys = SPR.sprModelSynth(tfreq, tmag, tphase, xr, Ns, H, fs)

    # output names: input base name without its last 4 characters, plus a suffix
    prefix = 'output_sounds/' + os.path.basename(inputFile)[:-4] + '_sprModel'
    outputFileSines = prefix + '_sines.wav'
    outputFileResidual = prefix + '_residual.wav'
    outputFile = prefix + '.wav'

    # write sound files for sinusoidal, residual, and the sum
    UF.wavwrite(ys, fs, outputFileSines)
    UF.wavwrite(xr, fs, outputFileResidual)
    UF.wavwrite(y, fs, outputFile)

    return x, fs, mXr, tfreq, y
def main(inputFile='../../sounds/sax-phrase-short.wav', window='blackman', M=601, N=1024, t=-100,
         minSineDur=0.1, nH=100, minf0=350, maxf0=700, f0et=5, harmDevSlope=0.01):
    """
    Perform analysis/synthesis using the harmonic plus residual model and write
    the harmonic part, the residual, and their sum to output_sounds/.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size; N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks; minSineDur: minimum duration of sinusoidal tracks
    nH: maximum number of harmonics; minf0: minimum fundamental frequency in sound
    maxf0: maximum fundamental frequency in sound; f0et: maximum error accepted in f0 detection algorithm
    harmDevSlope: allowed deviation of harmonic tracks, higher harmonics have higher allowed deviation

    Returns the tuple (x, fs, mXr, hfreq, y): input samples, sampling rate,
    magnitude spectrogram of the residual, harmonic frequencies, and the output sound.
    """
    synth_fft_size = 512   # size of fft used in synthesis
    hop = 128              # hop size (has to be 1/4 of the synthesis fft size)

    fs, x = UF.wavread(inputFile)
    win = get_window(window, M)

    # harmonic analysis plus residual
    hfreq, hmag, hphase, residual = HPR.hprModelAnal(
        x, fs, win, N, hop, t, minSineDur, nH, minf0, maxf0, f0et, harmDevSlope)

    # spectrogram of the residual (only the magnitude is returned to the caller)
    mXr, _ = STFT.stftAnal(residual, fs, win, N, hop)

    # resynthesis: y is the full output, harmonics is the harmonic part alone
    y, harmonics = HPR.hprModelSynth(hfreq, hmag, hphase, residual, synth_fft_size, hop, fs)

    # the last four characters of the file name (the extension) are dropped
    prefix = 'output_sounds/' + os.path.basename(inputFile)[:-4]
    outputs = (
        (harmonics, prefix + '_hprModel_sines.wav'),
        (residual, prefix + '_hprModel_residual.wav'),
        (y, prefix + '_hprModel.wav'),
    )
    for samples, path in outputs:
        UF.wavwrite(samples, fs, path)

    return x, fs, mXr, hfreq, y
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with one row per STFT frame. Each row is whatever
        frameEnergies(frame, fs, N) returns; it is presumed to be the pair
            engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
            engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
        (verify against frameEnergies, which is defined elsewhere in the file).
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    xmX, _ = stft.stftAnal(x, fs, w, N, H)
    # Build a real list: on Python 3 map() is lazy and np.array(map(...)) would
    # wrap the iterator in a 0-d object array instead of a frames x 2 array.
    return np.array([frameEnergies(frame, fs, N) for frame in xmX])
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames returned by the
        STFT, containing the energy envelope of the signal in decibels (dB)
            engEnv[:,0]: bins 1 .. ceil(3000*N/fs)-1        (0 < f < 3000 Hz)
            engEnv[:,1]: bins ceil(3000*N/fs) .. ceil(10000*N/fs)-1
        A band with no bins yields -inf (log10 of zero energy).
    """
    w = get_window(window, M)
    fs, x = UF.wavread(inputFile)

    lowfreq = 3000.0
    highfreq = 10000.0
    bin_width = float(fs) / N
    k1 = int(math.ceil(lowfreq / bin_width))    # first bin at or above 3000 Hz
    k2 = int(math.ceil(highfreq / bin_width))   # first bin at or above 10000 Hz

    xmX, _ = stft.stftAnal(x, fs, w, N, H)

    # stftAnal output is treated as dB magnitudes; convert to linear power.
    power = np.square(np.power(10, xmX / 20.0))

    # Size the result from the spectrogram itself rather than ceil(x.size / H),
    # which need not match the number of frames the STFT actually produced.
    engEnv = np.zeros((xmX.shape[0], 2))
    # Bin 0 (DC) is never counted; slices clip safely at the end of the spectrum.
    engEnv[:, 0] = 10 * np.log10(np.sum(power[:, 1:k1], axis=1))
    engEnv[:, 1] = 10 * np.log10(np.sum(power[:, max(k1, 1):k2], axis=1))
    return engEnv
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array with shape Kx2, K = number of frames. Each column is what
        by_frame_energy (defined elsewhere in the file) returns for the band;
        it is presumed to be the energy in dB per frame - verify against it.
            [:,0]: band 0 < f < 3000 Hz (bins 1 .. ceil(3000*N/fs)-1)
            [:,1]: band 3000 < f < 10000 Hz (bins ceil(3000*N/fs) .. ceil(10000*N/fs)-1)
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    mX, _ = stft.stftAnal(x, fs, w, N, H)

    cutoff1 = 3000
    cutoff2 = 10000
    # np.ceil returns a float; slice bounds must be integers.
    cutoff_bucket1 = int(np.ceil(float(cutoff1) * N / fs))
    cutoff_bucket2 = int(np.ceil(float(cutoff2) * N / fs))

    low_band = mX[:, 1:cutoff_bucket1]                 # skips the DC bin
    high_band = mX[:, cutoff_bucket1:cutoff_bucket2]

    E = np.zeros((mX.shape[0], 2))
    E[:, 0] = by_frame_energy(low_band)
    E[:, 1] = by_frame_energy(high_band)
    return E
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames, containing the
        energy envelope of the signal in decibels (dB)
            engEnv[:,0]: bins 1 .. floor(3000*N/fs)
            engEnv[:,1]: bins floor(3000*N/fs)+1 .. floor(10000*N/fs)
        Linear magnitudes below the module-level `eps` are raised to `eps`
        before squaring.
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)
    xmX, _ = stft.stftAnal(x, fs, w, N, H)

    # Floor division keeps the bin edges integral on Python 3 as well; with
    # plain "/" they become floats there and cannot be used as slice bounds.
    low_start = 1
    low_stop = int((3000 * N) // fs) + 1
    high_start = low_stop
    high_stop = int((10000 * N) // fs) + 1

    def band_db(start, stop):
        # dB -> linear magnitude, floored at eps, then energy per frame in dB
        linear = np.power(10, xmX[:, start:stop] / 20.0)
        linear[linear < eps] = eps
        return 10 * np.log10(np.sum(linear * linear, axis=1))

    low_band = band_db(low_start, low_stop)
    high_band = band_db(high_start, high_stop)
    return np.transpose(np.array([low_band, high_band]))
def computeODF(inputFile, window, M, N, H):
    """
    Compute a half-wave rectified onset detection function (ODF) in two bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, such that N > M)
        H (integer): hop size for the STFT computation
    Output:
        numpy array with two columns:
            ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
            ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
        For K STFT frames the array has K-1 rows: row 0 is the first frame's
        band energy minus a zero pad row, row i is frame i minus frame i-1;
        negative values are replaced by 0.
    Raises:
        ValueError: if N < M, or if N is not a power of two.
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # Plain comparison: "N < M is True" chains to "N < M and M is True",
    # which is never true, so the check could not fire.
    if N < M:
        raise ValueError("'N' should be greather than 'M'")
    if np.log2(N) % 1 != 0:
        raise ValueError("Input not power of 2")

    Xm, _ = stft.stftAnal(x, w, N, H)
    # stftAnal output is treated as dB magnitudes; convert to linear power.
    power = (10 ** (Xm / 20)) ** 2
    k = power.shape[0]

    # Frequency of spectrum bin b is b * fs / N; the axis must follow the
    # number of spectrum bins, not the window length M.
    f = np.arange(power.shape[1]) * float(fs) / N
    low_bins = (f > 0) & (f < 3000)
    high_bins = (f > 3000) & (f < 10000)

    engEnv = np.zeros((k, 2))
    engEnv[:, 0] = 10 * np.log10(np.sum(power[:, low_bins], axis=1))
    engEnv[:, 1] = 10 * np.log10(np.sum(power[:, high_bins], axis=1))

    # NOTE(review): the zero pad row plus these slices gives K-1 rows and never
    # uses the last frame; the output shape is kept as it was - confirm that
    # callers do not expect K rows.
    engEnv = np.vstack((np.zeros((1, 2)), engEnv))
    ODF = engEnv[1:-1, :] - engEnv[:-2, :]
    ODF[ODF < 0] = 0
    return ODF
def computeODF(inputFile, window, M, N, H):
    """
    Compute a half-wave rectified onset detection function (ODF) in two bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd integer value)
        N (integer): fft size (power of two, bigger or equal than M)
        H (integer): hop size for the STFT computation
    Output:
        numpy array with two columns and one row per pair of consecutive STFT
        frames (K frames give K-1 rows); negative differences are set to 0.
            ODF[:,0]: ODF computed in band 0 < f < 3000 Hz
            ODF[:,1]: ODF computed in band 3000 < f < 10000 Hz
    """
    def undoDB(x):
        # dB magnitude -> linear magnitude
        return np.power(10, np.divide(x, 20))

    def energy(x, k1, k2):
        # sum of squared magnitudes over bins k1 .. k2-1, one value per frame
        return np.sum(np.power(x[:, k1:k2], 2), axis=1)

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # Float literals force true division: with integer operands Python 2 floors
    # the quotient first and np.ceil then has nothing left to round up.
    bin1 = int(np.ceil(3000.0 * N / fs))
    bin2 = int(np.ceil(10000.0 * N / fs))

    mX, _ = stft.stftAnal(x, fs, w, N, H)
    linear = undoDB(mX)  # converted once and shared by both bands

    # The low band starts at bin 1: the documented band is 0 < f, so DC is excluded.
    nrgEnv1 = 10 * np.log10(energy(linear, 1, bin1))
    nrgEnv2 = 10 * np.log10(energy(linear, bin1, bin2))
    engEnv = np.transpose(np.array([nrgEnv1, nrgEnv2]))

    O = engEnv[1:, :] - engEnv[:-1, :]
    O[O < 0] = 0
    return O
def display(engEnv, inputFile, window, M, N, H):
    """
    Plot the magnitude spectrogram of a sound above its two band energy envelopes.

    engEnv: array whose columns 0 and 1 are plotted as 'Low' and 'High'
    inputFile, window, M, N, H: sound file and STFT settings used to recompute
        the spectrogram that is drawn in the upper panel

    Opens figure 1 and blocks in plt.show(); nothing is returned.
    """
    fs, samples = UF.wavread(inputFile)
    win = get_window(window, M)
    spectrogram, _ = stft.stftAnal(samples, fs, win, N, H)

    plt.figure(1, figsize=(9.5, 6))

    # upper panel: spectrogram, time on x and frequency on y
    plt.subplot(211)
    frame_count = int(spectrogram[:, 0].size)
    frame_times = H * np.arange(frame_count) / float(fs)
    bin_freqs = np.arange(N / 2 + 1) * float(fs) / N
    plt.pcolormesh(frame_times, bin_freqs, np.transpose(spectrogram))
    plt.autoscale(tight=True)

    # lower panel: one curve per band
    plt.subplot(212)
    for column, style, name in ((0, 'b', 'Low'), (1, 'g', 'High')):
        plt.plot(frame_times, engEnv[:, column], style, label=name)
    plt.legend()

    plt.tight_layout()
    plt.show()
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames, containing the
        energy envelope of the signal in decibels (dB)
            engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
            engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    """
    def undoDB(x):
        # dB magnitude -> linear magnitude
        return np.power(10, np.divide(x, 20))

    def energy(x, k1, k2):
        # sum of squared magnitudes over bins k1 .. k2-1, one value per frame
        return np.sum(np.power(x[:, k1:k2], 2), axis=1)

    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # Float literals force true division: with integer operands Python 2 floors
    # the quotient first and np.ceil then has nothing left to round up.
    bin1 = int(np.ceil(3000.0 * N / fs))
    bin2 = int(np.ceil(10000.0 * N / fs))

    mX, _ = stft.stftAnal(x, fs, w, N, H)
    linear = undoDB(mX)  # converted once and shared by both bands

    # The low band starts at bin 1: the documented band is 0 < f, so DC is excluded.
    nrgEnv1 = 10 * np.log10(energy(linear, 1, bin1))
    nrgEnv2 = 10 * np.log10(energy(linear, bin1, bin2))
    return np.transpose(np.array([nrgEnv1, nrgEnv2]))
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array engEnv with shape Kx2, K = number of frames, containing the
        energy envelope of the signal in decibels (dB)
            engEnv[:,0]: Energy envelope in band 0 < f < 3000 Hz (in dB)
            engEnv[:,1]: Energy envelope in band 3000 < f < 10000 Hz (in dB)
    Raises:
        ValueError: if N < M, or if N is not a power of two.
    """
    fs, x = UF.wavread(inputFile)
    w = get_window(window, M)

    # Plain comparison: "N < M is True" chains to "N < M and M is True",
    # which is never true, so the check could not fire.
    if N < M:
        raise ValueError("'N' should be greather than 'M'")
    if np.log2(N) % 1 != 0:
        raise ValueError("Input not power of 2")

    Xm, _ = stft.stftAnal(x, w, N, H)
    # stftAnal output is treated as dB magnitudes; convert to linear power.
    power = (10 ** (Xm / 20)) ** 2
    k = power.shape[0]

    # Frequency of spectrum bin b is b * fs / N; the axis must follow the
    # number of spectrum bins, not the window length M.
    f = np.arange(power.shape[1]) * float(fs) / N
    low_bins = (f > 0) & (f < 3000)
    high_bins = (f > 3000) & (f < 10000)

    engEnv = np.zeros((k, 2))
    engEnv[:, 0] = 10 * np.log10(np.sum(power[:, low_bins], axis=1))
    engEnv[:, 1] = 10 * np.log10(np.sum(power[:, high_bins], axis=1))
    return engEnv
def computeSNR(inputFile, window, M, N, H):
    """
    Measure how well STFT analysis followed by synthesis reconstructs a sound.

    Input:
        inputFile (string): wav file name including the path
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window length (odd positive integer)
        N (integer): fft size (power of two, > M)
        H (integer): hop size for the stft computation
    Output:
        python tuple (SNR1, SNR2) of the values returned by snr():
            SNR1 uses the whole signal and the whole reconstruction error
            SNR2 uses both with the first and last M samples removed
    """
    fs, signal = UF.wavread(inputFile)
    win = get_window(window, M)

    magnitudes, phases = stft.stftAnal(signal, fs, win, N, H)
    resynth = stft.stftSynth(magnitudes, phases, M, H)

    # reconstruction error over the length of the input
    residual = signal - resynth[:signal.size]

    snr_full = snr(signal, residual)
    inner = slice(M, -M)  # drops M samples at each end
    snr_inner = snr(signal[inner], residual[inner])
    return (snr_full, snr_inner)
def computeEngEnv(inputFile, window, M, N, H):
    """
    Compute the per-frame energy envelope of a sound in two frequency bands.

    Inputs:
        inputFile (string): input sound file (monophonic with sampling rate of 44100)
        window (string): analysis window type (choice of rectangular, triangular,
            hanning, hamming, blackman, blackmanharris)
        M (integer): analysis window size (odd positive integer)
        N (integer): FFT size (power of 2, such that N > M)
        H (integer): hop size for the stft computation
    Output:
        numpy array with shape Kx2, K = number of frames, in decibels (dB)
            [:,0]: bins 1 .. ceil(3000*N/fs)-1        (0 < f < 3000 Hz)
            [:,1]: bins ceil(3000*N/fs) .. ceil(10000*N/fs)-1
    """
    win = get_window(window, M)
    fs, samples = UF.wavread(inputFile)
    spectrogram, _ = stft.stftAnal(samples, win, N, H)

    def band_db(first, stop):
        # per frame: dB -> linear magnitude, square, sum over the band, back to dB
        return [10. * np.log10(np.sum((10.**(frame[first:stop] / 20.))**2))
                for frame in spectrogram]

    edge_low = int(np.ceil(3000. * N / fs))     # first bin at or above 3000 Hz
    edge_high = int(np.ceil(10000. * N / fs))   # first bin at or above 10000 Hz

    envelope = np.zeros((spectrogram.shape[0], 2))
    envelope[:, 0] = band_db(1, edge_low)       # starts at 1 to skip the DC bin
    envelope[:, 1] = band_db(edge_low, edge_high)
    return envelope