def test_reconstruct_sound():
    """Regression test: sine-model analysis and resynthesis of a short sax phrase.

    The model output is checked through summary statistics (frame counts and
    means) instead of a stored copy of the full arrays.
    """
    sample_rate, samples = audio.read_wav(sound_path("sax-phrase-short.wav"))

    window_size = 4001
    fft_size = 4096
    hop_size = 2048
    analysis_window = get_window('hamming', window_size)

    xtfreq, xtmag, xtphase = sine.from_audio(
        samples, sample_rate, analysis_window, fft_size, hop_size,
        t=-80, maxnSines=100, minSineDur=.01,
        freqDevOffset=20, freqDevSlope=0.01)
    resynthesized = sine.to_audio(
        xtfreq, xtmag, xtphase, fft_size, hop_size, sample_rate)

    sample_count = len(samples)
    assert 138746 == sample_count

    # one analysis frame per hop, rounding the last partial hop up
    expected_frame_count = int(math.ceil(float(sample_count) / hop_size))
    for track in (xtfreq, xtmag, xtphase):
        assert expected_frame_count == len(track)

    assert xtfreq.shape[1] <= 100

    # statistics of the model for regression testing without explicitly
    # storing the whole data
    assert np.allclose(945.892990545, xtfreq.mean())
    assert np.allclose(-30.3138495002, xtmag.mean())
    assert np.allclose(1.34449391701, xtphase.mean())

    # TODO: this is completely off, it should be equal to len(samples)!
    assert 69 * 2048 == len(resynthesized)

    reconstruction_error = rmse(samples, resynthesized[:sample_count])
    assert np.allclose(0.010812475879315771, reconstruction_error)
def analysis(inputFile=demo_sound_path('mridangam.wav'), window='hamming', M=801, N=2048, t=-90,
             minSineDur=0.01, maxnSines=150, freqDevOffset=20, freqDevSlope=0.02,
             interactive=True, plotFile=False):
    """
    Analyze a sound with the sine model, resynthesize it and plot the result.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks
    minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation
    interactive: if true, display the figure without blocking
    plotFile: if true, save the figure under output_plots/

    The resynthesized sound is written to output_sounds/<name>_sineModel.wav.

    returns inputFile: input file name; fs: sampling rate of input file,
            tfreq, tmag: sinusoidal frequencies and magnitudes
    """
    # size of fft used in synthesis
    Ns = 512
    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound
    fs, x = audio.read_wav(inputFile)

    # compute analysis window
    w = get_window(window, M)

    # compute the sine model of the whole sound (the phases are not needed here)
    tfreq, tmag, _ = sine.from_audio(x, fs, w, N, H, t, maxnSines, minSineDur,
                                     freqDevOffset, freqDevSlope)

    # synthesize the sines without original phases (an empty phase array is passed)
    y = sine.to_audio(tfreq, tmag, np.array([]), Ns, H, fs)

    # base name shared by the output sound and the optional plot file
    baseName = strip_file(inputFile)

    # output sound file (monophonic with sampling rate of 44100)
    outputFile = 'output_sounds/' + baseName + '_sineModel.wav'
    audio.write_wav(y, fs, outputFile)

    # create figure to show plots
    plt.figure(figsize=(12, 9))

    # frequency range to plot
    maxplotfreq = 5000.0

    # plot the input sound
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), x.min(), x.max()])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the sinusoidal frequencies
    if tfreq.shape[1] > 0:
        plt.subplot(3, 1, 2)
        # work on a copy so that the returned tfreq is left untouched
        tracks = np.copy(tfreq)
        tracks = tracks * np.less(tracks, maxplotfreq)
        # NaN entries are not drawn, which hides zeroed / out-of-range values
        tracks[tracks <= 0] = np.nan
        numFrames = tracks.shape[0]
        frmTime = H * np.arange(numFrames) / float(fs)
        plt.plot(frmTime, tracks)
        plt.axis([0, x.size / float(fs), 0, maxplotfreq])
        plt.title('frequencies of sinusoidal tracks')

    # plot the output sound
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), y.min(), y.max()])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()

    if interactive:
        plt.show(block=False)
    if plotFile:
        plt.savefig('output_plots/%s_sine_transformation_analysis.png' % baseName)

    return inputFile, fs, tfreq, tmag
from smst.models import sine, stft

# figure 1, upper panel of a two-row layout
plt.figure(1, figsize=(9, 7))
plt.subplot(211)

(fs, x) = audio.read_wav('../../../sounds/vibraphone-C6.wav')

# analysis parameters
w = np.blackman(401)
N, H = 512, 100
t = -100
minSineDur = .02
maxnSines = 150
freqDevOffset, freqDevSlope = 20, 0.01

# STFT of the sound and its sinusoidal tracks, computed with the same window/fft/hop
mX, pX = stft.from_audio(x, w, N, H)
tfreq, tmag, tphase = sine.from_audio(
    x, fs, w, N, H, t, maxnSines, minSineDur, freqDevOffset, freqDevSlope)

# plot mX only up to maxplotfreq
maxplotfreq = 10000.0
maxplotbin = int(N * maxplotfreq / fs)
numFrames = mX.shape[0]
frmTime = H * np.arange(numFrames) / float(fs)
binFreq = np.arange(maxplotbin + 1) * float(fs) / N
plt.pcolormesh(frmTime, binFreq, mX[:, :maxplotbin + 1].T)
plt.autoscale(tight=True)

# overlay the tracks; values at or above maxplotfreq are zeroed and every
# non-positive value becomes NaN so that it is not drawn
tracks = tfreq * (tfreq < maxplotfreq)
tracks[tracks <= 0] = np.nan
plt.plot(frmTime, tracks, color='k', lw=1.5)
plt.autoscale(tight=True)
plt.title('mX + sine frequencies (vibraphone-C6.wav)')
from smst.utils import audio
from smst.models import sine

(fs, x) = audio.read_wav('../../../sounds/mridangam.wav')
# keep only the first 1.49 seconds of the sound
x1 = x[:int(1.49 * fs)]

# analysis parameters
w = np.hamming(801)
N = 2048
t = -90
minSineDur = .005
maxnSines = 150
freqDevOffset = 20
freqDevSlope = 0.02
Ns = 512
# floor division keeps the hop size an integer (true division yields 128.0 on Python 3)
H = Ns // 4

sfreq, smag, sphase = sine.from_audio(x1, fs, w, N, H, t, maxnSines, minSineDur,
                                      freqDevOffset, freqDevSlope)

# time-scaling envelope as interleaved (input time, output time) pairs
timeScale = np.array([
    .01, .0,
    .03, .03,
    .335, .8,
    .355, .82,
    .671, 1.0,
    .691, 1.02,
    .858, 1.1,
    .878, 1.12,
    1.185, 1.8,
    1.205, 1.82,
    1.49, 2.0,
])

L = sfreq.shape[0]  # number of input frames
maxInTime = max(timeScale[::2])  # maximum value used as input times
maxOutTime = max(timeScale[1::2])  # maximum value used in output times
outL = int(L * maxOutTime / maxInTime)  # number of output frames
inFrames = L * timeScale[::2] / maxInTime  # input time values in frames
outFrames = outL * timeScale[1::2] / maxOutTime  # output time values in frames
timeScalingEnv = interp1d(outFrames, inFrames, fill_value=0)  # interpolation function
indexes = timeScalingEnv(np.arange(outL))  # generate frame indexes for the output

# first output frame; int() guarantees a valid array index even when round()
# hands back a floating-point NumPy scalar
ysfreq = sfreq[int(round(indexes[0])), :]
# the backend is selected before pyplot is imported
mpl.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

from smst.utils import audio
from smst.models import sine, stft

(fs, x) = audio.read_wav('../../../sounds/flute-A4.wav')

# analysis parameters
w = np.blackman(601)
N, H = 1024, 150
t = -80
minSineDur = .1
maxnSines = 150

# STFT of the sound and its sinusoidal tracks, computed with the same window/fft/hop
mX, pX = stft.from_audio(x, w, N, H)
tfreq, tmag, tphase = sine.from_audio(x, fs, w, N, H, t, maxnSines, minSineDur)

plt.figure(1, figsize=(9.5, 5))

# plot mX only up to maxplotfreq
maxplotfreq = 5000.0
maxplotbin = int(N * maxplotfreq / fs)
numFrames = mX.shape[0]
frmTime = H * np.arange(numFrames) / float(fs)
binFreq = np.arange(maxplotbin + 1) * float(fs) / N
plt.pcolormesh(frmTime, binFreq, mX[:, :maxplotbin + 1].T)
plt.autoscale(tight=True)

# overlay the tracks; values at or above maxplotfreq are zeroed and every
# non-positive value becomes NaN so that it is not drawn
tracks = tfreq * (tfreq < maxplotfreq)
tracks[tracks <= 0] = np.nan
plt.plot(frmTime, tracks, color='k', lw=1.5)
plt.autoscale(tight=True)
plt.title('mX + sinusoidal tracks (flute-A4.wav)')
def main(inputFile=demo_sound_path('bendir.wav'), window='hamming', M=2001, N=2048, t=-80,
         minSineDur=0.02, maxnSines=150, freqDevOffset=10, freqDevSlope=0.001,
         interactive=True, plotFile=False):
    """
    Perform analysis/synthesis using the sinusoidal model and plot the result.

    inputFile: input sound file (monophonic with sampling rate of 44100)
    window: analysis window type (rectangular, hanning, hamming, blackman, blackmanharris)
    M: analysis window size
    N: fft size (power of two, bigger or equal than M)
    t: magnitude threshold of spectral peaks
    minSineDur: minimum duration of sinusoidal tracks
    maxnSines: maximum number of parallel sinusoids
    freqDevOffset: frequency deviation allowed in the sinusoids from frame to frame at frequency 0
    freqDevSlope: slope of the frequency deviation, higher frequencies have bigger deviation
    interactive: if true, display the figure (blocks until the window is closed)
    plotFile: if true, save the figure under output_plots/

    The synthesized sound is written to output_sounds/<name>_sineModel.wav.
    """
    # size of fft used in synthesis
    Ns = 512
    # hop size (has to be 1/4 of Ns)
    H = 128

    # read input sound
    fs, x = audio.read_wav(inputFile)

    # compute analysis window
    w = get_window(window, M)

    # analyze the sound with the sinusoidal model
    tfreq, tmag, tphase = sine.from_audio(x, fs, w, N, H, t, maxnSines, minSineDur,
                                          freqDevOffset, freqDevSlope)

    # synthesize the output sound from the sinusoidal representation
    y = sine.to_audio(tfreq, tmag, tphase, Ns, H, fs)

    # base name shared by the output sound and the optional plot file
    baseName = strip_file(inputFile)

    # write the synthesized sound obtained from the sinusoidal synthesis
    outputFile = 'output_sounds/' + baseName + '_sineModel.wav'
    audio.write_wav(y, fs, outputFile)

    # create figure to show plots
    plt.figure(figsize=(12, 9))

    # frequency range to plot
    maxplotfreq = 5000.0

    # plot the input sound
    plt.subplot(3, 1, 1)
    plt.plot(np.arange(x.size) / float(fs), x)
    plt.axis([0, x.size / float(fs), x.min(), x.max()])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('input sound: x')

    # plot the sinusoidal frequencies
    plt.subplot(3, 1, 2)
    if tfreq.shape[1] > 0:
        numFrames = tfreq.shape[0]
        frmTime = H * np.arange(numFrames) / float(fs)
        # non-positive frequencies become NaN so they are not drawn; a new
        # array is built so the analysis result itself is not overwritten
        tracks = np.where(tfreq > 0, tfreq, np.nan)
        plt.plot(frmTime, tracks)
        plt.axis([0, x.size / float(fs), 0, maxplotfreq])
        plt.title('frequencies of sinusoidal tracks')

    # plot the output sound
    plt.subplot(3, 1, 3)
    plt.plot(np.arange(y.size) / float(fs), y)
    plt.axis([0, y.size / float(fs), y.min(), y.max()])
    plt.ylabel('amplitude')
    plt.xlabel('time (sec)')
    plt.title('output sound: y')

    plt.tight_layout()

    # save before showing: once a blocking show() returns the figure window
    # has been closed and savefig() would write an empty image
    if plotFile:
        plt.savefig('output_plots/%s_sine_model.png' % baseName)
    if interactive:
        plt.show()