def reconstruct(l_specs, t_specs, neighbs, k, winsize, n_iter_gl):
    """ Reconstruct by taking the median of the K-NN spectra, then Griffin-Lim """
    Y_hat = np.zeros_like(t_specs)
    T = neighbs.shape[0]
    for t in range(T):
        # median of the k nearest learned magnitude spectra for frame t
        Y_hat[t, :] = np.median(l_specs[neighbs[t, :k], :], 0)
    # random-phase initialization, one sample per hop (hop size is 128)
    init_vec = np.random.randn(128 * Y_hat.shape[0])
    x_recon = transforms.gl_recons(Y_hat.T, init_vec, n_iter_gl,
                                   winsize, 128, display=False)
    return x_recon
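# A minimal usage sketch with synthetic data (shapes are illustrative
# assumptions; the 128-sample hop is hard-coded in `reconstruct`, and
# winsize=512 matches the 257 frequency bins):
if __name__ == '__main__':
    import numpy as np
    from sklearn.neighbors import NearestNeighbors
    l_specs = np.abs(np.random.randn(500, 257))  # learning frames x bins
    t_specs = np.abs(np.random.randn(100, 257))  # test frames x bins
    knn = NearestNeighbors(n_neighbors=5).fit(l_specs)
    neighbs = knn.kneighbors(t_specs, return_distance=False)
    x_hat = reconstruct(l_specs, t_specs, neighbs, k=5,
                        winsize=512, n_iter_gl=10)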
                          t_seg_starts, t_seg_duration,
                          l_segments_all, l_feats,
                          '', '.au', 22050,
                          dotime_stretch=False,
                          max_synth_idx=nb_max_seg,
                          marge=marge, normalize=True)
    # stft
    magspecs.append(np.abs(stft.stft(sigout, 512, 128)[0, :, :]))

magspecarr = np.array(magspecs)
Lmin = min(Lmin, magspecarr.shape[2])

print "Add-Max"
max_magspec = np.max(magspecarr, 0)
init_vec = np.random.randn(128 * Lmin)
x_recon = gl_recons(max_magspec[:, :Lmin], init_vec, 20, 512, 128,
                    display=False)
sig_add_max = Signal(x_recon, fs, normalize=True)
#sig_add_max.write(op.join(recons_audio_path, '_add_max_%s_P%d.wav' % (feat_comb, P)))
save_fig_audio(sig_add_max, recons_audio_path, recons_fig_path,
               "add_max_%s_P%d" % (feat_comb, P))
print "KL value %2.2f" % KLspec(orig_spec[:, :Lmin], max_magspec[:, :Lmin])

print "Add-Mean"
mean_magspec = np.mean(magspecarr, 0)
init_vec = np.random.randn(128 * Lmin)
x_recon = gl_recons(mean_magspec[:, :Lmin], init_vec, 20, 512, 128,
                    display=False)
sig_add_mean = Signal(x_recon, fs, normalize=True)
save_fig_audio(sig_add_mean, recons_audio_path,
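# `KLspec` above is defined elsewhere in the repo; a plausible minimal
# version, assuming a generalized Kullback-Leibler divergence between
# nonnegative magnitude spectrograms (an assumption, not the actual code):
def klspec_sketch(S_ref, S_est, eps=1e-10):
    import numpy as np
    P, Q = S_ref + eps, S_est + eps
    # generalized KL: sum P*log(P/Q) - P + Q, zero iff the spectra match
    return np.sum(P * np.log(P / Q) - P + Q)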
                              1, (wsize / 2) + 1).repeat(len(indexes), 0)

plt.figure()
plt.subplot(131)
plt.imshow(np.log(np.abs(learn_specs.T)), origin='lower')
plt.subplot(132)
plt.imshow(np.log(averaged_specs.T), origin='lower')
plt.subplot(133)
plt.imshow(np.log(reconstructed_averaged_specs.T), origin='lower')
plt.show()

# time for resynthesis
init_vec = np.random.randn(original.data.shape[0])
rec_method1 = transforms.gl_recons(reconstructed_averaged_specs.T, init_vec,
                                   10, wsize, tstep, display=False)
rec_sig = Signal(rec_method1, original.fs, mono=True, normalize=True)
# CONCLUSION: WE HAVE KEPT ONLY ONE SPECTRUM PER "NOTE": DIGITIZED SOUND

# METHOD 2: use the original waveform to resynthesize (rough sketch below,
# after METHOD 3). We have the segmentation, and for each segment we have a
# waveform: we should directly use the "potentially time-extended" original
# waveform to resynthesize!
#waveform_list = []
#for i in range(1, segments.shape[0]):
#    waveform_list.append(original.data)
# This is a little too easy in this context; we should try it after the
# nearest-neighbor search.

# METHOD 3: for each feature bag, we have an M x F spectrogram (or magspec)
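# A rough sketch of METHOD 2 once the nearest-neighbor search is in place:
# stitch together the raw waveform of each matched learning segment, trimming
# or zero-padding to the target segment length. The helper name and the
# trim/pad policy are assumptions for illustration, not the repo's code:
def concat_segment_waveforms(data, fs, seg_starts, neighb_idx, target_lengths):
    """Rebuild a signal by concatenating matched learning-segment waveforms."""
    import numpy as np
    chunks = []
    for seg_i, length in zip(neighb_idx, target_lengths):
        start = int(seg_starts[seg_i] * fs)       # segment onset in samples
        chunk = data[start:start + int(length)]
        if chunk.shape[0] < int(length):
            # zero-pad segments shorter than the target duration
            chunk = np.concatenate([chunk,
                                    np.zeros(int(length) - chunk.shape[0])])
        chunks.append(chunk)
    return np.concatenate(chunks)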
def expe_1_synth_from_same_sample():
    input_dir = '/sons/rwc/Learn/'
    output_dir = '/sons/rwc/Learn/hdf5/'
    audiofile = input_dir + 'rwc-g-m01_1.wav'
    h5file = output_dir + 'rwc-g-m01_1.h5'

    # load the Echo Nest features
    h5 = hdf5_getters.open_h5_file_read(h5file)
    timbre = hdf5_getters.get_segments_timbre(h5)
    loudness_start = hdf5_getters.get_segments_loudness_start(h5)
    loudness_max = hdf5_getters.get_segments_loudness_max(h5)
    loudness_max_time = hdf5_getters.get_segments_loudness_max_time(h5)
    C = hdf5_getters.get_segments_pitches(h5)
    segments_all = hdf5_getters.get_segments_start(h5)
    learn_feats_all = np.hstack((timbre,
                                 loudness_start.reshape((loudness_start.shape[0], 1)),
                                 C))

    # OK, that was the best possible case; now let us try to find the nearest
    # neighbors, get the segments back, and resynthesize!
    learn_duration = 200  # in seconds
    test_start = 200
    test_duration = 5

    # Get learning data
    learning = Signal(audiofile, mono=True)
    learning.crop(0, learn_duration * learning.fs)

    wsize = 1024
    tstep = 512
    # Get the magnitude spectrum for the given audio file
    learn_specs = features.get_stft(learning.data, wsize, tstep)
    learn_specs = learn_specs.T

    max_l_seg_idx = np.where(segments_all < learn_duration)[0][-1]
    l_segments = segments_all[:max_l_seg_idx]
    l_segment_lengths = (l_segments[1:] - l_segments[0:-1]) * learning.fs
    learn_feats = learn_feats_all[:max_l_seg_idx, :]

    # we must keep in mind, for each segment index, the corresponding indices
    # in the learn_specs matrix
    l_seg_bounds = []
    ref_time = np.arange(0., float(learning.length) / float(learning.fs),
                         float(tstep) / float(learning.fs))
    for segI in range(len(l_segments) - 1):
        startIdx = np.where(ref_time > l_segments[segI])[0][0]
        endIdx = np.where(ref_time > l_segments[segI + 1])[0][0]
        l_seg_bounds.append((startIdx, endIdx))
    l_seg_bounds.append((endIdx, ref_time.shape[0]))

    # Get testing data
    testing = Signal(audiofile, mono=True)
    testing.crop(test_start * testing.fs,
                 (test_start + test_duration) * testing.fs)

    # get the testing features
    min_t_seg_idx = np.where(segments_all < test_start)[0][-1]
    max_t_seg_idx = np.where(segments_all < test_start + test_duration)[0][-1]
    t_segments = segments_all[min_t_seg_idx:max_t_seg_idx]
    t_segment_lengths = (t_segments[1:] - t_segments[0:-1]) * testing.fs
    test_feats = learn_feats_all[min_t_seg_idx:max_t_seg_idx, :]

    # find the nearest neighbors
    from sklearn.neighbors import NearestNeighbors
    neigh = NearestNeighbors(1)
    # fit on the learning data
    neigh.fit(learn_feats)
    neighb_segments_idx = neigh.kneighbors(test_feats, return_distance=False)

    # kneighbors gives a set of segment indices; we need to get the
    # spectrogram back from the learning data, then fit the new segment lengths
    target_length = int(test_duration * testing.fs)
    neighb_segments = zip(neighb_segments_idx[:, 0],
                          t_segment_lengths.astype(int))
    morphed_spectro = spec_morph(np.abs(learn_specs), target_length,
                                 neighb_segments, l_seg_bounds)

    # retrieve the true stft for comparison
    test_specs = features.get_stft(testing.data, wsize, tstep)

    plt.figure()
    plt.subplot(121)
    plt.imshow(np.log(np.abs(test_specs)), origin='lower')
    plt.colorbar()
    plt.subplot(122)
    plt.imshow(np.log(morphed_spectro.T), origin='lower')
    plt.colorbar()
    plt.show()

    init_vec = np.random.randn(morphed_spectro.shape[0] * tstep)
    rec_method2 = transforms.gl_recons(morphed_spectro.T, init_vec, 10,
                                       wsize, tstep, display=False)
    rec_sig_2 = Signal(rec_method2, testing.fs, mono=True, normalize=True)
    rec_sig_2.write('/sons/tests/rec_sig2.wav')
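# `spec_morph` is defined elsewhere in the repo; a plausible minimal version,
# assuming it stretches each matched segment's magnitude frames to the
# requested duration by nearest-frame resampling (an assumption, not the
# actual implementation):
def spec_morph_sketch(magspecs, target_length, neighb_segments, seg_bounds,
                      tstep=512):
    """Stretch each matched segment's frames to the test segment's duration."""
    import numpy as np
    out = []
    for seg_idx, seg_len in neighb_segments:
        start, end = seg_bounds[seg_idx]
        n_target = max(1, int(seg_len) // tstep)  # frames this segment must fill
        src = np.arange(start, max(end, start + 1))
        # nearest-frame mapping from the source frame grid to the target grid
        idx = src[(np.arange(n_target) * len(src)) // n_target]
        out.append(magspecs[idx, :])
    morphed = np.vstack(out)
    # trim to the overall target duration, expressed in frames
    return morphed[:target_length // tstep, :]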
orig_spec_name = 'origrray_%s_Trial%d_seed%d.npy' % (t_name, min_idx[0], rndseed)
orig_spec = np.load(os.path.join(out_dir, orig_spec_name))

output_audio_path = '/home/manu/Documents/Articles/ISMIR2013/ListeningMSD/Audio/'
output_fig_path = '/home/manu/Documents/Articles/ISMIR2013/ListeningMSD/Figures/'
colormap = cm.jet
format = (8, 3)

# also load Dan Ellis's synthesized version
# The Piano cross-synthesis and the Viterbi-smoothed Musaicing?

# resynthesize using the first N frames
n_max_frames = 900
nb_gl_iter = 30
init_vec = np.random.randn(128 * n_max_frames)
x_recon_median = transforms.gl_recons(median_magspec[:, :n_max_frames],
                                      init_vec, nb_gl_iter, 512, 128,
                                      display=False)
sig_median = Signal(x_recon_median, 22050, normalize=True)
sig_median.write(os.path.join(output_audio_path, '%s_add_median.wav' % t_name))
plt.figure(figsize=format)
sig_median.spectrogram(512, 128, order=1, log=True, cmap=colormap, cbar=False)
plt.savefig(os.path.join(output_fig_path, '%s_add_median.png' % t_name))

init_vec = np.random.randn(128 * n_max_frames)
x_recon_orig = transforms.gl_recons(orig_spec[:, :n_max_frames], init_vec,
                                    nb_gl_iter, 512, 128, display=False)
sig_orig = Signal(x_recon_orig, 22050, normalize=True)
sig_orig.write(os.path.join(output_audio_path, '%s_original.wav' % t_name))
plt.figure(figsize=format)
sig_orig.spectrogram(512, 128, order=1, log=True, cmap=colormap, cbar=False)
plt.savefig(os.path.join(output_fig_path, '%s_original.png' % t_name))
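# The write / figure / spectrogram / savefig sequence above repeats for each
# rendered signal; a small helper in the spirit of `save_fig_audio` used in
# the other scripts (a sketch assuming the same Signal API) avoids the
# repetition:
def save_fig_audio_sketch(sig, audio_path, fig_path, name, figsize=(8, 3)):
    sig.write(os.path.join(audio_path, '%s.wav' % name))
    plt.figure(figsize=figsize)
    sig.spectrogram(512, 128, order=1, log=True, cmap=cm.jet, cbar=False)
    plt.savefig(os.path.join(fig_path, '%s.png' % name))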
print "Took ", time.time() - t , " secs" #plt.figure();plt.imshow(Ktest_dev); #plt.colorbar() #plt.show() # optionnal step: median filtering for smoothing the data: Y_hat = median_filter(Y_hat,(1,10)) #plt.figure() #plt.subplot(211) #plt.imshow(np.log(Y), # origin='lower') #plt.colorbar() #plt.title('Original') #plt.subplot(212) #plt.imshow(np.log(Y_hat), # origin='lower') #plt.colorbar() #plt.title('Estimation from Nadaraya-Watson') #plt.show() sig_orig = Signal(test_audiofilepath, normalize=True, mono=True) #init_vec = np.random.randn(step_size*Y_hat.shape[1]) init_vec = np.random.randn(sig_orig.length) x_recon = transforms.gl_recons(Y_hat, init_vec, 10, win_size, step_size, display=False) plt.show() sig_recon = Signal(x_recon, 32000, normalize=True) err = 10.0*np.log10(np.sum((sig_recon.data - sig_orig.data)**2)/np.sum((sig_orig.data**2)))
win_size = params['wintime'] * params['sr']    # window size in samples
step_size = params['steptime'] * params['sr']  # hop size in samples

# sliding median filtering?
if l_medfilt > 1:
    estimated_spectrum = median_filter(
        estimated_spectrum_full + estimated_spectrum_harmo, (1, l_medfilt))

print "reconstruction"
#init_vec = np.random.randn(step_size*Y_hat.shape[1])
init_vec = np.random.randn(step_size * estimated_spectrum.shape[1])
x_recon = transforms.gl_recons(estimated_spectrum, init_vec, nb_iter_gl,
                               win_size, step_size, display=False)

# Get the rhythmic part by using all coefficients
#res_array = regression.eval_knn(learn_feats[:, 0:20], learn_magspecs,
#                                test_feats[:, 0:20],
#                                test_magspecs, ref_t_data,
#                                nb_median, nb_iter_gl,
#                                l_medfilt, params)
#
# now get a harmonic candidate by using only the chroma coefficients
#res_array_harmo = regression.eval_knn(learn_feats[:, -48:], learn_magspecs,
#                                      test_feats[:, -48:],
#                                      test_magspecs, ref_t_data,
#                                      nb_median, nb_iter_gl,
import os
import sys
import numpy as np
sys.path.append('/home/manu/workspace/audio-sketch')
sys.path.append('/home/manu/workspace/PyMP')
sys.path.append('/home/manu/workspace/meeg_denoise')
from feat_invert import regression, transforms, features
import stft

# load the sinewave speech
sinewave = Signal('/sons/sqam/vegaSWS.wav', mono=True)
spectro = stft.stft(sinewave.data, wsize=1024, tstep=256)[0, :, :]
init_vec = np.random.randn(sinewave.data.shape[0])
rec_gl_data = transforms.gl_recons(np.abs(spectro), init_vec, niter=20,
                                   wsize=1024, tstep=256)
sig_rec = Signal(rec_gl_data, sinewave.fs, mono=True, normalize=True)
sig_rec.write('/sons/sqam/vegaSWS_gl.wav')
# ok, it's working just fine

# now compare with reconstruction from the original spectrogram
original = Signal('/sons/sqam/vega.wav', mono=True)
spectro = stft.stft(original.data, wsize=1024, tstep=256)[0, :, :]
init_vec = np.random.randn(original.data.shape[0])
rec_gl_data = transforms.gl_recons(np.abs(spectro), init_vec, niter=20,
                                   wsize=1024,
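# For reference, `transforms.gl_recons` performs Griffin-Lim phase recovery.
# A minimal self-contained sketch of the iteration, assuming scipy's
# Hann-windowed STFT/ISTFT pair (parameter names mirror gl_recons, but this
# is not the repo's implementation):
def gl_recons_sketch(magspec, init_vec, niter=20, wsize=1024, tstep=256):
    """Iterate: estimate phase from the current signal, re-impose magnitude."""
    import numpy as np
    from scipy.signal import stft as sp_stft, istft as sp_istft
    x = init_vec.copy()
    for _ in range(niter):
        _, _, X = sp_stft(x, nperseg=wsize, noverlap=wsize - tstep)
        n = min(X.shape[1], magspec.shape[1])
        # keep the estimated phase, replace the magnitude by the target one
        X = magspec[:, :n] * np.exp(1j * np.angle(X[:, :n]))
        _, x = sp_istft(X, nperseg=wsize, noverlap=wsize - tstep)
    return x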
knn.fit(learn_feats_all[:, Learnidxs])
distance, neighbs = knn.kneighbors(Feats[:, Learnidxs],
                                   n_neighbors=3,
                                   return_distance=True)

# reconstruct
Y_hat = np.zeros_like(MagSpectrums)
T = neighbs.shape[0]
for t in range(T):
    Y_hat[t, :] = np.median(learn_magspecs_all[neighbs[t, :], :], 0)
init_vec = np.random.randn(128 * Y_hat.shape[0])
x_recon = transforms.gl_recons(Y_hat.T, init_vec, 50, wsize, 128,
                               display=False)

import sti
orig_sig = Signal(learn_audiofilepath, mono=True, normalize=True)
orig_sig.downsample(16000)
sig = Signal(x_recon, 16000, normalize=True)
score = sti.stiFromAudio(orig_sig.data, x_recon, 16000, calcref=False,
                         downsample=None, name="unnamed")
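# Note: the Speech Transmission Index comparison assumes both signals cover
# the same material; if the Griffin-Lim output ends up a few samples shorter
# or longer than the reference, trimming to the common length first (an
# assumption about the alignment requirement, not part of the original
# script) keeps the comparison honest:
L = min(orig_sig.data.shape[0], x_recon.shape[0])
score = sti.stiFromAudio(orig_sig.data[:L], x_recon[:L], 16000,
                         calcref=False, downsample=None, name="unnamed")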