def process_file(self, mel, sess):
    """Run the network over ``mel`` and return de-normalised mel, F0 and VUV.

    The input is clipped to [0, 1], split into overlapping batches with
    ``utils.generate_overlapadd``, evaluated batch by batch and stitched back
    together with ``utils.overlapadd``.

    Args:
        mel: Input features; values outside [0, 1] are clipped, so the input
            is presumably already min/max normalised -- TODO confirm.
        sess: TensorFlow session used to evaluate the graph.

    Returns:
        Tuple ``(mel, f0, vuv)``: the mel output rescaled with the stored
        min/max statistics, the F0 output (rescaled only when
        ``config.f0_mode == "cont"``) and the VUV output rounded with
        ``np.round``.
    """
    # The context manager releases the HDF5 handle even if a dataset is
    # missing; ``[()]`` copies each dataset into memory before the file closes.
    with h5py.File('./stats_yam.hdf5', mode='r') as stat_file:
        max_feat = stat_file["feats_maximus"][()]
        min_feat = stat_file["feats_minimus"][()]

    mel = np.clip(mel, 0.0, 1.0)
    in_batches_mel, nchunks_in = utils.generate_overlapadd(mel)

    # The target singer is fixed, so its label batch is built once instead of
    # once per input batch.
    speaker_index = config.singers.index('Nikolas')
    speaker_labels = np.repeat(speaker_index, config.batch_size)

    out_batches_mel = []
    out_f0 = []
    out_vuv = []
    for in_batch_mel in in_batches_mel:
        feed_dict = {self.stft_placeholder: in_batch_mel, self.is_train: False}
        if config.use_speaker:
            feed_dict[self.speaker_labels_1] = speaker_labels
        batch_mel, batch_f0, batch_vuv = sess.run(
            [self.output_stft, self.f0, self.vuv], feed_dict=feed_dict)
        out_batches_mel.append(batch_mel)
        out_f0.append(batch_f0)
        out_vuv.append(batch_vuv)

    out_batches_mel = utils.overlapadd(np.array(out_batches_mel), nchunks_in)
    out_f0 = utils.overlapadd(np.array(out_f0), nchunks_in)
    out_vuv = utils.overlapadd(np.array(out_vuv), nchunks_in)

    # NOTE(review): the flattened source does not show how far this ``if``
    # extended; only the F0 rescaling is treated as conditional -- confirm.
    if config.f0_mode == "cont":
        out_f0 = out_f0 * (max_feat[-2] - min_feat[-2]) + min_feat[-2]

    # The last two statistic columns are skipped for the mel features (the
    # F0 rescaling above uses column -2).
    out_batches_mel = out_batches_mel * (max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]
    out_vuv = np.round(out_vuv)

    return out_batches_mel, out_f0, out_vuv
def process_file(self, mix_stft, sess):
    """Predict vocoder features from a mixture STFT.

    The STFT is split into overlapping batches, scaled by the stored vocal
    STFT maximum, evaluated batch by batch, and the concatenated
    ``harm``/``ap``/``f0``/``vuv`` outputs are overlap-added and rescaled with
    the stored feature min/max statistics.

    Args:
        mix_stft: Mixture STFT accepted by ``utils.generate_overlapadd``.
        sess: TensorFlow session used to evaluate the graph.

    Returns:
        2-D array of de-normalised features; the last column (VUV) is rounded
        before de-normalisation.
    """
    # Read everything needed up front; the context manager guarantees the
    # HDF5 handle is closed even if a dataset lookup fails.
    with h5py.File(config.stat_dir+'stats.hdf5', mode='r') as stat_file:
        max_voc = np.array(stat_file["voc_stft_maximus"])
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    in_batches_stft, nchunks_in = utils.generate_overlapadd(mix_stft)
    in_batches_stft = in_batches_stft/max_voc

    out_batches_feats = []
    for in_batch_stft in in_batches_stft:
        feed_dict = {self.input_placeholder: in_batch_stft}
        harm, ap, f0, vuv = sess.run(
            [self.harm, self.ap, self.f0, self.vuv], feed_dict=feed_dict)
        out_batches_feats.append(np.concatenate((harm, ap, f0, vuv), axis=-1))

    out_feats = utils.overlapadd(np.array(out_batches_feats), nchunks_in)

    # Round the last column to the nearest integer while it is still in
    # network-output scale, then undo the min/max normalisation.
    out_feats[:, -1] = np.round(out_feats[:, -1])
    out_feats = out_feats*(max_feat-min_feat)+min_feat

    return out_feats
def test_wav_file(self, file_name, save_path):
    """Extract multi-pitch from one wav file and save the result.

    A fresh TensorFlow session is created, the model is loaded from
    ``config.log_dir``, and the network output is post-processed with
    ``utils.process_output`` before being written by
    ``utils.save_multif0_output``.

    Args:
        file_name: Path of the wav file passed to ``read_input_wav_file``.
        save_path: Destination passed to ``utils.save_multif0_output``.
    """
    # The session owns native resources; the context manager closes it even
    # when loading or inference raises.
    with tf.Session() as sess:
        self.load_model(sess, log_dir=config.log_dir)
        in_batches_hcqt, nchunks_in, max_len = self.read_input_wav_file(file_name)

        out_batches_atb = []
        for in_batch_hcqt in in_batches_hcqt:
            feed_dict = {
                self.input_placeholder: in_batch_hcqt,
                self.is_train: False,
            }
            out_batches_atb.append(sess.run(self.outputs, feed_dict=feed_dict))

    out_batches_atb = np.array(out_batches_atb)
    out_batches_atb = utils.overlapadd(
        out_batches_atb.reshape(out_batches_atb.shape[0], config.batch_size,
                                config.max_phr_len, -1),
        nchunks_in)
    # Drop the frames beyond the length reported by ``read_input_wav_file``.
    out_batches_atb = out_batches_atb[:max_len]

    time_1, ori_freq = utils.process_output(out_batches_atb)
    utils.save_multif0_output(time_1, ori_freq, save_path)
def process_file(self, f0_nor, pho_target, singer_index, sess):
    """Synthesise de-normalised features from F0 and phoneme sequences.

    Both conditioning sequences are expanded with a trailing axis, split into
    overlapping batches, and fed to the network together with a constant
    singer label; the outputs are overlap-added and rescaled.

    Args:
        f0_nor: F0 sequence; presumably already normalised -- TODO confirm.
        pho_target: Phoneme label sequence aligned with ``f0_nor``.
        singer_index: Singer label repeated ``config.batch_size`` times for
            every batch.
        sess: TensorFlow session used to evaluate the graph.

    Returns:
        Array of features rescaled with the stored min/max statistics, using
        all statistic columns except the last two.
    """
    # The context manager closes the HDF5 handle even if a read fails.
    with h5py.File(config.stat_dir+'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    in_batches_f0, nchunks_in = utils.generate_overlapadd(np.expand_dims(f0_nor, -1))
    in_batches_pho, _ = utils.generate_overlapadd(np.expand_dims(pho_target, -1))
    # Drop the trailing singleton axis added above for the phoneme batches.
    in_batches_pho = in_batches_pho.reshape(
        [in_batches_pho.shape[0], config.batch_size, config.max_phr_len])

    # The singer does not change between batches, so build the labels once.
    speaker = np.repeat(singer_index, config.batch_size)

    out_batches_feats = []
    for in_batch_f0, in_batch_pho in zip(in_batches_f0, in_batches_pho):
        feed_dict = {
            self.f0_placeholder: in_batch_f0,
            self.phoneme_labels: in_batch_pho,
            self.singer_labels: speaker,
            self.is_train: False,
        }
        out_batches_feats.append(sess.run(self.output, feed_dict=feed_dict))

    out_batches_feats = utils.overlapadd(np.array(out_batches_feats), nchunks_in)

    # Affine map x/2 + 0.5 (sends [-1, 1] to [0, 1]) before undoing the
    # min/max normalisation.
    out_batches_feats = out_batches_feats/2+0.5
    out_batches_feats = out_batches_feats*(max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

    return out_batches_feats
def process_file(self, mel, speaker_index, speaker_index_2, sess):
    """Pass ``mel`` through the network with two speaker labels.

    The input is min/max normalised with the stored statistics, processed in
    overlapping batches and de-normalised again.

    Args:
        mel: Un-normalised input features; the column count must match the
            statistics with their last two columns removed.
        speaker_index: Label fed to ``self.speaker_labels``.
        speaker_index_2: Label fed to ``self.speaker_labels_1``.
        sess: TensorFlow session used to evaluate the graph.

    Returns:
        Network output rescaled back to the original feature range.
    """
    # ``[()]`` copies each dataset into memory, so the file can be closed
    # immediately; the context manager also closes it on error.
    with h5py.File('./stats_yam.hdf5', mode='r') as stat_file:
        max_feat = stat_file["feats_maximus"][()]
        min_feat = stat_file["feats_minimus"][()]

    feat_min = min_feat[:-2]
    feat_range = max_feat[:-2] - min_feat[:-2]

    mel = (mel - feat_min)/feat_range
    in_batches_mel, nchunks_in = utils.generate_overlapadd(mel)

    # Both label batches are identical for every input batch; build them once.
    speaker = np.repeat(speaker_index, config.batch_size)
    speaker_2 = np.repeat(speaker_index_2, config.batch_size)

    out_batches_mel = []
    for in_batch_mel in in_batches_mel:
        feed_dict = {
            self.input_placeholder: in_batch_mel,
            self.speaker_labels: speaker,
            self.speaker_labels_1: speaker_2,
            self.is_train: False,
        }
        out_batches_mel.append(sess.run(self.output, feed_dict=feed_dict))

    out_batches_mel = utils.overlapadd(np.array(out_batches_mel), nchunks_in)
    out_batches_mel = out_batches_mel*feat_range + feat_min

    return out_batches_mel
def sep_file(self, file_name, sess, plot):
    """Run the network on one file and write input/output audio.

    Two wav files are written next to ``file_name``: ``<file_name>_1_ori.wav``
    (inverse STFT of channel 0 of the input part) and
    ``<file_name>_1_output.wav`` (inverse STFT of channel 0 of the network
    output).

    Args:
        file_name: Input path handed to ``read_input_file``; also the prefix
            of the written wav files.
        sess: TensorFlow session used to evaluate the graph.
        plot: When truthy, show log-magnitude plots of input and output.
    """
    stft_batches, phase, part_stft, n_chunks = self.read_input_file(file_name)

    # Each batch gets a trailing singleton axis before it is fed to the graph.
    predicted = np.array([
        sess.run(self.outputs,
                 feed_dict={self.input_placeholder: np.expand_dims(batch, -1)})
        for batch in stft_batches
    ])

    collapsed_shape = (predicted.shape[0], predicted.shape[1],
                       predicted.shape[2], -1)
    separated = utils.overlapadd(predicted.reshape(collapsed_shape), n_chunks)
    separated = separated.reshape(separated.shape[0], 513, -1)
    # Trim to the number of frames of the input phase.
    separated = separated[:phase.shape[0]]

    input_channel = part_stft[:, :, 0].T
    output_channel = separated[:, :, 0].T

    if plot:
        plt.figure(1)
        plt.subplot(211)
        plt.imshow(np.log(abs(input_channel)), origin='lower', aspect='auto')
        plt.subplot(212)
        plt.imshow(np.log(output_channel), origin='lower', aspect='auto')
        plt.show()

    # NOTE(review): the flattened source does not show whether the audio
    # export was nested under ``if plot``; it is kept unconditional -- confirm.
    istft_kwargs = dict(win_length=config.nfft,
                        hop_length=config.hopsize,
                        window=config.window)
    audio_1_ori = librosa.istft(input_channel, **istft_kwargs)
    audio_1_output = librosa.istft(output_channel, **istft_kwargs)

    sample_rate = int(config.fs)
    sf.write(file_name + "_1_ori.wav", audio_1_ori, sample_rate)
    sf.write(file_name + "_1_output.wav", audio_1_output, sample_rate)
def test_wav_folder(self, folder_name, save_path):
    """Extract multi-pitch for every wav file in each song sub-folder.

    Every immediate sub-directory of ``folder_name`` is treated as a song.
    For each non-hidden ``.wav`` file in it, the estimate is written to
    ``<save_path>/<song>/<file stem>.csv`` via ``utils.save_multif0_output``.

    Args:
        folder_name: Directory whose sub-directories contain the wav files.
        save_path: Root directory for the generated ``.csv`` files.
    """
    songs = next(os.walk(folder_name))[1]

    # One session is shared by all files; the context manager closes it even
    # if a file fails part-way through.
    with tf.Session() as sess:
        self.load_model(sess, log_dir=config.log_dir)

        for song in songs:
            print("Processing song %s" % song)
            song_dir = os.path.join(folder_name, song)
            file_list = [
                x for x in os.listdir(song_dir)
                if x.endswith('.wav') and not x.startswith('.')
            ]

            for count, file_name in enumerate(file_list, 1):
                in_batches_hcqt, nchunks_in, max_len = self.read_input_wav_file(
                    os.path.join(song_dir, file_name))

                out_batches_atb = []
                for in_batch_hcqt in in_batches_hcqt:
                    feed_dict = {
                        self.input_placeholder: in_batch_hcqt,
                        self.is_train: False,
                    }
                    out_batches_atb.append(
                        sess.run(self.outputs, feed_dict=feed_dict))

                out_batches_atb = np.array(out_batches_atb)
                out_batches_atb = utils.overlapadd(
                    out_batches_atb.reshape(out_batches_atb.shape[0],
                                            config.batch_size,
                                            config.max_phr_len, -1),
                    nchunks_in)
                # Drop frames beyond the length reported for this file.
                out_batches_atb = out_batches_atb[:max_len]

                time_1, ori_freq = utils.process_output(out_batches_atb)
                # ``file_name`` always ends in '.wav' here, so [:-4] is its stem.
                utils.save_multif0_output(
                    time_1, ori_freq,
                    os.path.join(save_path, song, file_name[:-4] + '.csv'))

                utils.progress(count, len(file_list), suffix='evaluation done')
def extract_f0_file(self, file_name, sess):
    """Score the network's multi-pitch estimate for one file.

    Args:
        file_name: Input handed to ``read_input_file``, which yields the
            batched network input, the reference annotation and the chunk
            count.
        sess: TensorFlow session used to evaluate the graph.

    Returns:
        The result of ``mir_eval.multipitch.evaluate`` comparing the
        reference annotation with the network estimate.
    """
    hcqt_batches, reference_atb, n_chunks = self.read_input_file(file_name)

    predictions = np.array([
        sess.run(self.outputs,
                 feed_dict={self.input_placeholder: batch,
                            self.is_train: False})
        for batch in hcqt_batches
    ])

    stacked = predictions.reshape(predictions.shape[0], config.batch_size,
                                  config.max_phr_len, -1)
    estimate = utils.overlapadd(stacked, n_chunks)
    # Align the estimate with the number of frames in the reference.
    estimate = estimate[:reference_atb.shape[0]]

    ref_times, ref_freqs = utils.process_output(reference_atb)
    est_times, est_freqs = utils.process_output(estimate)

    return mir_eval.multipitch.evaluate(ref_times, ref_freqs,
                                        est_times, est_freqs)
def eval_file():
    """Write predicted and ground-truth F0 tracks for every wav in ``config.wav_dir``.

    For each non-hidden ``.wav`` file the mixture STFT is run through the
    ``nr_wavenet`` model, the predicted F0 column is de-normalised and
    converted with ``utils.new_base_to_hertz``, and two ``.pv`` text files
    are written: ``./ikala_eval/net_out/<stem>.pv`` (network output) and
    ``./ikala_eval/sac_gt/<stem>.pv`` (ground truth from
    ``utils.input_to_feats``). Each line holds a time stamp and an F0 value
    separated by a space.
    """
    file_path = config.wav_dir
    log_dir = config.log_dir
    mode = 0

    # Load the normalisation statistics and release the file straight away.
    with h5py.File(config.stat_dir + 'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])
        max_voc = np.array(stat_file["voc_stft_maximus"])
        max_back = np.array(stat_file["back_stft_maximus"])
    # The mixture is scaled by the sum of the vocal and backing-track maxima.
    max_mix = max_voc + max_back

    with tf.Graph().as_default():
        input_placeholder = tf.placeholder(
            tf.float32,
            shape=(config.batch_size, config.max_phr_len, config.input_features),
            name='input_placeholder')

        with tf.variable_scope('First_Model'):
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        # The context manager closes the session when evaluation finishes
        # or fails.
        with tf.Session() as sess:
            sess.run(init_op)

            ckpt = tf.train.get_checkpoint_state(log_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print("Using the model in %s" % ckpt.model_checkpoint_path)
                # NOTE(review): a fixed checkpoint is restored here rather
                # than the one printed above; kept as in the original --
                # confirm this is intended.
                saver.restore(sess, './log/model.ckpt-59')

            files = [
                x for x in os.listdir(config.wav_dir)
                if x.endswith('.wav') and not x.startswith('.')
            ]

            for count, file_name in enumerate(files, 1):
                wav_path = os.path.join(file_path, file_name)
                mix_stft = utils.file_to_stft(wav_path, mode=mode)
                targs = utils.input_to_feats(wav_path, mode=mode)

                in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)
                in_batches = in_batches / max_mix

                val_outer = []
                for in_batch in in_batches:
                    val_harm, val_ap, val_f0, val_vuv = sess.run(
                        [harm, ap, f0, vuv],
                        feed_dict={input_placeholder: in_batch})
                    val_outer.append(np.concatenate(
                        (val_harm, val_ap, val_f0, val_vuv), axis=-1))

                val_outer = utils.overlapadd(np.array(val_outer), nchunks_in)
                val_outer[:, -1] = np.round(val_outer[:, -1])
                # Trim to the ground-truth length and clamp to [0, 1].
                val_outer = val_outer[:targs.shape[0], :]
                val_outer = np.clip(val_outer, 0.0, 1.0)

                # Undo min/max normalisation of the F0 column:
                # x * (max - min) + min. The original grouped this as
                # x * ((max - min) + min), which reduces to x * max.
                f0_output = (val_outer[:, -2] * (max_feat[-2] - min_feat[-2])
                             + min_feat[-2])
                # Both tracks are multiplied by (1 - last ground-truth
                # column), zeroing frames where that column is 1.
                keep_mask = 1 - targs[:, -1]
                f0_output = utils.new_base_to_hertz(f0_output * keep_mask)
                f0_gt = utils.new_base_to_hertz(targs[:, -2] * keep_mask)

                f0_outputs = [
                    str(i * 0.00580498866 * 10000000) + ' ' + str(f0_o)
                    for i, f0_o in enumerate(f0_output)
                ]
                gt_outputs = [
                    str(i * 0.00580498866 * 10000000) + ' ' + str(f0_o)
                    for i, f0_o in enumerate(f0_gt)
                ]

                # ``file_name`` always ends in '.wav', so [:-4] is its stem.
                utils.list_to_file(
                    f0_outputs, './ikala_eval/net_out/' + file_name[:-4] + '.pv')
                utils.list_to_file(
                    gt_outputs, './ikala_eval/sac_gt/' + file_name[:-4] + '.pv')

                utils.progress(count, len(files))
def synth_file(file_name="015.hdf5", singer_index=0, file_path=config.wav_dir, show_plots=True):
    """Synthesise audio for one feature file with the GAN generators.

    Builds the feature generator and the F0 generator, restores the latest
    checkpoint from ``config.log_dir`` (if any), runs both over the phoneme
    and note conditioning stored in ``config.feats_dir + file_name``, prints
    note-deviation statistics and writes audio through
    ``utils.feats_to_audio``.

    Args:
        file_name: Name of the HDF5 feature file inside ``config.feats_dir``.
        singer_index: Unused; kept for interface compatibility.
        file_path: Unused; kept for interface compatibility.
        show_plots: When true, plot predicted F0, ground-truth F0 and the
            input note contour.
    """
    # Load the normalisation statistics and release the file straight away.
    with h5py.File('./stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    batch_shape = (config.batch_size, config.max_phr_len)

    with tf.Graph().as_default():
        output_placeholder = tf.placeholder(
            tf.float32, shape=batch_shape + (64,), name='output_placeholder')
        f0_input_placeholder = tf.placeholder(
            tf.float32, shape=batch_shape, name='f0_input_placeholder')
        f0_onehot_labels = tf.one_hot(
            indices=tf.cast(f0_input_placeholder, tf.int32),
            depth=len(config.notes))
        f0_context_placeholder = tf.placeholder(
            tf.float32, shape=batch_shape + (1,), name='f0_context_placeholder')
        phone_context_placeholder = tf.placeholder(
            tf.float32, shape=batch_shape + (1,),
            name='phone_context_placeholder')
        phoneme_labels = tf.placeholder(
            tf.int32, shape=batch_shape, name='phoneme_placeholder')
        phone_onehot_labels = tf.one_hot(
            indices=tf.cast(phoneme_labels, tf.int32),
            depth=len(config.phonemas))

        conditioning = [
            phone_onehot_labels, f0_onehot_labels,
            phone_context_placeholder, f0_context_placeholder,
        ]

        with tf.variable_scope('Generator_feats'):
            voc_output = modules.GAN_generator(tf.concat(conditioning, axis=-1))

        with tf.variable_scope('Generator_f0') as scope:
            # This first call creates the generator variables; only the
            # second call, on the generated features, is evaluated below.
            modules.GAN_generator_f0(
                tf.concat(conditioning + [output_placeholder], axis=-1))
            scope.reuse_variables()
            f0_output_2 = modules.GAN_generator_f0(
                tf.concat(conditioning + [(voc_output / 2) + 0.5], axis=-1))

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

        with h5py.File(config.feats_dir + file_name, "r") as feat_file:
            feats = feat_file["world_feats"][()]
            phones = feat_file["phonemes"][()]
            notes = feat_file["notes"][()]

        feats = (feats - min_feat) / (max_feat - min_feat)
        phones = np.concatenate([phones, notes], axis=-1)

        in_batches_pho, nchunks_in = utils.generate_overlapadd(phones)
        in_batches_feat, _ = utils.generate_overlapadd(feats)

        # Per-frame value looked up in ``config.notes`` for the first
        # ``notes`` column, kept as a column vector.
        noters = np.expand_dims(
            np.array([config.notes[int(x)] for x in notes[:, 0]]), 1)

        out_batches_feats = []
        out_batches_f0 = []
        # The session is only needed for inference; the context manager
        # closes it afterwards or on error.
        with tf.Session() as sess:
            sess.run(init_op)

            ckpt = tf.train.get_checkpoint_state(config.log_dir)
            if ckpt and ckpt.model_checkpoint_path:
                print("Using the model in %s" % ckpt.model_checkpoint_path)
                saver.restore(sess, ckpt.model_checkpoint_path)

            for conds, feat in zip(in_batches_pho, in_batches_feat):
                feed_dict = {
                    f0_input_placeholder: conds[:, :, 2],
                    phoneme_labels: conds[:, :, 0],
                    phone_context_placeholder: conds[:, :, 1:2],
                    f0_context_placeholder: conds[:, :, -1:],
                    output_placeholder: feat[:, :, :-2],
                }
                output_feats_gan, output_f0 = sess.run(
                    [voc_output, f0_output_2], feed_dict=feed_dict)
                # Affine map x/2 + 0.5 (sends [-1, 1] to [0, 1]).
                out_batches_feats.append(output_feats_gan / 2 + 0.5)
                out_batches_f0.append(output_f0 / 2 + 0.5)

        out_batches_feats = utils.overlapadd(
            np.array(out_batches_feats), nchunks_in)
        out_batches_f0 = utils.overlapadd(np.array(out_batches_f0), nchunks_in)

        # Undo the min/max normalisation and trim to the input length.
        feats = feats * (max_feat - min_feat) + min_feat
        out_batches_feats = (out_batches_feats
                             * (max_feat[:-2] - min_feat[:-2])
                             + min_feat[:-2])
        out_batches_feats = out_batches_feats[:len(feats)]
        out_batches_f0 = (out_batches_f0 * (max_feat[-2] - min_feat[-2])
                          + min_feat[-2])
        out_batches_f0 = out_batches_f0[:len(feats)]

        # Deviations are multiplied by (1 - last feature column), so frames
        # where that column is 1 contribute zero.
        keep_mask = 1 - feats[:, -1:]
        diff_1 = (out_batches_f0 - noters) * keep_mask
        diff_2 = (feats[:, -2:-1] - noters) * keep_mask

        print("Mean predicted note deviation {}".format(diff_1.mean()))
        print("Mean original note deviation {}".format(diff_2.mean()))
        print("STD predicted note deviation {}".format(diff_1.std()))
        print("STD original note deviation {}".format(diff_2.std()))

        if show_plots:
            plt.figure(1)
            plt.suptitle("F0 contour")
            plt.plot(out_batches_f0, label='Predicted F0')
            plt.plot(feats[:, -2], label="Ground Truth F0")
            plt.plot(noters, label="Input Midi Note")
            plt.legend()
            plt.show()

        # Generated features + generated F0 + original last column.
        first_op = np.ascontiguousarray(np.concatenate(
            [out_batches_feats, out_batches_f0, feats[:, -1:]], axis=-1))

        utils.feats_to_audio(first_op, file_name[:-4] + '_gan_op')
        print("Full output saved to {}".format(
            os.path.join(config.val_dir, file_name[:-4] + '_gan_op.wav')))

        # NOTE(review): the original also synthesised the '_F0_op' file from
        # ``first_op`` (an "F0 only" array was assembled but never used), so
        # both files carry the same content -- confirm the intended input.
        utils.feats_to_audio(first_op, file_name[:-4] + '_F0_op')
        print("Only F0 saved to {}".format(
            os.path.join(config.val_dir, file_name[:-4] + '_F0_op.wav')))
def _plot_stereo_comparison(fig_index, title, source_label, truth_stft, output_stft):
    """Plot ground truth vs. network output for both channels of one source."""
    plt.figure(fig_index)
    plt.suptitle(title)
    panels = (
        ("Left", "Ground Truth", truth_stft[0]),
        ("Left", "Network Output", output_stft[0]),
        ("Right", "Ground Truth", truth_stft[1]),
        ("Right", "Network Output", output_stft[1]),
    )
    first_axis = None
    for row, (channel, kind, spectrogram) in enumerate(panels, 1):
        if first_axis is None:
            axis = plt.subplot(4, 1, row)
            first_axis = axis
        else:
            # All panels share the axes of the first one.
            axis = plt.subplot(4, 1, row, sharex=first_axis, sharey=first_axis)
        plt.imshow(np.log(spectrogram.T), aspect='auto', origin='lower')
        axis.set_title(
            "%s %s Channel %s" % (source_label, channel, kind), fontsize=10)


def evalNetwork(file_name, load_name='model_e4000_b50_bs5_1709', plot=False, synth=False):
    """Separate one stem file into vocals, drums, bass and others.

    The mixture STFT is normalised, passed through the auto-encoder in
    overlapping batches, and the network outputs are turned into soft masks
    that are applied to the normalised mixture. The masked estimates are
    de-normalised and overlap-added.

    Args:
        file_name: Stem file inside ``config.wav_dir_test``.
        load_name: Currently unused -- the weights are loaded from a fixed
            path (see the note below).
        plot: When true, show ground-truth/estimate spectrograms per source.
        synth: When true, write the four estimates as wav files to
            ``config.out_dir`` using the mixture phase.
    """
    autoencoder_audio = AutoEncoder().cuda()
    # Guards the mask denominators against an all-zero source sum.
    eps = 1e-30

    # NOTE(review): ``load_name`` is ignored and a fixed weight file is
    # loaded; kept as in the original -- confirm this is intended.
    autoencoder_audio.load_state_dict(
        torch.load('./log/model_e8000_b50_bs5_3369.pt'))

    # Load the normalisation statistics and release the file straight away.
    with h5py.File(config.stat_dir + 'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    # First eight rows: statistics of the four stereo targets; last two
    # rows: statistics of the stereo mixture input.
    max_feat_tars = max_feat[:8, :].reshape(8, 1, 513)
    min_feat_tars = min_feat[:8, :].reshape(8, 1, 513)
    max_feat_ins = max_feat[-2:, :].reshape(2, 1, 513)
    min_feat_ins = min_feat[-2:, :].reshape(2, 1, 513)

    audio, fs = stempeg.read_stems(
        os.path.join(config.wav_dir_test, file_name),
        stem_id=[0, 1, 2, 3, 4])
    mixture, drums, bass, acc, vocals = (audio[i] for i in range(5))

    mix_stft, mix_phase = utils.stft_stereo(mixture, phase=True)
    mix_stft = (mix_stft - min_feat_ins) / (max_feat_ins - min_feat_ins)

    drums_stft = utils.stft_stereo(drums)
    bass_stft = utils.stft_stereo(bass)
    acc_stft = utils.stft_stereo(acc)
    voc_stft = utils.stft_stereo(vocals)

    in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)

    out_batches = []
    for in_batch in in_batches:
        in_batch = Variable(torch.FloatTensor(in_batch)).cuda()
        out_batch = autoencoder_audio(in_batch)
        out_batches.append(np.array(out_batch.data.cpu().numpy()))
    out_batches = np.array(out_batches)

    # Axis 2 holds the eight output channels: two per source.
    vocals = out_batches[:, :, :2, :, :]
    drums = out_batches[:, :, 2:4, :, :]
    bass = out_batches[:, :, 4:6, :, :]
    others = out_batches[:, :, 6:, :, :]

    # Soft masks: each source's share of the summed network output. ``eps``
    # prevents 0/0 (NaN) where all sources are exactly zero.
    total_sources = vocals + bass + drums + others + eps
    mask_vocals = vocals / total_sources
    mask_drums = drums / total_sources
    mask_bass = bass / total_sources
    mask_others = 1 - (mask_vocals + mask_drums + mask_bass)

    def _denormalise(masked, rows):
        """Undo min/max normalisation with the target statistics in ``rows``."""
        low = min_feat_tars[rows, :, :]
        high = max_feat_tars[rows, :, :]
        return masked * (high - low) + low

    out_vocals = _denormalise(in_batches * mask_vocals, slice(None, 2))
    out_drums = _denormalise(in_batches * mask_drums, slice(2, 4))
    out_bass = _denormalise(in_batches * mask_bass, slice(4, 6))
    out_others = _denormalise(in_batches * mask_others, slice(6, None))

    out_drums = utils.overlapadd(out_drums, nchunks_in)
    out_bass = utils.overlapadd(out_bass, nchunks_in)
    out_others = utils.overlapadd(out_others, nchunks_in)
    out_vocals = utils.overlapadd(out_vocals, nchunks_in)

    if plot:
        # The figure title is ``file_name`` without its last nine characters.
        title = file_name[:-9]
        _plot_stereo_comparison(1, title, "Drums", drums_stft, out_drums)
        _plot_stereo_comparison(2, title, "Vocals", voc_stft, out_vocals)
        _plot_stereo_comparison(3, title, "Bass", bass_stft, out_bass)
        _plot_stereo_comparison(4, title, "Others", acc_stft, out_others)
        plt.show()

    if synth:
        # Trim each estimate along axis 1 to the mixture phase's length
        # before inverting.
        n_frames = mix_phase.shape[1]
        exports = (
            (out_drums, "_drums.wav"),
            (out_bass, "_bass.wav"),
            (out_vocals, "_vocals.wav"),
            (out_others, "_others.wav"),
        )
        for estimate, suffix in exports:
            utils.inverse_stft_write(estimate[:, :n_frames, :], mix_phase,
                                     config.out_dir + file_name + suffix)
def evalNetwork(file_name='Al James - Schoolboy Facination.stem.mp4', load_name_sep='model6', load_name_dn='dn_model_719', plot=True, synth=False):
    """Separate the vocals of one stem file and pass them through the denoiser.

    The mixture STFT is normalised and run through the separation
    auto-encoder; the vocal soft mask is applied to the normalised mixture,
    de-normalised, and each batch is then fed to the denoiser network.

    Args:
        file_name: Stem file inside ``config.wav_dir_test``.
        load_name_sep: Base name of the separator weights in
            ``config.log_dir`` (``.pt`` is appended).
        load_name_dn: Base name of the denoiser weights in
            ``config.dn_log_dir`` (``.pt`` is appended).
        plot: When true, show denoiser input and output for both channels.
        synth: Unused; kept for interface compatibility.
    """
    autoencoder_audio = AutoEncoder().cuda()
    denoiser = Encoder().cuda()
    # Guards the mask denominator against an all-zero source sum.
    eps = 1e-30

    autoencoder_audio.load_state_dict(
        torch.load(config.log_dir + load_name_sep + '.pt'))
    denoiser.load_state_dict(
        torch.load(config.dn_log_dir + load_name_dn + '.pt'))

    # Load the normalisation statistics and release the file straight away.
    with h5py.File(config.stat_dir + 'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    # First eight rows: target statistics; last two rows: mixture statistics.
    max_feat_tars = max_feat[:8, :].reshape(8, 1, 513)
    min_feat_tars = min_feat[:8, :].reshape(8, 1, 513)
    max_feat_ins = max_feat[-2:, :].reshape(2, 1, 513)
    min_feat_ins = min_feat[-2:, :].reshape(2, 1, 513)

    audio, fs = stempeg.read_stems(
        os.path.join(config.wav_dir_test, file_name),
        stem_id=[0, 1, 2, 3, 4])
    # Only the mixture (stem 0) is needed by this function.
    mixture = audio[0]

    mix_stft, mix_phase = utils.stft_stereo(mixture, phase=True)
    mix_stft = (mix_stft - min_feat_ins) / (max_feat_ins - min_feat_ins)

    in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)

    out_batches = []
    for in_batch in in_batches:
        in_batch = Variable(torch.FloatTensor(in_batch)).cuda()
        out_batch = autoencoder_audio(in_batch)
        out_batches.append(np.array(out_batch.data.cpu().numpy()))
    out_batches = np.array(out_batches)

    # Axis 2 holds the eight output channels: two per source.
    vocals = out_batches[:, :, :2, :, :]
    drums = out_batches[:, :, 2:4, :, :]
    bass = out_batches[:, :, 4:6, :, :]
    others = out_batches[:, :, 6:, :, :]

    # Vocal soft mask: the vocals' share of the summed network output.
    # ``eps`` prevents 0/0 (NaN) where all sources are exactly zero.
    total_sources = vocals + bass + drums + others + eps
    mask_vocals = vocals / total_sources

    out_vocals_2 = in_batches * mask_vocals
    out_vocals_2 = (out_vocals_2
                    * (max_feat_tars[:2, :, :] - min_feat_tars[:2, :, :])
                    + min_feat_tars[:2, :, :])

    out_batches_vocals = []
    for batch_index in range(vocals.shape[0]):
        vocal_batch = Variable(
            torch.FloatTensor(out_vocals_2[batch_index, :, :])).cuda()
        out_batch = denoiser(vocal_batch)
        out_batches_vocals.append(np.array(out_batch.data.cpu().numpy()))

    out_vocals_2 = utils.overlapadd(out_vocals_2, nchunks_in)
    out_vocals = utils.overlapadd(np.array(out_batches_vocals), nchunks_in)

    print(out_vocals.shape)

    if plot:
        plt.figure(1)
        panels = (
            ("Vocals Left Channel Input", out_vocals_2[0]),
            ("Vocals Left Channel Network Output", out_vocals[0]),
            ("Vocals Right Channel Input", out_vocals_2[1]),
            ("Vocals Right Channel Network Output", out_vocals[1]),
        )
        first_axis = None
        for row, (title, spectrogram) in enumerate(panels, 1):
            if first_axis is None:
                axis = plt.subplot(4, 1, row)
                first_axis = axis
            else:
                # All panels share the axes of the first one.
                axis = plt.subplot(4, 1, row,
                                   sharex=first_axis, sharey=first_axis)
            plt.imshow(np.log(spectrogram.T), aspect='auto', origin='lower')
            axis.set_title(title, fontsize=10)
        plt.show()
def train(_):
    """Train the F0 / voiced-unvoiced network and track validation accuracy.

    Builds the TF1 graph (``modules.f0_network`` under the ``First_Model``
    scope), restores the latest checkpoint from ``config.log_dir`` if one
    exists, then alternates one training pass and one validation pass per
    epoch. Every ``config.print_every`` epochs (and on the last epoch) the
    validation F0 tracks are reassembled per file and scored against
    ``config.f0_threshold``; the running list of mean accuracies is saved to
    ``./ikala_eval/accuracies`` after every epoch.

    Args:
        _: Ignored positional argument (presumably the argv list handed over
            by the TF app runner; verify against the caller).

    Returns:
        None. Summaries, checkpoints and the accuracy file are written to
        disk as side effects.
    """
    # Read the normalisation statistics and close the file immediately; the
    # handle previously stayed open for the whole training run.
    with h5py.File(config.stat_dir+'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])
    f0_min = min_feat[-2]
    f0_range = max_feat[-2] - min_feat[-2]

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(
            tf.float32,
            shape=(config.batch_size, config.max_phr_len, config.input_features),
            name='input_placeholder')
        tf.summary.histogram('inputs', input_placeholder)

        target_placeholder = tf.placeholder(
            tf.float32,
            shape=(config.batch_size, config.max_phr_len, 3),
            name='target_placeholder')
        tf.summary.histogram('targets', target_placeholder)

        with tf.variable_scope('First_Model') as scope:
            f0, f0_1, vuv = modules.f0_network(input_placeholder)
            tf.summary.histogram('f0', f0)
            tf.summary.histogram('vuv', vuv)

        # Target layout on the last axis: [-3] first f0 target, [-2] second
        # f0 target, [-1] vuv flag. Both f0 losses are masked by (1 - vuv).
        f0_loss_1 = tf.reduce_sum(tf.abs(f0 - target_placeholder[:,:,-3:-2])*(1-target_placeholder[:,:,-1:]))
        f0_loss_2 = tf.reduce_sum(tf.abs(f0_1 - target_placeholder[:,:,-2:-1])*(1-target_placeholder[:,:,-1:]))
        vuv_loss = tf.reduce_sum(binary_cross(target_placeholder[:,:,-1:],vuv))
        loss = f0_loss_1 + vuv_loss + f0_loss_2

        # Registered for tf.summary.merge_all(); the handles are not needed.
        tf.summary.scalar('f0_loss_1', f0_loss_1)
        tf.summary.scalar('f0_loss_2', f0_loss_2)
        tf.summary.scalar('vuv_loss', vuv_loss)
        tf.summary.scalar('total_loss', loss)

        global_step = tf.Variable(0, name='global_step', trainable=False)
        optimizer = tf.train.AdamOptimizer(learning_rate = config.init_lr)
        train_function = optimizer.minimize(loss, global_step= global_step)

        summary = tf.summary.merge_all()
        init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
        saver = tf.train.Saver(max_to_keep= config.max_models_to_keep)

        sess = tf.Session()
        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s"%ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        train_summary_writer = tf.summary.FileWriter(config.log_dir+'train/', sess.graph)
        val_summary_writer = tf.summary.FileWriter(config.log_dir+'val/', sess.graph)

        # Resume from the epoch implied by the restored optimiser step count.
        start_epoch = int(sess.run(tf.train.get_global_step())/(config.batches_per_epoch_train))
        print("Start from: %d" % start_epoch)

        f0_accs = []

        # `range` instead of `xrange`: behaves the same here and exists on
        # both Python 2 and Python 3.
        for epoch in range(start_epoch, config.num_epochs):
            # Epochs on which per-file F0 accuracy is evaluated.
            eval_epoch = ((epoch + 1) % config.print_every == 0
                          or (epoch + 1) == config.num_epochs)

            val_f0_accs_1 = []
            val_f0_accs_2 = []

            data_generator = data_gen()
            start_time = time.time()

            epoch_loss_f0_1 = 0
            epoch_loss_f0_2 = 0
            epoch_loss_vuv = 0
            epoch_total_loss = 0

            epoch_loss_f0_val_1 = 0
            epoch_loss_f0_val_2 = 0
            epoch_loss_vuv_val = 0
            epoch_total_loss_val = 0

            # NOTE(review): every `config.use_gan` branch in this function
            # refers to d_optimizer, g_optimizer, D_loss_* and G_loss_*,
            # none of which is defined in this function; enabling use_gan
            # will raise NameError unless they exist at module level.
            if config.use_gan:
                epoch_loss_generator_GAN = 0
                epoch_loss_generator_diff = 0
                epoch_loss_discriminator_real = 0
                epoch_loss_discriminator_fake = 0

                val_epoch_loss_generator_GAN = 0
                val_epoch_loss_generator_diff = 0
                val_epoch_loss_discriminator_real = 0
                val_epoch_loss_discriminator_fake = 0

            batch_num = 0
            batch_num_val = 0

            val_generator = data_gen(mode='val')

            with tf.variable_scope('Training'):

                for voc, feat in data_generator:
                    feed = {input_placeholder: voc, target_placeholder: feat}

                    _, step_loss_f0_1, step_loss_f0_2, step_loss_vuv, step_total_loss = sess.run(
                        [train_function, f0_loss_1, f0_loss_2, vuv_loss, loss],
                        feed_dict=feed)

                    if config.use_gan:
                        _, step_dis_loss_real, step_dis_loss_fake = sess.run(
                            [d_optimizer, D_loss_real, D_loss_fake], feed_dict=feed)
                        _, step_gen_loss_GAN, step_gen_loss_diff = sess.run(
                            [g_optimizer, G_loss_GAN, G_loss_diff], feed_dict=feed)

                    epoch_loss_f0_1+=step_loss_f0_1
                    epoch_loss_f0_2+=step_loss_f0_2
                    epoch_loss_vuv+=step_loss_vuv
                    epoch_total_loss+=step_total_loss

                    if config.use_gan:
                        epoch_loss_generator_GAN+=step_gen_loss_GAN
                        epoch_loss_generator_diff+=step_gen_loss_diff
                        epoch_loss_discriminator_real+=step_dis_loss_real
                        epoch_loss_discriminator_fake+=step_dis_loss_fake

                    utils.progress(batch_num,config.batches_per_epoch_train, suffix = 'training done')
                    batch_num+=1

                # Turn the summed losses into per-frame averages.
                train_frames = config.batches_per_epoch_train * config.batch_size * config.max_phr_len
                epoch_loss_f0_1 = epoch_loss_f0_1/train_frames
                epoch_loss_f0_2 = epoch_loss_f0_2/train_frames
                epoch_loss_vuv = epoch_loss_vuv/train_frames
                epoch_total_loss = epoch_total_loss/(train_frames*3)

                if config.use_gan:
                    epoch_loss_generator_GAN = epoch_loss_generator_GAN/(config.batches_per_epoch_train *config.batch_size)
                    epoch_loss_generator_diff = epoch_loss_generator_diff/(config.batches_per_epoch_train *config.batch_size*config.max_phr_len*60)
                    epoch_loss_discriminator_real = epoch_loss_discriminator_real/(config.batches_per_epoch_train *config.batch_size)
                    epoch_loss_discriminator_fake = epoch_loss_discriminator_fake/(config.batches_per_epoch_train *config.batch_size)

                # Summaries are computed on the last training batch only.
                summary_str = sess.run(summary, feed_dict={input_placeholder: voc,target_placeholder: feat})
                train_summary_writer.add_summary(summary_str, epoch)
                train_summary_writer.flush()

            with tf.variable_scope('Validation'):

                for voc, feat, nchunks_in, lent, county, max_count in val_generator:
                    feed = {input_placeholder: voc, target_placeholder: feat}

                    if eval_epoch:
                        # county == 1 marks the first batch of a file: start
                        # fresh per-file buffers.
                        if county == 1:
                            f0_gt = []
                            vuv_gt = []
                            f0_output_1 = []
                            f0_output_2 = []

                        f0_op_1, f0_op_2 = sess.run([f0, f0_1], feed_dict=feed)
                        f0_output_1.append(f0_op_1)
                        f0_output_2.append(f0_op_2)
                        f0_gt.append(feat[:,:,-2:-1])
                        vuv_gt.append(feat[:,:,-1:])

                        # county == max_count marks the last batch of a file:
                        # stitch the batches together and score the file.
                        if county == max_count:
                            f0_output_1 = utils.overlapadd(np.array(f0_output_1), nchunks_in)
                            f0_output_2 = utils.overlapadd(np.array(f0_output_2), nchunks_in)
                            f0_gt = utils.overlapadd(np.array(f0_gt), nchunks_in)
                            vuv_gt = utils.overlapadd(np.array(vuv_gt), nchunks_in)

                            f0_output_1 = f0_output_1[:lent]
                            f0_output_2 = f0_output_2[:lent]
                            f0_gt = f0_gt[:lent]
                            vuv_gt = vuv_gt[:lent]

                            # Min/max de-normalisation is x*(max-min)+min. The
                            # previous expression x*((max-min)+min) collapsed
                            # to x*max and dropped the offset, distorting the
                            # distances compared with config.f0_threshold.
                            f0_mask = 1 - vuv_gt
                            f0_output_1 = (f0_output_1*f0_range + f0_min)*f0_mask
                            f0_output_2 = (f0_output_2*f0_range + f0_min)*f0_mask
                            f0_gt = (f0_gt*f0_range + f0_min)*f0_mask

                            f0_difference_1 = np.nan_to_num(abs(f0_gt-f0_output_1))
                            f0_greater_1 = np.where(f0_difference_1>config.f0_threshold)
                            diff_per_1 = f0_greater_1[0].shape[0]/len(f0_output_1)
                            val_f0_accs_1.append(1 - diff_per_1)

                            f0_difference_2 = np.nan_to_num(abs(f0_gt-f0_output_2))
                            f0_greater_2 = np.where(f0_difference_2>config.f0_threshold)
                            diff_per_2 = f0_greater_2[0].shape[0]/len(f0_output_2)
                            val_f0_accs_2.append(1 - diff_per_2)

                    # One forward pass yields all four validation losses
                    # (previously four separate runs over the same batch).
                    step_loss_f0_val_1, step_loss_f0_val_2, step_loss_vuv_val, step_total_loss_val = sess.run(
                        [f0_loss_1, f0_loss_2, vuv_loss, loss], feed_dict=feed)

                    if config.use_gan:
                        step_gen_loss_GAN, step_gen_loss_diff = sess.run(
                            [G_loss_GAN, G_loss_diff], feed_dict=feed)
                        step_dis_loss_real, step_dis_loss_fake = sess.run(
                            [D_loss_real, D_loss_fake], feed_dict=feed)

                    epoch_loss_f0_val_1+=step_loss_f0_val_1
                    epoch_loss_f0_val_2+=step_loss_f0_val_2
                    epoch_loss_vuv_val+=step_loss_vuv_val
                    epoch_total_loss_val+=step_total_loss_val

                    if config.use_gan:
                        val_epoch_loss_generator_GAN += step_gen_loss_GAN
                        val_epoch_loss_generator_diff += step_gen_loss_diff
                        val_epoch_loss_discriminator_real += step_dis_loss_real
                        val_epoch_loss_discriminator_fake += step_dis_loss_fake

                    utils.progress(batch_num_val,config.batches_per_epoch_val, suffix = 'validiation done')
                    batch_num_val+=1

                if eval_epoch:
                    f0_accs.append(np.mean(val_f0_accs_2))

                val_frames = batch_num_val * config.batch_size * config.max_phr_len
                epoch_loss_f0_val_1 = epoch_loss_f0_val_1/val_frames
                epoch_loss_f0_val_2 = epoch_loss_f0_val_2/val_frames
                epoch_loss_vuv_val = epoch_loss_vuv_val/val_frames
                # Three loss terms, as in training (the divisor used to be a
                # stale 66; the value is not read anywhere below).
                epoch_total_loss_val = epoch_total_loss_val/(val_frames*3)

                if config.use_gan:
                    val_epoch_loss_generator_GAN = val_epoch_loss_generator_GAN/(config.batches_per_epoch_val *config.batch_size)
                    val_epoch_loss_generator_diff = val_epoch_loss_generator_diff/(config.batches_per_epoch_val *config.batch_size*config.max_phr_len*60)
                    val_epoch_loss_discriminator_real = val_epoch_loss_discriminator_real/(config.batches_per_epoch_val *config.batch_size)
                    val_epoch_loss_discriminator_fake = val_epoch_loss_discriminator_fake/(config.batches_per_epoch_val *config.batch_size)

                # Summaries are computed on the last validation batch only.
                summary_str = sess.run(summary, feed_dict={input_placeholder: voc,target_placeholder: feat})
                val_summary_writer.add_summary(summary_str, epoch)
                val_summary_writer.flush()

            duration = time.time() - start_time

            np.save('./ikala_eval/accuracies', f0_accs)

            if (epoch+1) % config.print_every == 0:
                print('epoch %d: F0 Training Loss = %.10f (%.3f sec)' % (epoch+1, epoch_loss_f0_1, duration))
                print(' : VUV Training Loss = %.10f ' % (epoch_loss_vuv))
                if config.use_gan:
                    print(' : Gen GAN Training Loss = %.10f ' % (epoch_loss_generator_GAN))
                    print(' : Gen diff Training Loss = %.10f ' % (epoch_loss_generator_diff))
                    print(' : Discriminator Training Loss Real = %.10f ' % (epoch_loss_discriminator_real))
                    print(' : Discriminator Training Loss Fake = %.10f ' % (epoch_loss_discriminator_fake))
                print(' : F0 Validation Loss_1 = %.10f ' % (epoch_loss_f0_val_1))
                print(' : F0 Validation Loss_2 = %.10f ' % (epoch_loss_f0_val_2))
                print(' : VUV Validation Loss = %.10f ' % (epoch_loss_vuv_val))
                # NOTE(review): this accuracy report is kept inside the
                # print_every branch together with the other prints; confirm
                # whether it should also fire on a final epoch that is not a
                # multiple of print_every.
                if eval_epoch:
                    print(' : Mean F0 IKala Accuracy_1 = %.10f ' % (np.mean(val_f0_accs_1)))
                    print(' : Mean F0 IKala Accuracy_2 = %.10f ' % (np.mean(val_f0_accs_2)))
                if config.use_gan:
                    print(' : Gen GAN Validation Loss = %.10f ' % (val_epoch_loss_generator_GAN))
                    print(' : Gen diff Validation Loss = %.10f ' % (val_epoch_loss_generator_diff))
                    print(' : Discriminator Validation Loss Real = %.10f ' % (val_epoch_loss_discriminator_real))
                    print(' : Discriminator Validation Loss Fake = %.10f ' % (val_epoch_loss_discriminator_fake))

            if (epoch + 1) % config.save_every == 0 or (epoch + 1) == config.num_epochs:
                checkpoint_file = os.path.join(config.log_dir, 'model.ckpt')
                saver.save(sess, checkpoint_file, global_step=epoch)
def process_file(self, voc_stft, voc_stft_singer, sess):
    """Run the model over a content/singer spectrogram pair.

    Both inputs are cut to their common length, split into overlapping
    batches and clipped to [0, 1]. For every batch the F0 and phoneme
    probabilities are estimated from the content input first and then fed
    back, together with the singer input, to produce the output features.

    Args:
        voc_stft: Input fed to ``self.input_placeholder``.
        voc_stft_singer: Input fed to ``self.input_placeholder_singer``.
        sess: Session used to evaluate the graph tensors held on ``self``.

    Returns:
        The overlap-added network output, de-normalised with all but the
        last entry of the stored feature minima/maxima.
    """
    # `with` guarantees the statistics file is closed even if a read fails.
    with h5py.File(config.stat_dir+'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    # Trim the longer input so both cover the same number of frames.
    common_len = min(len(voc_stft), len(voc_stft_singer))
    voc_stft = voc_stft[:common_len]
    voc_stft_singer = voc_stft_singer[:common_len]

    in_batches_stft, nchunks_in = utils.generate_overlapadd(voc_stft)
    in_batches_stft_singer, _ = utils.generate_overlapadd(voc_stft_singer)

    in_batches_stft = np.clip(in_batches_stft, 0.0, 1.0)
    in_batches_stft_singer = np.clip(in_batches_stft_singer, 0.0, 1.0)

    out_batches_feats = []
    for in_batch_stft, in_batch_stft_singer in zip(in_batches_stft, in_batches_stft_singer):
        # Pass 1: estimate F0 and phoneme probabilities from the content.
        feed_dict = {self.input_placeholder: in_batch_stft, self.is_train: False}
        f0_est, pho_est = sess.run([self.f0_probs, self.pho_probs], feed_dict=feed_dict)

        # Pass 2: feed those estimates back in along with the singer input.
        feed_dict = {self.input_placeholder: in_batch_stft,
                     self.input_placeholder_singer: in_batch_stft_singer,
                     self.f0_onehot_labels: f0_est,
                     self.phone_onehot_labels: pho_est,
                     self.is_train: False}
        out_feats = sess.run(self.output, feed_dict=feed_dict)

        out_batches_feats.append(out_feats)

    out_batches_feats = np.array(out_batches_feats)
    out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in)

    # Undo the min/max normalisation (the last statistics entry is excluded).
    out_batches_feats = out_batches_feats*(max_feat[:-1] - min_feat[:-1]) + min_feat[:-1]

    return out_batches_feats
def extract_part_from_file(self, file_name, part, sess):
    """Predict the features of one choir part from a mixture and synthesise it.

    Loads the mixture input via ``self.read_input_file``, loads the stored
    features of the requested part, runs the network batch-wise, appends the
    last two stored feature columns to the prediction, plots stored versus
    predicted features and writes ``extracted.wav``.

    Args:
        file_name: Name of the mixture file; the text before the first ``_``
            is the song name and the character at index ``-(9 - part)`` is
            the track identifier of the requested part.
        part: Index into ``['_soprano_', '_alto_', '_bass_', '_tenor_']``.
        sess: Session used to evaluate ``self.output_logits``.

    Returns:
        None.
    """
    parts = ['_soprano_', '_alto_', '_bass_', '_tenor_']

    cqt = self.read_input_file(file_name)

    song_name = file_name.split('_')[0]
    voc_num = 9 - part
    voc_part = parts[part]
    voc_track = file_name[-voc_num]

    # Read both datasets and close the file (it was left open before).
    voc_feats_path = (config.voc_feats_dir + song_name + voc_part +
                      voc_track + '.wav.hdf5')
    with h5py.File(voc_feats_path, 'r') as voc_feat_file:
        voc_feats = voc_feat_file["voc_feats"][()]
        atb = voc_feat_file['atb'][()]

    # Zero only the NaN entries. The former
    # `voc_feats[np.argwhere(np.isnan(voc_feats))] = 0.0` used (row, col)
    # index pairs as row indices and therefore wiped whole rows, including
    # rows whose index merely equalled the column of a NaN.
    voc_feats[np.isnan(voc_feats)] = 0.0

    atb = atb[:, 1:]
    atb[:, 0:4] = 0
    atb = np.clip(atb, 0.0, 1.0)

    max_len = min(len(voc_feats), len(cqt))
    voc_feats = voc_feats[:max_len]
    cqt = cqt[:max_len]
    atb = atb[:max_len]

    in_batches_cqt, nchunks_in = utils.generate_overlapadd(cqt)
    in_batches_atb, nchunks_in = utils.generate_overlapadd(atb)

    out_batches_feats = []
    for in_batch_cqt, in_batch_atb in zip(in_batches_cqt, in_batches_atb):
        feed_dict = {
            self.input_placeholder: in_batch_cqt,
            self.f0_placeholder: in_batch_atb,
            self.is_train: False
        }
        out_feat = sess.run(self.output_logits, feed_dict=feed_dict)
        out_batches_feats.append(out_feat)

    out_batches_feats = np.array(out_batches_feats)
    out_feats = utils.overlapadd(
        out_batches_feats.reshape(out_batches_feats.shape[0],
                                  config.batch_size, config.max_phr_len, -1),
        nchunks_in)

    # NOTE(review): max_feat / min_feat are not defined in this method; they
    # must be available at module level, otherwise this raises NameError.
    out_feats = out_feats * (max_feat - min_feat) + min_feat
    out_feats = out_feats[:max_len]

    # The last two stored feature columns are copied onto the prediction.
    out_feats = np.concatenate((out_feats, voc_feats[:, -2:]), axis=-1)

    plt.figure(1)
    plt.subplot(211)
    plt.imshow(voc_feats.T, origin='lower', aspect='auto')
    plt.subplot(212)
    plt.imshow(out_feats.T, origin='lower', aspect='auto')
    plt.show()

    # The trailing pdb breakpoint was removed so the call can finish
    # unattended.
    sig_process.feats_to_audio(out_feats, 'extracted.wav')
def synth_file(file_name,
               file_path=config.wav_dir,
               show_plots=True,
               save_file=True):
    """Predict vocoder features from a mixture, plot them and synthesise audio.

    The dataset is selected by a prefix of ``file_name`` (``ikala``, ``mir``
    or ``med``); the prefix is stripped and the matching directory from
    ``config`` is used. Any other name is read from the current directory.

    Args:
        file_name: File to process, optionally carrying a dataset prefix.
        file_path: NOTE(review): every branch below overwrites this value,
            so the argument currently has no effect.
        show_plots: Plot predicted versus ground-truth features and the F0
            accuracy against ``config.f0_threshold``.
        save_file: Synthesise two files through ``utils.feats_to_audio``,
            one with the predicted F0 and one with the ground-truth F0/VUV
            columns.

    Returns:
        None.
    """
    if file_name.startswith('ikala'):
        file_name = file_name[6:]
        file_path = config.wav_dir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
        mode = 0
    elif file_name.startswith('mir'):
        file_name = file_name[4:]
        file_path = config.wav_dir_mir
        utils.write_ori_ikala(os.path.join(file_path, file_name), file_name)
        mode = 0
    elif file_name.startswith('med'):
        file_name = file_name[4:]
        file_path = config.wav_dir_med
        utils.write_ori_med(os.path.join(file_path, file_name), file_name)
        mode = 2
    else:
        mode = 1
        # NOTE(review): assumed to belong to this branch only - confirm.
        file_path = './'

    # Read the statistics and close the file (it was left open before).
    with h5py.File(config.stat_dir + 'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])
        max_voc = np.array(stat_file["voc_stft_maximus"])
        max_back = np.array(stat_file["back_stft_maximus"])
    max_mix = np.array(max_voc) + np.array(max_back)

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len,
                                                  config.input_features),
                                           name='input_placeholder')

        with tf.variable_scope('First_Model') as scope:
            harm, ap, f0, vuv = modules.nr_wavenet(input_placeholder)

        if config.use_gan:
            with tf.variable_scope('Generator') as scope:
                gen_op = modules.GAN_generator(harm)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()
        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir_m1)
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        mix_stft = utils.file_to_stft(os.path.join(file_path, file_name),
                                      mode=mode)
        targs = utils.input_to_feats(os.path.join(file_path, file_name),
                                     mode=mode)

        # (Two interactive pdb breakpoints used to sit in this section; they
        # were removed so the function can run unattended.)

        in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)
        in_batches = in_batches / max_mix

        val_outer = []
        gan_op = []

        for in_batch in in_batches:
            val_harm, val_ap, val_f0, val_vuv = sess.run(
                [harm, ap, f0, vuv], feed_dict={input_placeholder: in_batch})
            if config.use_gan:
                val_op = sess.run(gen_op,
                                  feed_dict={input_placeholder: in_batch})
                gan_op.append(val_op)
            val_outs = np.concatenate((val_harm, val_ap, val_f0, val_vuv),
                                      axis=-1)
            val_outer.append(val_outs)

        val_outer = np.array(val_outer)
        val_outer = utils.overlapadd(val_outer, nchunks_in)
        # Last column is the voiced/unvoiced flag: snap it to 0 or 1.
        val_outer[:, -1] = np.round(val_outer[:, -1])
        val_outer = val_outer[:targs.shape[0], :]
        val_outer = np.clip(val_outer, 0.0, 1.0)

        if config.use_gan:
            gan_op = np.array(gan_op)
            gan_op = utils.overlapadd(gan_op, nchunks_in)

        # Bring the ground truth into the same [0, 1] range as the output.
        targs = (targs - min_feat) / (max_feat - min_feat)

        if show_plots:
            ins = val_outer[:, :60]
            outs = targs[:, :60]

            plt.figure(1)
            ax1 = plt.subplot(211)
            plt.imshow(ins.T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Harm ", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(outs.T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Harm ", fontsize=10)

            if config.use_gan:
                plt.figure(5)
                ax1 = plt.subplot(411)
                plt.imshow(ins.T, origin='lower', aspect='auto')
                ax1.set_title("Predicted Harm ", fontsize=10)
                ax2 = plt.subplot(414)
                plt.imshow(outs.T, origin='lower', aspect='auto')
                ax2.set_title("Ground Truth Harm ", fontsize=10)
                ax1 = plt.subplot(412)
                plt.imshow(gan_op.T, origin='lower', aspect='auto')
                ax1.set_title("GAN output ", fontsize=10)
                ax1 = plt.subplot(413)
                plt.imshow((gan_op[:ins.shape[0], :] + ins).T,
                           origin='lower',
                           aspect='auto')
                ax1.set_title("GAN output ", fontsize=10)

            plt.figure(2)
            ax1 = plt.subplot(211)
            plt.imshow(val_outer[:, 60:-2].T, origin='lower', aspect='auto')
            ax1.set_title("Predicted Aperiodic Part", fontsize=10)
            ax2 = plt.subplot(212)
            plt.imshow(targs[:, 60:-2].T, origin='lower', aspect='auto')
            ax2.set_title("Ground Truth Aperiodic Part", fontsize=10)

            plt.figure(3)
            # Min/max de-normalisation is x*(max-min)+min. The previous
            # x*((max-min)+min) reduced to x*max and dropped the offset,
            # which distorted the threshold-based accuracy below.
            f0_min = min_feat[-2]
            f0_range = max_feat[-2] - min_feat[-2]

            f0_output = val_outer[:, -2] * f0_range + f0_min
            f0_output = f0_output * (1 - targs[:, -1])
            f0_output[f0_output == 0] = np.nan  # hide masked frames in plot
            plt.plot(f0_output, label="Predicted Value")

            f0_gt = targs[:, -2] * f0_range + f0_min
            f0_gt = f0_gt * (1 - targs[:, -1])
            f0_gt[f0_gt == 0] = np.nan
            plt.plot(f0_gt, label="Ground Truth")

            f0_difference = np.nan_to_num(abs(f0_gt - f0_output))
            f0_greater = np.where(f0_difference > config.f0_threshold)
            diff_per = f0_greater[0].shape[0] / len(f0_output)
            plt.suptitle("Percentage correct = " +
                         '{:.3%}'.format(1 - diff_per))
            plt.legend()

            plt.figure(4)
            ax1 = plt.subplot(211)
            plt.plot(val_outer[:, -1])
            ax1.set_title("Predicted Voiced/Unvoiced", fontsize=10)
            ax2 = plt.subplot(212)
            plt.plot(targs[:, -1])
            ax2.set_title("Ground Truth Voiced/Unvoiced", fontsize=10)
            plt.show()

        if save_file:
            val_outer = np.ascontiguousarray(val_outer *
                                             (max_feat - min_feat) + min_feat)
            targs = np.ascontiguousarray(targs * (max_feat - min_feat) +
                                         min_feat)

            # Synthesis stays best-effort, but only ordinary errors are
            # caught (a bare `except:` also swallowed KeyboardInterrupt) and
            # the cause is reported instead of being discarded.
            try:
                utils.feats_to_audio(val_outer,
                                     file_name[:-4] + '_synth_pred_f0')
                print("File saved to %s" % config.val_dir + file_name[:-4] +
                      '_synth_pred_f0.wav')
            except Exception as err:
                print("Couldn't synthesize with predicted f0")
                print(err)

            try:
                # Swap in the ground-truth F0 and voiced/unvoiced columns.
                val_outer[:, -2:] = targs[:, -2:]
                utils.feats_to_audio(val_outer,
                                     file_name[:-4] + '_synth_ori_f0')
                print("File saved to %s" % config.val_dir + file_name[:-4] +
                      '_synth_ori_f0.wav')
            except Exception as err:
                print("Couldn't synthesize with original f0")
                print(err)
def synth_file(file_name="nus_MCUR_sing_10.hdf5",
               singer_index=0,
               file_path=config.wav_dir,
               show_plots=True,
               save_file="GBO"):
    """Re-synthesise a stored recording with a chosen singer identity.

    Builds the phoneme model, the final (cross-entropy) model, the GAN
    generator and the discriminator, restores the latest checkpoint from
    ``config.log_dir``, and runs the stored F0 and phoneme tracks of
    ``file_name`` through the networks. The F0 column is lowered by 12
    before normalisation.

    Args:
        file_name: HDF5 file inside ``config.voice_dir`` holding ``feats``
            and ``phonemes`` datasets.
        singer_index: Singer id fed to the singer placeholder for every
            item in the batch.
        file_path: Not used by this function.
        show_plots: Plot ground-truth and generated features before asking
            which files to synthesise.
        save_file: NOTE(review): this value is always replaced by the
            answer typed at the prompt, so the argument has no effect.

    Returns:
        None. Audio is written through ``utils.feats_to_audio``.
    """
    num_phos = 42  # must match the last dimension of phoneme_labels_2

    # Read the statistics and close the file (it was left open before).
    with h5py.File(config.stat_dir + 'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    with tf.Graph().as_default():

        input_placeholder = tf.placeholder(tf.float32,
                                           shape=(config.batch_size,
                                                  config.max_phr_len, 66),
                                           name='input_placeholder')
        tf.summary.histogram('inputs', input_placeholder)

        output_placeholder = tf.placeholder(tf.float32,
                                            shape=(config.batch_size,
                                                   config.max_phr_len, 64),
                                            name='output_placeholder')

        f0_input_placeholder = tf.placeholder(tf.float32,
                                              shape=(config.batch_size,
                                                     config.max_phr_len, 1),
                                              name='f0_input_placeholder')

        rand_input_placeholder = tf.placeholder(tf.float32,
                                                shape=(config.batch_size,
                                                       config.max_phr_len, 4),
                                                name='rand_input_placeholder')

        prob = tf.placeholder_with_default(1.0, shape=())

        phoneme_labels = tf.placeholder(tf.int32,
                                        shape=(config.batch_size,
                                               config.max_phr_len),
                                        name='phoneme_placeholder')
        phone_onehot_labels = tf.one_hot(indices=tf.cast(
            phoneme_labels, tf.int32), depth=42)

        phoneme_labels_2 = tf.placeholder(tf.float32,
                                          shape=(config.batch_size,
                                                 config.max_phr_len, 42),
                                          name='phoneme_placeholder_1')

        singer_labels = tf.placeholder(tf.float32,
                                       shape=(config.batch_size),
                                       name='singer_placeholder')
        singer_onehot_labels = tf.one_hot(indices=tf.cast(
            singer_labels, tf.int32), depth=12)

        with tf.variable_scope('phone_Model') as scope:
            pho_logits = modules.phone_network(input_placeholder)
            pho_classes = tf.argmax(pho_logits, axis=-1)
            pho_probs = tf.nn.softmax(pho_logits)

        with tf.variable_scope('Final_Model') as scope:
            voc_output = modules.final_net(singer_onehot_labels,
                                           f0_input_placeholder,
                                           phoneme_labels_2)
            voc_output_decoded = tf.nn.sigmoid(voc_output)
            # Second instance shares weights but is driven by the predicted
            # phoneme probabilities instead of the fed labels.
            scope.reuse_variables()
            voc_output_3 = modules.final_net(singer_onehot_labels,
                                             f0_input_placeholder, pho_probs)
            voc_output_3_decoded = tf.nn.sigmoid(voc_output_3)

        with tf.variable_scope('Generator') as scope:
            voc_output_2 = modules.GAN_generator(singer_onehot_labels,
                                                 phoneme_labels_2,
                                                 f0_input_placeholder,
                                                 rand_input_placeholder)

        with tf.variable_scope('Discriminator') as scope:
            D_fake = modules.GAN_discriminator(voc_output_2,
                                               singer_onehot_labels,
                                               phone_onehot_labels,
                                               f0_input_placeholder)

        saver = tf.train.Saver(max_to_keep=config.max_models_to_keep)

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess = tf.Session()
        sess.run(init_op)

        ckpt = tf.train.get_checkpoint_state(config.log_dir)
        if ckpt and ckpt.model_checkpoint_path:
            print("Using the model in %s" % ckpt.model_checkpoint_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        # Load everything needed from the voice file, then close it.
        with h5py.File(config.voice_dir + file_name, "r") as voc_file:
            feats = np.array(voc_file['feats'])
            pho_target = np.array(voc_file["phonemes"])

        # `f0` is a view into `feats`, so filling the zero frames with the
        # median of the non-zero values also updates `feats`.
        f0 = feats[:, -2]
        med = np.median(f0[f0 > 0])
        f0[f0 == 0] = med

        f0 = f0 - 12
        feats[:, -2] = feats[:, -2] - 12

        f0_nor = (f0 - min_feat[-2]) / (max_feat[-2] - min_feat[-2])
        feats = (feats - min_feat) / (max_feat - min_feat)

        in_batches_f0, nchunks_in = utils.generate_overlapadd(
            f0_nor.reshape(-1, 1))
        in_batches_pho, nchunks_in_pho = utils.generate_overlapadd(
            pho_target.reshape(-1, 1))
        in_batches_feat, kaka = utils.generate_overlapadd(feats)

        out_batches_feats = []
        out_batches_feats_1 = []
        out_batches_feats_gan = []

        for in_batch_f0, in_batch_pho_target, in_batch_feat in zip(
                in_batches_f0, in_batches_pho, in_batches_feat):

            in_batch_f0 = in_batch_f0.reshape(
                [config.batch_size, config.max_phr_len, 1])
            in_batch_pho_target = in_batch_pho_target.reshape(
                [config.batch_size, config.max_phr_len])

            # phoneme_labels_2 is declared as (batch, frames, 42) but the
            # data holds one phoneme index per frame; feeding the (batch,
            # frames) index array directly is rejected by Session.run.
            # One-hot encode it here (all-zero row for out-of-range ids,
            # like tf.one_hot).
            # NOTE(review): confirm one-hot labels are the intended input.
            in_batch_pho_onehot = (
                in_batch_pho_target[..., np.newaxis] ==
                np.arange(num_phos)).astype(np.float32)

            # Batch dimension follows config.batch_size (it was hard-coded
            # to 30, which only works when the configured size is 30).
            output_feats, output_feats_1, output_feats_gan = sess.run(
                [voc_output_decoded, voc_output_3_decoded, voc_output_2],
                feed_dict={
                    input_placeholder: in_batch_feat,
                    f0_input_placeholder: in_batch_f0,
                    phoneme_labels_2: in_batch_pho_onehot,
                    singer_labels: np.ones(config.batch_size) * singer_index,
                    rand_input_placeholder: np.random.normal(
                        -1.0,
                        1.0,
                        size=[config.batch_size, config.max_phr_len, 4])
                })

            out_batches_feats.append(output_feats)
            out_batches_feats_1.append(output_feats_1)
            # Generator output is shifted from [-1, 1] to [0, 1].
            out_batches_feats_gan.append(output_feats_gan / 2 + 0.5)

        out_batches_feats = np.array(out_batches_feats)
        out_batches_feats = utils.overlapadd(out_batches_feats, nchunks_in)

        out_batches_feats_1 = np.array(out_batches_feats_1)
        out_batches_feats_1 = utils.overlapadd(out_batches_feats_1,
                                               nchunks_in)

        out_batches_feats_gan = np.array(out_batches_feats_gan)
        out_batches_feats_gan = utils.overlapadd(out_batches_feats_gan,
                                                 nchunks_in)

        # Undo the min/max normalisation.
        feats = feats * (max_feat - min_feat) + min_feat
        out_batches_feats = out_batches_feats * (
            max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]
        out_batches_feats_1 = out_batches_feats_1 * (
            max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]
        out_batches_feats_gan = out_batches_feats_gan * (
            max_feat[:-2] - min_feat[:-2]) + min_feat[:-2]

        out_batches_feats = out_batches_feats[:len(feats)]
        out_batches_feats_1 = out_batches_feats_1[:len(feats)]
        out_batches_feats_gan = out_batches_feats_gan[:len(feats)]

        # Re-attach the last two input feature columns to every output.
        first_op = np.concatenate([out_batches_feats, feats[:, -2:]],
                                  axis=-1)
        pho_op = np.concatenate([out_batches_feats_1, feats[:, -2:]],
                                axis=-1)
        gan_op = np.concatenate([out_batches_feats_gan, feats[:, -2:]],
                                axis=-1)

        gan_op = np.ascontiguousarray(gan_op)
        pho_op = np.ascontiguousarray(pho_op)
        first_op = np.ascontiguousarray(first_op)

        if show_plots:
            plt.figure(1)
            ax1 = plt.subplot(311)
            plt.imshow(feats[:, :60].T, aspect='auto', origin='lower')
            ax1.set_title("Ground Truth Vocoder Features", fontsize=10)
            ax2 = plt.subplot(312, sharex=ax1, sharey=ax1)
            plt.imshow(out_batches_feats[:, :60].T,
                       aspect='auto',
                       origin='lower')
            ax2.set_title("Cross Entropy Output Vocoder Features",
                          fontsize=10)
            ax3 = plt.subplot(313, sharex=ax1, sharey=ax1)
            ax3.set_title("GAN Vocoder Output Features", fontsize=10)
            plt.imshow(out_batches_feats_gan[:, :60].T,
                       aspect='auto',
                       origin='lower')

            plt.figure(2)
            plt.subplot(211)
            plt.imshow(feats[:, 60:-2].T, aspect='auto', origin='lower')
            plt.subplot(212)
            plt.imshow(out_batches_feats[:, -4:].T,
                       aspect='auto',
                       origin='lower')
            plt.show()

            save_file = input(
                "Which files to synthesise G for GAN, B for Binary Entropy, "
                "O for original, or any combination. Default is None").upper(
                ) or "N"
        else:
            save_file = input(
                "Which files to synthesise G for GAN, B for Binary Entropy, "
                "O for original, or any combination. Default is all (GBO)"
            ).upper() or "GBO"

        if "G" in save_file:
            utils.feats_to_audio(gan_op[:, :], file_name[:-4] + 'gan_op.wav')
            print("GAN file saved to {}".format(
                os.path.join(config.val_dir, file_name[:-4] + 'gan_op.wav')))

        if "O" in save_file:
            utils.feats_to_audio(feats[:, :], file_name[:-4] + 'ori_op.wav')
            print("Originl file, resynthesized via WORLD vocoder saved to {}".
                  format(
                      os.path.join(config.val_dir,
                                   file_name[:-4] + 'ori_op.wav')))

        # The "B" option is offered by the prompt, but its synthesis call
        # was commented out while the "saved" message still printed. The
        # branch is restored so the message is only shown after the file
        # has actually been written.
        if "B" in save_file:
            utils.feats_to_audio(first_op[:, :], file_name[:-4] + 'bce_op.wav')
            print("Binar cross entropy file saved to {}".format(
                os.path.join(config.val_dir, file_name[:-4] + 'bce_op.wav')))
def evalNets(
        pcs_model='model_e8000_b50_bs5_3429',
        file_to_eval="None",
        path='/home/pc2752/share/JoanMaster/PytorchConvSep/data_h5py_test'):
    """Evaluate a trained separation network with BSS-eval image metrics.

    Loads the weights ``config.log_dir + pcs_model + '.pt'`` into an
    ``AutoEncoder`` on the GPU, draws 50 ``.stem.mp4`` files (with
    replacement) from ``config.wav_dir_test``, separates each mixture into
    vocals, drums, bass and others through soft masks built from the network
    output, and scores a random excerpt of ``44100 * 6`` samples with
    ``mir_eval.separation.bss_eval_images``.

    The accumulated SDR/SAR/SIR/ISR lists are written to ``config.err_dir``
    after every file whose SDR contains no NaN.

    Args:
        pcs_model: Base name (without ``.pt``) of the checkpoint to load.
        file_to_eval: Unused; kept for backward compatibility.
        path: Unused; kept for backward compatibility.

    Returns:
        None. Results are printed and saved as ``.npy`` files.
    """
    autoencoder_audio = AutoEncoder().cuda()
    autoencoder_audio.load_state_dict(
        torch.load(config.log_dir + pcs_model + '.pt'))

    # np.array() copies the datasets into memory, so the file can (and
    # should) be closed as soon as the statistics have been read.
    with h5py.File(config.stat_dir + 'stats.hdf5', mode='r') as stat_file:
        max_feat = np.array(stat_file["feats_maximus"])
        min_feat = np.array(stat_file["feats_minimus"])

    # First 8 rows: target (per-source, stereo) statistics; last 2 rows:
    # input (mixture, stereo) statistics.
    max_feat_tars = max_feat[:8, :].reshape(8, 1, 513)
    min_feat_tars = min_feat[:8, :].reshape(8, 1, 513)
    max_feat_ins = max_feat[-2:, :].reshape(2, 1, 513)
    min_feat_ins = min_feat[-2:, :].reshape(2, 1, 513)

    wav_files = [
        x for x in os.listdir(config.wav_dir_test)
        if x.endswith('.stem.mp4') and not x.startswith(".")
    ]
    random_files = [random.choice(wav_files) for _ in range(50)]

    file_length = int(44100 * 6)

    # (name, index on the stem axis of the decoded file, channel pair in the
    # network output / target statistics). This order is used for both the
    # references and the estimates handed to bss_eval_images.
    stems = (
        ('vocals', 4, slice(0, 2)),
        ('drums', 1, slice(2, 4)),
        ('bass', 2, slice(4, 6)),
        ('others', 3, slice(6, None)),
    )

    SDR_error = []
    SIR_error = []
    SAR_error = []
    ISR_error = []

    for file_name in random_files:
        audio, _ = stempeg.read_stems(
            os.path.join(config.wav_dir_test, file_name),
            stem_id=[0, 1, 2, 3, 4])

        # audio[0] is the mixture; only its STFT is needed for inference.
        mix_stft, mix_phase = utils.stft_stereo(audio[0], phase=True)
        mix_stft = (mix_stft - min_feat_ins) / (max_feat_ins - min_feat_ins)

        in_batches, nchunks_in = utils.generate_overlapadd(mix_stft)

        out_batches = []
        for in_batch in in_batches:
            in_batch = Variable(torch.FloatTensor(in_batch)).cuda()
            out_batch = autoencoder_audio(in_batch)
            out_batches.append(np.array(out_batch.data.cpu().numpy()))
        out_batches = np.array(out_batches)

        vocals = out_batches[:, :, :2, :, :]
        drums = out_batches[:, :, 2:4, :, :]
        bass = out_batches[:, :, 4:6, :, :]
        others = out_batches[:, :, 6:, :, :]

        # Soft masks: each source's share of the summed network output.
        # "others" takes the remainder so the four masks sum to one.
        total_sources = vocals + bass + drums + others
        mask_vocals = vocals / total_sources
        mask_drums = drums / total_sources
        mask_bass = bass / total_sources
        masks = {
            'vocals': mask_vocals,
            'drums': mask_drums,
            'bass': mask_bass,
            'others': 1 - (mask_vocals + mask_drums + mask_bass),
        }

        estimates = {}
        for name, _, chans in stems:
            est = in_batches * masks[name]
            # Undo the min/max normalisation with this source's statistics.
            est = est * (max_feat_tars[chans, :, :] -
                         min_feat_tars[chans, :, :]) + min_feat_tars[chans, :, :]
            est = utils.overlapadd(est, nchunks_in)
            # Trim overlap-add padding to the mixture length and reuse the
            # mixture phase for resynthesis.
            estimates[name] = utils.inverse_stft(
                est[:, :mix_phase.shape[1], :], mix_phase)

        estimated = np.transpose(
            np.concatenate([estimates[name] for name, _, _ in stems], axis=1))

        padded_targets = []
        for name, stem_index, _ in stems:
            reference = audio[stem_index]
            zero_pad = np.zeros(
                [abs(reference.shape[0] - estimates[name].shape[0]), 2])
            padded_targets.append(np.append(reference, zero_pad, 0))
        targets = np.transpose(np.concatenate(padded_targets, axis=1))

        # Pick a random excerpt; fall back to the start of the track when it
        # is not longer than the excerpt (randint would raise otherwise).
        max_start = targets.shape[1] - file_length
        index = np.random.randint(0, max_start) if max_start > 0 else 0

        # Exact zeros are replaced by a tiny value before scoring.
        targets_no_zero = targets[:, index:index + file_length]
        targets_no_zero[targets_no_zero == 0] = 1e-8
        estimated_no_zero = estimated[:, index:index + file_length]
        estimated_no_zero[estimated_no_zero == 0] = 1e-8

        SDR, ISR, SIR, SAR, _ = mir_eval.separation.bss_eval_images(
            targets_no_zero, estimated_no_zero)

        SDR_error.append(SDR)
        SAR_error.append(SAR)
        SIR_error.append(SIR)
        ISR_error.append(ISR)

        for sdr in SDR:
            print(sdr)

        if not np.isnan(SDR).any():
            np.save(config.err_dir + 'SDR_error', np.array(SDR_error))
            np.save(config.err_dir + 'SAR_error', np.array(SAR_error))
            np.save(config.err_dir + 'SIR_error', np.array(SIR_error))
            np.save(config.err_dir + 'ISR_error', np.array(ISR_error))