def process(raw_data, n_frames, n_channels, sample_width, sample_rate):
    # Model
    model = Model()
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                              name='global_step')

    # channels is int32 type
    channels = interpret_wav(raw_data, n_frames, n_channels, sample_width, True)
    mixed_wav = channels[0]

    with tf.Session(config=EvalConfig.session_conf) as sess:
        # Initialize variables and load the saved state
        sess.run(tf.global_variables_initializer())
        model.load_state(sess, EvalConfig.CKPT_PATH)

        mixed_spec = to_spectrogram(mixed_wav)
        mixed_mag = get_magnitude(mixed_spec)
        mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
        mixed_phase = get_phase(mixed_spec)

        (pred_src1_mag, pred_src2_mag) = sess.run(
            model(), feed_dict={model.x_mixed: mixed_batch})

        seq_len = mixed_phase.shape[-1]
        pred_src1_mag = model.batch_to_spec(
            pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
        pred_src2_mag = model.batch_to_spec(
            pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]

        # Time-frequency masking
        mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
        mask_src2 = 1. - mask_src1
        pred_src2_mag = mixed_mag * mask_src2

        pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

        return pred_src2_wav[0].astype(channels.dtype)

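# The functions in this file lean on helpers (soft_time_freq_mask,
# to_spectrogram, to_wav, ...) defined elsewhere in the repo. Below is a
# minimal sketch of what a soft time-frequency mask computes, assuming the
# usual ratio mask; the repo's actual implementation may differ.
import numpy as np


def soft_time_freq_mask_sketch(target_mag, remaining_mag, eps=1e-10):
    # Per TF bin: target magnitude over total magnitude, a value in [0, 1].
    # Applying it to the mixture recovers the target estimate:
    #   pred_src1_mag = mixed_mag * mask
    return target_mag / (target_mag + remaining_mag + eps)
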
def process(input_signal, processed_signal):
    # Model
    model = Model()
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                              name='global_step')

    with tf.Session(config=EvalConfig.session_conf) as sess:
        # Initialize variables and load the saved state
        sess.run(tf.global_variables_initializer())
        model.load_state(sess, EvalConfig.CKPT_PATH)

        while True:
            start = time.time()
            mixed_wav = input_signal.get()

            mixed_spec = to_spectrogram(mixed_wav)
            mixed_mag = get_magnitude(mixed_spec)
            mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
            mixed_phase = get_phase(mixed_spec)

            assert np.all(np.equal(
                model.batch_to_spec(mixed_batch, EvalConfig.NUM_EVAL),
                padded_mixed_mag))

            input_signal.task_done()
            middle = time.time()

            (pred_src1_mag, pred_src2_mag) = sess.run(
                model(), feed_dict={model.x_mixed: mixed_batch})
            modeltime = time.time()

            seq_len = mixed_phase.shape[-1]
            pred_src1_mag = model.batch_to_spec(
                pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
            pred_src2_mag = model.batch_to_spec(
                pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
            middle2 = time.time()

            # Time-frequency masking
            mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
            mask_src2 = 1. - mask_src1
            pred_src1_mag = mixed_mag * mask_src1
            pred_src2_mag = mixed_mag * mask_src2

            # (magnitude, phase) -> spectrogram -> wav
            pred_src1_wav = to_wav(pred_src1_mag, mixed_phase)
            pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

            processed_signal.put(pred_src2_wav[0])
            end = time.time()

            print("Preprocessing time = {0}".format(middle - start))
            print("Model inference time = {0}".format(modeltime - middle))
            print("Batch-to-spec time = {0}".format(middle2 - modeltime))
            print("Masking/reconstruction time = {0}".format(end - middle2))

def eval(model, eval_data, sess):
    mixed_wav, src1_wav, src2_wav, _ = eval_data.next_wavs(
        EvalConfig.SECONDS, EvalConfig.NUM_EVAL)

    mixed_spec = to_spectrogram(mixed_wav)
    mixed_mag = get_magnitude(mixed_spec)

    src1_spec, src2_spec = to_spectrogram(src1_wav), to_spectrogram(src2_wav)
    src1_mag, src2_mag = get_magnitude(src1_spec), get_magnitude(src2_spec)

    src1_batch, _ = model.spec_to_batch(src1_mag)
    src2_batch, _ = model.spec_to_batch(src2_mag)
    mixed_batch, _ = model.spec_to_batch(mixed_mag)

    pred_src1_mag, pred_src2_mag = sess.run(
        model(), feed_dict={model.x_mixed: mixed_batch})

    mixed_phase = get_phase(mixed_spec)
    seq_len = mixed_phase.shape[-1]
    pred_src1_mag = model.batch_to_spec(
        pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
    pred_src2_mag = model.batch_to_spec(
        pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]

    # Time-frequency masking
    mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
    # mask_src1 = hard_time_freq_mask(pred_src1_mag, pred_src2_mag)
    mask_src2 = 1. - mask_src1
    pred_src1_mag = mixed_mag * mask_src1
    pred_src2_mag = mixed_mag * mask_src2

    # (magnitude, phase) -> spectrogram -> wav
    if EvalConfig.GRIFFIN_LIM:
        pred_src1_wav = to_wav_mag_only(pred_src1_mag, init_phase=mixed_phase,
                                        num_iters=EvalConfig.GRIFFIN_LIM_ITER)
        pred_src2_wav = to_wav_mag_only(pred_src2_mag, init_phase=mixed_phase,
                                        num_iters=EvalConfig.GRIFFIN_LIM_ITER)
    else:
        pred_src1_wav = to_wav(pred_src1_mag, mixed_phase)
        pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

    # Compute BSS metrics
    gnsdr, gsir, gsar = bss_eval_global(mixed_wav, src1_wav, src2_wav,
                                        pred_src1_wav, pred_src2_wav,
                                        EvalConfig.NUM_EVAL)
    return gnsdr, gsir, gsar

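# bss_eval_global() is not defined in this file. A hedged sketch of what it
# likely computes, using mir_eval (an assumption on my part): per-clip
# SDR/SIR/SAR from mir_eval.separation.bss_eval_sources, length-weighted
# across clips, with NSDR reported as the SDR gain over using the raw
# mixture itself as the estimate.
import numpy as np
from mir_eval.separation import bss_eval_sources


def bss_eval_global_sketch(mixed_wav, src1_wav, src2_wav,
                           pred_src1_wav, pred_src2_wav, num_eval):
    gnsdr, gsir, gsar = np.zeros(2), np.zeros(2), np.zeros(2)
    total_len = 0
    for i in range(num_eval):
        ref = np.array([src1_wav[i], src2_wav[i]])
        est = np.array([pred_src1_wav[i], pred_src2_wav[i]])
        sdr, sir, sar, _ = bss_eval_sources(ref, est,
                                            compute_permutation=False)
        # Baseline: score the mixture itself against both references.
        sdr_mix, _, _, _ = bss_eval_sources(
            ref, np.array([mixed_wav[i], mixed_wav[i]]),
            compute_permutation=False)
        n = len(src1_wav[i])
        gnsdr += n * (sdr - sdr_mix)
        gsir += n * sir
        gsar += n * sar
        total_len += n
    return gnsdr / total_len, gsir / total_len, gsar / total_len
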
def test_run():
    tf.reset_default_graph()
    model = Model()

    with tf.Session(config=EvalConfig.session_conf) as sess:
        # Initialize variables and load the saved state
        sess.run(tf.global_variables_initializer())
        data = Datas(EvalConfig.DATA_PATH)
        model.load_state(sess, EvalConfig.CKPT_PATH)

        mixed_wav, src1_wav, src2_wav = data.next_wav(EvalConfig.SECONDS)
        print(src1_wav)

        mixed_spec = to_spectrogram(mixed_wav)
        mixed_mag = get_magnitude(mixed_spec)
        mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
        mixed_phase = get_phase(mixed_spec)

        (pred_src1_mag, pred_src2_mag) = sess.run(
            model(), feed_dict={model.x_mixed: mixed_batch})

        seq_len = mixed_phase.shape[-1]
        pred_src1_mag = model.batch_to_spec(
            pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
        pred_src2_mag = model.batch_to_spec(
            pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]

        # Time-frequency masking
        mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
        # mask_src1 = hard_time_freq_mask(pred_src1_mag, pred_src2_mag)
        mask_src2 = 1. - mask_src1
        pred_src1_mag = mixed_mag * mask_src1
        pred_src2_mag = mixed_mag * mask_src2

        pred_src1_wav = to_wav(pred_src1_mag, mixed_phase)
        pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

        write_wav(mixed_wav[0], '{}/{}'.format(EvalConfig.RESULT_PATH, 'original'))
        write_wav(pred_src1_wav[0], '{}/{}'.format(EvalConfig.RESULT_PATH, 'music'))
        write_wav(pred_src2_wav[0], '{}/{}'.format(EvalConfig.RESULT_PATH, 'voice'))

def get_drum(filename):
    tf.reset_default_graph()
    model = Model()

    with tf.Session(config=EvalConfig.session_conf) as sess:
        # Initialize variables and load the saved state
        sess.run(tf.global_variables_initializer())
        data = Datas(RunConfig.DATA_ROOT)
        model.load_state(sess, EvalConfig.CKPT_PATH)

        mixed_wav = data.get_mixture(filename)
        print(mixed_wav)

        mixed_spec = to_spectrogram(mixed_wav)
        mixed_mag = get_magnitude(mixed_spec)
        mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
        mixed_phase = get_phase(mixed_spec)

        (pred_src1_mag, pred_src2_mag) = sess.run(
            model(), feed_dict={model.x_mixed: mixed_batch})

        seq_len = mixed_phase.shape[-1]
        pred_src1_mag = model.batch_to_spec(
            pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
        pred_src2_mag = model.batch_to_spec(
            pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]

        # Time-frequency masking
        mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
        # mask_src1 = hard_time_freq_mask(pred_src1_mag, pred_src2_mag)
        mask_src2 = 1. - mask_src1
        pred_src2_mag = mixed_mag * mask_src2

        pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

        filename = filename.replace('.wav', '')
        write_wav(pred_src2_wav[0], '{}/{}'.format(RunConfig.RESULT_PATH, filename))

def eval(data_path=None, result_path=None):
    # Model
    model = Model()
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                              name='global_step')

    with tf.Session(config=EvalConfig.session_conf) as sess:
        # Initialize variables and load the saved state
        sess.run(tf.global_variables_initializer())
        model.load_state(sess, EvalConfig.CKPT_PATH)

        writer = tf.summary.FileWriter(EvalConfig.GRAPH_PATH, sess.graph)

        data = Data(data_path) if data_path else Data(EvalConfig.DATA_PATH)
        output_path = result_path if result_path else EvalConfig.RESULT_PATH

        mixed_wav, src1_wav, src2_wav, wavfiles = data.next_wavs(
            EvalConfig.SECONDS, EvalConfig.NUM_EVAL)

        mixed_spec = to_spectrogram(mixed_wav)
        mixed_mag = get_magnitude(mixed_spec)
        mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
        mixed_phase = get_phase(mixed_spec)

        assert np.all(np.equal(
            model.batch_to_spec(mixed_batch, EvalConfig.NUM_EVAL),
            padded_mixed_mag))

        (pred_src1_mag, pred_src2_mag) = sess.run(
            model(), feed_dict={model.x_mixed: mixed_batch})

        seq_len = mixed_phase.shape[-1]
        pred_src1_mag = model.batch_to_spec(
            pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
        pred_src2_mag = model.batch_to_spec(
            pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]

        # Time-frequency masking
        mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
        # mask_src1 = hard_time_freq_mask(pred_src1_mag, pred_src2_mag)
        mask_src2 = 1. - mask_src1
        pred_src1_mag = mixed_mag * mask_src1
        pred_src2_mag = mixed_mag * mask_src2

        # (magnitude, phase) -> spectrogram -> wav
        if EvalConfig.GRIFFIN_LIM:
            pred_src1_wav = to_wav_mag_only(
                pred_src1_mag, init_phase=mixed_phase,
                num_iters=EvalConfig.GRIFFIN_LIM_ITER)
            pred_src2_wav = to_wav_mag_only(
                pred_src2_mag, init_phase=mixed_phase,
                num_iters=EvalConfig.GRIFFIN_LIM_ITER)
        else:
            pred_src1_wav = to_wav(pred_src1_mag, mixed_phase)
            pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

        # Write the audio summaries
        tf.summary.audio('GT_mixed', mixed_wav, ModelConfig.SR,
                         max_outputs=EvalConfig.NUM_EVAL)
        tf.summary.audio('Pred_music', pred_src1_wav, ModelConfig.SR,
                         max_outputs=EvalConfig.NUM_EVAL)
        tf.summary.audio('Pred_vocal', pred_src2_wav, ModelConfig.SR,
                         max_outputs=EvalConfig.NUM_EVAL)

        if EvalConfig.EVAL_METRIC:
            # Compute BSS metrics
            gnsdr, gsir, gsar = bss_eval_global(mixed_wav, src1_wav, src2_wav,
                                                pred_src1_wav, pred_src2_wav)

            # Write the BSS metric scores
            tf.summary.scalar('GNSDR_music', gnsdr[0])
            tf.summary.scalar('GSIR_music', gsir[0])
            tf.summary.scalar('GSAR_music', gsar[0])
            tf.summary.scalar('GNSDR_vocal', gnsdr[1])
            tf.summary.scalar('GSIR_vocal', gsir[1])
            tf.summary.scalar('GSAR_vocal', gsar[1])

        if EvalConfig.WRITE_RESULT:
            # Write the separated wav files
            for i in range(len(wavfiles)):
                name = 'video'
                print(output_path)
                write_wav(mixed_wav[i],
                          '{}/{}-{}'.format(output_path, name, 'original'))
                write_wav(pred_src1_wav[i],
                          '{}/{}-{}'.format(output_path, name, 'music'))
                write_wav(pred_src2_wav[i],
                          '{}/{}-{}'.format(output_path, name, 'voice'))

        writer.add_summary(sess.run(tf.summary.merge_all()),
                           global_step=global_step.eval())
        writer.close()

def process(input_signal, p):
    # Model
    model = Model()
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                              name='global_step')

    CHUNKSIZE = EvalConfig.CHUNK
    stream = p.open(format=pyaudio.paInt32, channels=1, rate=48000,
                    output=True, frames_per_buffer=CHUNKSIZE)

    with tf.Session(config=EvalConfig.session_conf) as sess:
        # Initialize variables and load the saved state
        sess.run(tf.global_variables_initializer())
        model.load_state(sess, EvalConfig.CKPT_PATH)

        while True:
            # p_sem.acquire()
            # print(" process before get: %d" % input_signal.qsize())
            mixed_wav = input_signal.get()

            mixed_spec = to_spectrogram(mixed_wav)
            mixed_mag = get_magnitude(mixed_spec)
            mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
            mixed_phase = get_phase(mixed_spec)

            assert np.all(np.equal(
                model.batch_to_spec(mixed_batch, EvalConfig.NUM_EVAL),
                padded_mixed_mag))

            (pred_src1_mag, pred_src2_mag) = sess.run(
                model(), feed_dict={model.x_mixed: mixed_batch})

            seq_len = mixed_phase.shape[-1]
            pred_src1_mag = model.batch_to_spec(
                pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
            pred_src2_mag = model.batch_to_spec(
                pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]

            # Time-frequency masking
            mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
            mask_src2 = 1. - mask_src1
            pred_src1_mag = mixed_mag * mask_src1
            pred_src2_mag = mixed_mag * mask_src2

            pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

            # free = stream.get_write_available()
            # print(" free1: %d" % free)
            # if (free - CHUNKSIZE) > CHUNKSIZE * 2:
            #     sleep(0.5)
            data = pred_src2_wav[0].astype(np.int32).tobytes()
            stream.write(data)
            # free = stream.get_write_available()
            # print(" free2: %d" % free)
            # r_sem.release()

        # Unreachable while the loop above runs forever; kept for cleanup
        # symmetry.
        stream.stop_stream()
        stream.close()

def eval():
    # Model
    model = Model()
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                              name='global_step')

    with tf.Session(config=EvalConfig.session_conf) as sess:
        # Initialize variables and load the saved state
        sess.run(tf.global_variables_initializer())
        model.load_state(sess, EvalConfig.CKPT_PATH)

        writer = tf.summary.FileWriter(EvalConfig.GRAPH_PATH, sess.graph)

        data = Data(EvalConfig.DATA_PATH, TrainConfig.NOISE_DATA_PATH,
                    TrainConfig.VOICE_DATA_PATH)
        mixed_wav, src1_wav, src2_wav, wavfiles = data.next_wavs_eval(
            EvalConfig.SECONDS, EvalConfig.NUM_EVAL)

        start = time.time()

        mixed_spec = to_spectrogram(mixed_wav)
        mixed_mag = get_magnitude(mixed_spec)
        mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
        mixed_phase = get_phase(mixed_spec)

        assert np.all(np.equal(
            model.batch_to_spec(mixed_batch, EvalConfig.NUM_EVAL),
            padded_mixed_mag))

        (pred_src1_mag, pred_src2_mag) = sess.run(
            model(), feed_dict={model.x_mixed: mixed_batch})

        seq_len = mixed_phase.shape[-1]
        pred_src1_mag = model.batch_to_spec(
            pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
        pred_src2_mag = model.batch_to_spec(
            pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]

        # Time-frequency masking
        mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
        # mask_src1 = hard_time_freq_mask(pred_src1_mag, pred_src2_mag)
        mask_src2 = 1. - mask_src1
        pred_src1_mag = mixed_mag * mask_src1
        pred_src2_mag = mixed_mag * mask_src2

        # (magnitude, phase) -> spectrogram -> wav
        if EvalConfig.GRIFFIN_LIM:
            pred_src1_wav = to_wav_mag_only(
                pred_src1_mag, init_phase=mixed_phase,
                num_iters=EvalConfig.GRIFFIN_LIM_ITER)
            pred_src2_wav = to_wav_mag_only(
                pred_src2_mag, init_phase=mixed_phase,
                num_iters=EvalConfig.GRIFFIN_LIM_ITER)
        else:
            pred_src1_wav = to_wav(pred_src1_mag, mixed_phase)
            pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

        end = time.time()
        print("Time elapsed: {0}".format(end - start))

        # Write the audio summaries
        tf.summary.audio('GT_mixed', mixed_wav, ModelConfig.SR,
                         max_outputs=EvalConfig.NUM_EVAL)
        tf.summary.audio('Pred_music', pred_src1_wav, ModelConfig.SR,
                         max_outputs=EvalConfig.NUM_EVAL)
        tf.summary.audio('Pred_vocal', pred_src2_wav, ModelConfig.SR,
                         max_outputs=EvalConfig.NUM_EVAL)

        # Write the separated wav files
        for i in range(len(wavfiles)):
            name = wavfiles[i].replace('/', '-').replace('.wav', '')
            write_wav(mixed_wav[i],
                      '{}/{}-{}'.format(EvalConfig.RESULT_PATH, name, 'original'))
            write_wav(pred_src1_wav[i],
                      '{}/{}-{}'.format(EvalConfig.RESULT_PATH, name, 'background'))
            write_wav(pred_src2_wav[i],
                      '{}/{}-{}'.format(EvalConfig.RESULT_PATH, name, 'voice'))

        writer.add_summary(sess.run(tf.summary.merge_all()),
                           global_step=global_step.eval())
        writer.close()

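# to_spectrogram / get_magnitude / get_phase / to_wav are assumed to be thin
# librosa STFT wrappers operating on a batch of mono wavs. A minimal sketch;
# N_FFT and HOP are placeholders, not necessarily ModelConfig's values:
import numpy as np
import librosa

N_FFT, HOP = 1024, 256  # assumed analysis parameters


def to_spectrogram_sketch(wav_batch):
    # One complex STFT per wav in the batch.
    return np.array([librosa.stft(w, n_fft=N_FFT, hop_length=HOP)
                     for w in wav_batch])


def get_magnitude_sketch(spec):
    return np.abs(spec)


def get_phase_sketch(spec):
    return np.angle(spec)


def to_wav_sketch(mag_batch, phase_batch):
    # Recombine magnitude with the mixture phase, then invert the STFT.
    spec = mag_batch * np.exp(1j * phase_batch)
    return np.array([librosa.istft(s, hop_length=HOP) for s in spec])
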
def eval(n):
    overall_gnsdr, overall_gsir, overall_gsar = [], [], []
    for i in range(n):
        with tf.Graph().as_default():
            # Model
            model = Model(ModelConfig.HIDDEN_LAYERS, ModelConfig.HIDDEN_UNITS)
            global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                                      name='global_step')

            with tf.Session(config=EvalConfig.session_conf) as sess:
                # Initialize variables and load the saved state
                sess.run(tf.global_variables_initializer())
                model.load_state(sess, EvalConfig.CKPT_PATH)

                print('num trainable parameters: %s' % (np.sum(
                    [np.prod(v.get_shape().as_list())
                     for v in tf.trainable_variables()])))

                writer = tf.summary.FileWriter(EvalConfig.GRAPH_PATH,
                                               sess.graph)

                data = Data(EvalConfig.DATA_PATH)
                mixed_wav, src1_wav, src2_wav, wavfiles = data.next_wavs(
                    EvalConfig.SECONDS, EvalConfig.NUM_EVAL)

                mixed_spec = to_spectrogram(mixed_wav)
                mixed_mag = get_magnitude(mixed_spec)
                mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
                mixed_phase = get_phase(mixed_spec)

                assert np.all(np.equal(
                    model.batch_to_spec(mixed_batch, EvalConfig.NUM_EVAL),
                    padded_mixed_mag))

                (pred_src1_mag, pred_src2_mag) = sess.run(
                    model(), feed_dict={model.x_mixed: mixed_batch})

                seq_len = mixed_phase.shape[-1]
                pred_src1_mag = model.batch_to_spec(
                    pred_src1_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]
                pred_src2_mag = model.batch_to_spec(
                    pred_src2_mag, EvalConfig.NUM_EVAL)[:, :, :seq_len]

                # Time-frequency masking
                mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
                # mask_src1 = hard_time_freq_mask(pred_src1_mag, pred_src2_mag)
                mask_src2 = 1. - mask_src1
                pred_src1_mag = mixed_mag * mask_src1
                pred_src2_mag = mixed_mag * mask_src2

                # (magnitude, phase) -> spectrogram -> wav
                if EvalConfig.GRIFFIN_LIM:
                    pred_src1_wav = to_wav_mag_only(
                        pred_src1_mag, init_phase=mixed_phase,
                        num_iters=EvalConfig.GRIFFIN_LIM_ITER)
                    pred_src2_wav = to_wav_mag_only(
                        pred_src2_mag, init_phase=mixed_phase,
                        num_iters=EvalConfig.GRIFFIN_LIM_ITER)
                else:
                    pred_src1_wav = to_wav(pred_src1_mag, mixed_phase)
                    pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

                # Write the audio summaries
                tf.summary.audio('GT_mixed', mixed_wav, ModelConfig.SR,
                                 max_outputs=EvalConfig.NUM_EVAL)
                tf.summary.audio('Pred_music', pred_src1_wav, ModelConfig.SR,
                                 max_outputs=EvalConfig.NUM_EVAL)
                tf.summary.audio('Pred_vocal', pred_src2_wav, ModelConfig.SR,
                                 max_outputs=EvalConfig.NUM_EVAL)

                if EvalConfig.EVAL_METRIC:
                    # Compute BSS metrics
                    gnsdr, gsir, gsar = bss_eval_global(
                        mixed_wav, src1_wav, src2_wav,
                        pred_src1_wav, pred_src2_wav)

                    # Write the BSS metric scores
                    tf.summary.scalar('GNSDR_music', gnsdr[0])
                    tf.summary.scalar('GSIR_music', gsir[0])
                    tf.summary.scalar('GSAR_music', gsar[0])
                    tf.summary.scalar('GNSDR_vocal', gnsdr[1])
                    tf.summary.scalar('GSIR_vocal', gsir[1])
                    tf.summary.scalar('GSAR_vocal', gsar[1])

                    print('GNSDR: ', gnsdr)
                    print('GSIR: ', gsir)
                    print('GSAR: ', gsar)

                    overall_gnsdr.append(gnsdr)
                    overall_gsir.append(gsir)
                    overall_gsar.append(gsar)

                if EvalConfig.WRITE_RESULT:
                    # Write the separated wav files
                    for j in range(len(wavfiles)):
                        name = wavfiles[j].replace('/', '-').replace('.wav', '')
                        write_wav(mixed_wav[j],
                                  '{}/{}-{}'.format(EvalConfig.RESULT_PATH,
                                                    name, 'original'))
                        write_wav(pred_src1_wav[j],
                                  '{}/{}-{}'.format(EvalConfig.RESULT_PATH,
                                                    name, 'music'))
                        write_wav(pred_src2_wav[j],
                                  '{}/{}-{}'.format(EvalConfig.RESULT_PATH,
                                                    name, 'voice'))

                writer.add_summary(sess.run(tf.summary.merge_all()),
                                   global_step=global_step.eval())
                writer.close()

    if n > 1:
        overall_gnsdr = np.mean(np.array(overall_gnsdr), axis=0)
        overall_gsir = np.mean(np.array(overall_gsir), axis=0)
        overall_gsar = np.mean(np.array(overall_gsar), axis=0)
        print('OVERALL GNSDR: ', overall_gnsdr)
        print('OVERALL GSIR: ', overall_gsir)
        print('OVERALL GSAR: ', overall_gsar)

def eval(model, data, sr, len_frame, num_wav, glim, glim_iter, ckpt_path,
         graph_path, result_path):
    len_hop = closest_power_of_two(len_frame // 4)
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                              name='global_step')

    with tf.Session() as sess:
        if not os.path.exists(result_path):
            os.makedirs(result_path)

        # Initialize variables and load the saved state
        sess.run(tf.global_variables_initializer())
        model.load_state(sess, ckpt_path)

        writer = tf.summary.FileWriter("{}/{}".format(graph_path, "eval"),
                                       sess.graph)

        mixed_wav, src1_wav, src2_wav, med_names = data.next_wavs(num_wav)

        mixed_spec = to_spectrogram(mixed_wav, len_frame, len_hop)
        mixed_mag = get_magnitude(mixed_spec)
        mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
        mixed_phase = get_phase(mixed_spec)

        assert np.all(np.equal(model.batch_to_spec(mixed_batch, num_wav),
                               padded_mixed_mag))

        (pred_src1_mag, pred_src2_mag) = sess.run(
            model(), feed_dict={model.x_mixed: mixed_batch})

        seq_len = mixed_phase.shape[-1]
        pred_src1_mag = model.batch_to_spec(pred_src1_mag, num_wav)[:, :, :seq_len]
        pred_src2_mag = model.batch_to_spec(pred_src2_mag, num_wav)[:, :, :seq_len]

        # Time-frequency masking
        mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
        # mask_src1 = hard_time_freq_mask(pred_src1_mag, pred_src2_mag)
        mask_src2 = 1. - mask_src1
        pred_src1_mag = mixed_mag * mask_src1
        pred_src2_mag = mixed_mag * mask_src2

        # (magnitude, phase) -> spectrogram -> wav
        if glim:
            pred_src1_wav = to_wav_mag_only(pred_src1_mag, mixed_phase,
                                            len_frame, len_hop,
                                            num_iters=glim_iter)
            pred_src2_wav = to_wav_mag_only(pred_src2_mag, mixed_phase,
                                            len_frame, len_hop,
                                            num_iters=glim_iter)
        else:
            pred_src1_wav = to_wav(pred_src1_mag, mixed_phase, len_hop)
            pred_src2_wav = to_wav(pred_src2_mag, mixed_phase, len_hop)

        # Write the audio summaries
        tf.summary.audio('GT_mixed', mixed_wav, sr, max_outputs=num_wav)
        tf.summary.audio('Pred_music', pred_src1_wav, sr, max_outputs=num_wav)
        tf.summary.audio('Pred_vocal', pred_src2_wav, sr, max_outputs=num_wav)

        # Compute BSS metrics
        gnsdr, gsir, gsar = bss_eval_global(mixed_wav, src1_wav, src2_wav,
                                            pred_src1_wav, pred_src2_wav,
                                            num_wav)

        # Write the BSS metric scores
        tf.summary.scalar('GNSDR_music', gnsdr[0])
        tf.summary.scalar('GSIR_music', gsir[0])
        tf.summary.scalar('GSAR_music', gsar[0])
        tf.summary.scalar('GNSDR_vocal', gnsdr[1])
        tf.summary.scalar('GSIR_vocal', gsir[1])
        tf.summary.scalar('GSAR_vocal', gsar[1])

        # Write the separated wav files
        for i in range(len(med_names)):
            write_wav(mixed_wav[i],
                      '{}/{}-{}'.format(result_path, med_names[i],
                                        'all_stems_mixed'), sr)
            write_wav(pred_src1_wav[i],
                      '{}/{}-{}'.format(result_path, med_names[i],
                                        'target_instrument'), sr)
            write_wav(pred_src2_wav[i],
                      '{}/{}-{}'.format(result_path, med_names[i],
                                        'other_stems_mixed'), sr)

        writer.add_summary(sess.run(tf.summary.merge_all()),
                           global_step=global_step.eval())
        writer.close()

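# to_wav_mag_only() performs Griffin-Lim phase reconstruction when the glim
# flag is set. A minimal single-spectrogram sketch of the iteration
# (librosa-based; the n_fft/hop defaults are placeholders, and the repo's
# version presumably operates on batches):
import numpy as np
import librosa


def griffin_lim_sketch(mag, init_phase, n_fft=1024, hop=256, num_iters=50):
    # Seed the estimate with the mixture phase rather than random phase.
    spec = mag * np.exp(1j * init_phase)
    for _ in range(num_iters):
        wav = librosa.istft(spec, hop_length=hop)
        # Re-analyze, keep the known magnitude, adopt the new phase estimate.
        rebuilt = librosa.stft(wav, n_fft=n_fft, hop_length=hop)
        spec = mag * np.exp(1j * np.angle(rebuilt))
    return librosa.istft(spec, hop_length=hop)
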
def separate(filename, channel):
    with tf.Graph().as_default():
        # Model
        model = Model(ModelConfig.HIDDEN_LAYERS, ModelConfig.HIDDEN_UNITS)
        global_step = tf.Variable(0, dtype=tf.int32, trainable=False,
                                  name='global_step')

        total_samples, origin_samples, samples = decode_input(filename)
        channels = origin_samples.shape[0]

        with tf.Session(config=EvalConfig.session_conf) as sess:
            # Initialize variables and load the saved state
            sess.run(tf.global_variables_initializer())
            model.load_state(sess, EvalConfig.CKPT_PATH)

            mixed_wav, src1_wav, src2_wav = samples, samples, samples

            mixed_spec = to_spectrogram(mixed_wav)
            mixed_mag = get_magnitude(mixed_spec)
            mixed_batch, padded_mixed_mag = model.spec_to_batch(mixed_mag)
            mixed_phase = get_phase(mixed_spec)

            (pred_src1_mag, pred_src2_mag) = sess.run(
                model(), feed_dict={model.x_mixed: mixed_batch})

            seq_len = mixed_phase.shape[-1]
            pred_src1_mag = model.batch_to_spec(pred_src1_mag, 1)[:, :, :seq_len]
            pred_src2_mag = model.batch_to_spec(pred_src2_mag, 1)[:, :, :seq_len]

            # Time-frequency masking
            mask_src1 = soft_time_freq_mask(pred_src1_mag, pred_src2_mag)
            # mask_src1 = hard_time_freq_mask(pred_src1_mag, pred_src2_mag)
            mask_src2 = 1. - mask_src1
            pred_src1_mag = mixed_mag * mask_src1
            pred_src2_mag = mixed_mag * mask_src2

            # (magnitude, phase) -> spectrogram -> wav
            if EvalConfig.GRIFFIN_LIM:
                pred_src1_wav = to_wav_mag_only(
                    pred_src1_mag, init_phase=mixed_phase,
                    num_iters=EvalConfig.GRIFFIN_LIM_ITER)
                pred_src2_wav = to_wav_mag_only(
                    pred_src2_mag, init_phase=mixed_phase,
                    num_iters=EvalConfig.GRIFFIN_LIM_ITER)
            else:
                pred_src1_wav = to_wav(pred_src1_mag, mixed_phase)
                pred_src2_wav = to_wav(pred_src2_mag, mixed_phase)

            def stack(data):
                # Reassemble equal-length chunks into a (samples, channels)
                # array.
                size = data.shape[0] // channels
                elements = []
                for i in range(channels):
                    elements.append(data[size * i:size * (i + 1)])
                return np.dstack(elements)[0]

            music_data = pred_src1_wav
            voice_data = pred_src2_wav

            if channel >= 0:
                # Keep the original signal on every channel except the
                # selected one.
                def filter_samples(data):
                    for i in range(origin_samples.shape[0]):
                        if i != channel:
                            data[i, :] = origin_samples[i, 0:data.shape[1]]
                    return data

                music_data = filter_samples(music_data)
                voice_data = filter_samples(voice_data)

            music_wav = np.dstack(music_data)[0]
            voice_wav = np.dstack(voice_data)[0]
            return music_wav, voice_wav

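# Hypothetical usage of separate(): run the model on channel 0 of a stereo
# file and write both estimates. The file name and the write_wav call are
# illustrative, not taken from the repo.
if __name__ == '__main__':
    music_wav, voice_wav = separate('mixture.wav', channel=0)
    write_wav(music_wav, 'mixture-music')
    write_wav(voice_wav, 'mixture-voice')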