def no_separation(args):
    """Write out the un-separated mixture as a baseline.

    For every 0 dB mixture in the testing directory, the two channels
    (background, event) are summed and the resulting mono mixture is
    written out twice: once as the "separated" background and once as
    the "separated" event.
    """
    workspace = args.workspace

    output_dir = os.path.join(workspace, "separated_wavs", "no_separation")
    pp_data.create_folder(output_dir)

    mixture_dir = os.path.join(workspace, "mixed_audio", "testing")
    for file_name in os.listdir(mixture_dir):
        if '.mix_0db.wav' not in file_name:
            continue
        print(file_name)
        (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(
            os.path.join(mixture_dir, file_name))
        mixture = bg_audio + event_audio
        # Strip the double extension ".mix_0db.wav" to get the clip stem.
        stem = os.path.splitext(os.path.splitext(file_name)[0])[0]
        pp_data.write_audio(
            os.path.join(output_dir, stem + ".sep_bg.wav"), mixture, fs)
        pp_data.write_audio(
            os.path.join(output_dir, stem + ".sep_event.wav"), mixture, fs)
    print("Write out finished!")
def ibm_separation(args):
    """Ideal binary mask (IBM) source separation.

    Builds an oracle binary mask from the ground-truth background and
    event spectrograms, applies it to the mixture magnitude spectrogram,
    and resynthesises background and event waveforms.
    """
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "ibm_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_sec = cfg.clip_sec

    ham_win = np.hamming(n_window)
    # Compensates the window energy lost in analysis/synthesis.
    recover_scaler = np.sqrt((ham_win ** 2).sum())

    for na in os.listdir(audio_dir):
        if '.mix_0db.wav' not in na:
            continue
        print(na)
        bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
        audio_path = os.path.join(audio_dir, na)
        (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
        mixed_audio = bg_audio + event_audio

        def _spec(x, mode):
            # Spectrogram transposed to (n_time, n_freq).
            (_, _, s) = signal.spectral.spectrogram(
                x=x, window=ham_win, nperseg=n_window, noverlap=n_overlap,
                detrend=False, return_onesided=True, scaling='density',
                mode=mode)
            return s.T

        bg_spec = _spec(bg_audio, 'magnitude')
        event_spec = _spec(event_audio, 'magnitude')
        mixed_spec = _spec(mixed_audio, 'complex')

        ratio = 1.7  # 5 dB
        # 1 where the event dominates the background by more than `ratio`,
        # 0 where it does not, 0.5 exactly at the threshold.
        event_mask = (np.sign(event_spec / (bg_spec * ratio) - 1) + 1) / 2
        bg_mask = 1. - event_mask

        masked = [(np.abs(mixed_spec) * bg_mask, ".sep_bg.wav"),
                  (np.abs(mixed_spec) * event_mask, ".sep_event.wav")]

        # Resynthesise and write out background, then event.
        for (sep_spec, suffix) in masked:
            s = spectrogram_to_wave.recover_wav(
                sep_spec, mixed_spec, n_overlap=n_overlap,
                winfunc=np.hamming, wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + suffix), s, fs)

    print("Finished!")
def evaluate_separation(args):
    """Evaluate separated wavs against the ground truth with BSS metrics.

    For every test clip, reads the ground-truth background/event pair and
    the corresponding separated wavs from ``args.sep_type``, computes SDR,
    SIR and SAR per source, accumulates them per event class (plus 'bg'),
    and logs per-class and overall averages.
    """
    workspace = args.workspace
    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    separated_dir = os.path.join(workspace, "separated_wavs", args.sep_type)

    # Renamed from `dict` (shadowed the builtin).
    stats = {e: {'sdr_list': [], 'sir_list': [], 'sar_list': []}
             for e in cfg.events + ['bg']}

    cnt = 0
    for na in os.listdir(audio_dir):
        if '.yaml' not in na:
            continue
        bare_name = os.path.splitext(na)[0]

        # Read per-clip metadata.  safe_load: yaml.load without an explicit
        # Loader is unsafe and rejected by PyYAML >= 6.
        yaml_path = os.path.join(audio_dir, na)
        with open(yaml_path, 'r') as f:
            data = yaml.safe_load(f)
        event_type = data['event_type']

        # Ground-truth stereo pair: (background, event).
        gt_audio_path = os.path.join(audio_dir, "%s.mix_0db.wav" % bare_name)
        (gt_bg_audio, gt_event_audio, _) = \
            pp_data.read_audio_stereo(gt_audio_path)

        # Separated estimates.
        sep_bg_audio_path = os.path.join(
            separated_dir, "%s.sep_bg.wav" % bare_name)
        (sep_bg_audio, _) = pp_data.read_audio_sum_if_stereo(sep_bg_audio_path)
        sep_event_audio_path = os.path.join(
            separated_dir, "%s.sep_event.wav" % bare_name)
        (sep_event_audio, _) = \
            pp_data.read_audio_sum_if_stereo(sep_event_audio_path)

        # Evaluate SDR, SIR and SAR.  Index 0 is background, 1 is event
        # (compute_permutation=False keeps source order fixed).
        gt_array = np.array((gt_bg_audio, gt_event_audio))
        sep_array = np.array((sep_bg_audio, sep_event_audio))
        (sdr, sir, sar, perm) = bss_eval_sources(
            gt_array, sep_array, compute_permutation=False)

        logging.info("%d, %s, %s" % (cnt, na, event_type))
        logging.info((sdr, sir, sar, perm))

        stats[event_type]['sdr_list'].append(sdr[1])
        stats[event_type]['sir_list'].append(sir[1])
        stats[event_type]['sar_list'].append(sar[1])
        stats['bg']['sdr_list'].append(sdr[0])
        stats['bg']['sir_list'].append(sir[0])
        stats['bg']['sar_list'].append(sar[0])
        cnt += 1

    # Per-class means, accumulated for the overall average.
    avg = {'sdr': [], 'sir': [], 'sar': []}
    for event_type in stats.keys():
        logging.info(event_type)
        for evaluate_type in stats[event_type]:
            tmp = np.mean(stats[event_type][evaluate_type])
            logging.info((evaluate_type, tmp))
            # 'sdr_list'[0:3] -> 'sdr', etc.
            avg[evaluate_type[0:3]].append(tmp)

    logging.info("Average stats:")
    for e in ['sdr', 'sir', 'sar']:
        logging.info("%s, %f" % (e, np.mean(avg[e])))
def jsc_separation(args):
    """Joint separation-classification (JSC) source separation.

    Runs the trained model on each test mixture to obtain per-class
    segmentation masks on the mel scale, maps them back to the linear
    spectrogram domain, and either plots them (``args.plot_only``) or
    applies them to the mixture spectrogram and writes out separated wavs.
    """
    workspace = args.workspace

    # Load feature scaler (written during training).
    scaler_path = os.path.join(workspace, "scalers", "logmel",
                               "training.scaler")
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)

    # Load trained model and build a forward function exposing the
    # 'seg_masks' layer output.
    md_path = os.path.join(workspace, "models", "main", args.model_name)
    md = serializations.load(md_path)

    out_dir = os.path.join(workspace, "separated_wavs", "jsc_separation")
    pp_data.create_folder(out_dir)

    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    ham_win = np.hamming(n_window)
    # Compensates the window energy lost in analysis/synthesis.
    recover_scaler = np.sqrt((ham_win ** 2).sum())

    melW = librosa.filters.mel(sr=fs, n_fft=n_window, n_mels=64,
                               fmin=0., fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)

    def _complex_spec(audio):
        # Complex spectrogram transposed to (n_time, n_freq).
        (_, _, spec) = signal.spectral.spectrogram(
            x=audio, window=ham_win, nperseg=n_window, noverlap=n_overlap,
            detrend=False, return_onesided=True, scaling='density',
            mode='complex')
        return spec.T

    for na in names:
        if ".mix" not in na:
            continue

        # Read per-clip metadata.  safe_load: yaml.load without an explicit
        # Loader is unsafe and rejected by PyYAML >= 6.
        bare_name = os.path.splitext(os.path.splitext(na)[0])[0]
        yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_name)
        with open(yaml_path, 'r') as f:
            data = yaml.safe_load(f)
        event_type = data['event_type']
        print(na, event_type)

        # Read audio and compute the mixture spectrogram.
        audio_path = os.path.join(audio_dir, na)
        (bg_audio, event_audio, _) = pp_data.read_audio_stereo(audio_path)
        mixed_audio = bg_audio + event_audio
        mixed_spec = _complex_spec(mixed_audio)

        # Log-mel features -> model -> per-class segmentation masks.
        mixed_x = pp_data.calc_feat(mixed_audio)
        x3d = pp_data.do_scaler_on_x3d(mixed_x[np.newaxis, ...], scaler)
        [mel_masks] = md.run_function(f_forward, x3d, batch_size=10,
                                      tr_phase=0.)
        # Drop the batch axis; spec_masks[0..3] below shows the leading
        # axis is the class axis: (n_events+1, n_time, 64).
        mel_masks = mel_masks[0]
        # Map mel-domain masks back to linear frequency bins.
        spec_masks = np.dot(mel_masks, inverse_melW)

        if args.plot_only:
            # Oracle spectrograms are only needed for the plots, so they
            # are computed in this branch only.
            bg_spec = _complex_spec(bg_audio)
            event_spec = _complex_spec(event_audio)

            mixed_mel_spec = np.dot(np.abs(mixed_spec), melW.T)
            bg_mel_spec = np.dot(np.abs(bg_spec), melW.T)
            event_mel_spec = np.dot(np.abs(event_spec), melW.T)
            ratio = 1.7  # 5 dB
            # Oracle IBM on the mel scale for visual comparison.
            event_mask = (np.sign(event_mel_spec / (bg_mel_spec * ratio) - 1)
                          + 1) / 2

            fig, axs = plt.subplots(3, 2, sharex=True)
            axs[0, 0].matshow(np.log(mixed_mel_spec.T), origin='lower',
                              aspect='auto')
            axs[0, 1].matshow(event_mask.T, origin='lower', aspect='auto')
            axs[1, 0].matshow(spec_masks[0].T, origin='lower', aspect='auto',
                              vmin=0., vmax=1.)
            axs[1, 1].matshow(spec_masks[1].T, origin='lower', aspect='auto',
                              vmin=0., vmax=1.)
            axs[2, 0].matshow(spec_masks[2].T, origin='lower', aspect='auto',
                              vmin=0., vmax=1.)
            axs[2, 1].matshow(spec_masks[3].T, origin='lower', aspect='auto',
                              vmin=0., vmax=1.)
            axs[0, 0].set_title('log Mel of mixture')
            axs[0, 1].set_title('IBM of event')
            axs[1, 0].set_title('babycry')
            axs[1, 1].set_title('glassbreak')
            axs[2, 0].set_title('gunshot')
            axs[2, 1].set_title('bg')
            plt.show()
        else:
            # Apply each mask to the mixture magnitude spectrogram.
            separated_specs = spec_masks * np.abs(mixed_spec)[None, :, :]

            # Write out one wav per event class plus background.
            # (range over the class list replaces Py2-only xrange(4).)
            enlarged_events = cfg.events + ['bg']
            for i1 in range(len(enlarged_events)):
                s = spectrogram_to_wave.recover_wav(
                    separated_specs[i1], mixed_spec, n_overlap=n_overlap,
                    winfunc=np.hamming, wav_len=len(mixed_audio))
                s *= recover_scaler
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_%s.wav"
                                 % (bare_name, enlarged_events[i1])), s, fs)

            # Write out the wav of the annotated ground-truth event class.
            s = spectrogram_to_wave.recover_wav(
                separated_specs[cfg.lb_to_ix[event_type]], mixed_spec,
                n_overlap=n_overlap, winfunc=np.hamming,
                wav_len=len(mixed_audio))
            s *= recover_scaler
            pp_data.write_audio(
                os.path.join(out_dir, "%s.sep_event.wav" % bare_name), s, fs)

            # Write out the original mixture for reference.
            pp_data.write_audio(
                os.path.join(out_dir, "%s.sep_mix.wav" % bare_name),
                mixed_audio, fs)
def recognize(args):
    """Run audio tagging (AT) and sound event detection (SED) on test clips.

    For each 0 dB test mixture, computes clip-level event presence
    probabilities (AT) and frame-level presence probabilities derived from
    the model's segmentation masks (SED), together with the ground truths,
    and dumps everything to a pickle for later evaluation.
    """
    workspace = args.workspace
    md_path = os.path.join(workspace, "models",
                           pp_data.get_filename(__file__), args.model_name)

    t1 = time.time()

    # Load scaler.
    scaler_path = os.path.join(workspace, "scalers", "logmel",
                               "training.scaler")
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)

    # Load model.
    md = serializations.load(md_path)

    # Observe function exposing the 'seg_masks' layer output.
    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    at_pd_ary = []
    at_gt_ary = []
    sed_pd_ary = []
    sed_gt_ary = []

    # For all audio clips.
    for na in names:
        if '.mix_0db.wav' not in na:
            continue
        logging.info(na)

        # Load audio.
        bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
        audio_path = os.path.join(audio_dir, na)
        (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
        mixed_audio = bg_audio + event_audio

        # Load yaml metadata.  safe_load: yaml.load without an explicit
        # Loader is unsafe and rejected by PyYAML >= 6.
        yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_na)
        with open(yaml_path, 'r') as f:
            data = yaml.safe_load(f)
        event_type = data['event_type']

        # Calculate feature.
        x = pp_data.calc_feat(mixed_audio)
        x3d = pp_data.do_scaler_on_x3d(x[np.newaxis, ...], scaler)

        # Ground-truth tag vector: one-hot over the event classes; the
        # trailing 'bg' slot stays 0.  (Generalized from a hard-coded
        # 4-element list.)
        gt_y = [0] * (len(cfg.events) + 1)
        gt_y[cfg.lb_to_ix[event_type]] = 1
        at_gt_ary.append(gt_y)

        # Audio tagging (AT) prediction.
        [pred_y] = md.predict(x3d)  # (1, n_events+1)
        pred_y = pred_y[0]  # (n_events+1,)
        at_pd_ary.append(pred_y)

        # Sound event detection (SED) prediction.
        [masks] = md.run_function(f_forward, x3d, batch_size=10, tr_phase=0.)
        # (1, n_events+1, n_time, n_freq)
        masks = masks[0]  # (n_events+1, n_time, n_freq)
        # Average over frequency -> frame-level presence per class.
        sed_pd = np.mean(masks, axis=-1).T  # (n_time, n_events+1)
        sed_pd_ary.append(sed_pd)

        # Frame-level ground truth from the annotated event segment.
        sed_gt = np.zeros_like(sed_pd)
        [bgn_sec, fin_sec] = data['event_segment']
        hop = float(cfg.n_window - cfg.n_overlap)
        bgn_fr = int(bgn_sec * cfg.sample_rate / hop)
        fin_fr = int(fin_sec * cfg.sample_rate / hop)
        sed_gt[bgn_fr:fin_fr, cfg.lb_to_ix[event_type]] = 1
        sed_gt_ary.append(sed_gt)

    at_pd_ary = np.array(at_pd_ary)
    at_gt_ary = np.array(at_gt_ary)
    sed_pd_ary = np.array(sed_pd_ary)
    sed_gt_ary = np.array(sed_gt_ary)

    # Write out AT and SED presence probabilities.
    logging.info("at_pd_ary.shape: %s" % (at_pd_ary.shape, ))
    logging.info("at_gt_ary.shape: %s" % (at_gt_ary.shape, ))
    logging.info("sed_pd_ary.shape: %s" % (sed_pd_ary.shape, ))
    logging.info("sed_gt_ary.shape: %s" % (sed_gt_ary.shape, ))

    # Renamed from `dict` (shadowed the builtin).
    results = {
        'at_pd_ary': at_pd_ary,
        'at_gt_ary': at_gt_ary,
        'sed_pd_ary': sed_pd_ary,
        'sed_gt_ary': sed_gt_ary,
    }

    out_path = os.path.join(workspace, "_tmp", "_at_sed_dict.p")
    pp_data.create_folder(os.path.dirname(out_path))
    with open(out_path, 'wb') as f:
        cPickle.dump(results, f, protocol=cPickle.HIGHEST_PROTOCOL)

    logging.info("Recognize time: %s" % (time.time() - t1, ))