Example #1
def no_separation(args):
    """Write out un-separated mixture as baseline. 
    """
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "no_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                mixed_audio, fs)
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_event.wav"), mixed_audio,
                fs)

    print("Write out finished!")
Example #2
def ibm_separation(args):
    """Ideal binary mask (IBM) source separation. 
    """
    workspace = args.workspace

    out_dir = os.path.join(workspace, "separated_wavs", "ibm_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_sec = cfg.clip_sec

    # Hamming analysis window; recover_scaler compensates the window's
    # energy when recovering time-domain waveforms from spectrograms.
    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            (f, t, bg_spec) = signal.spectrogram(x=bg_audio,
                                                 window=ham_win,
                                                 nperseg=n_window,
                                                 noverlap=n_overlap,
                                                 detrend=False,
                                                 return_onesided=True,
                                                 scaling='density',
                                                 mode='magnitude')

            (f, t, event_spec) = signal.spectrogram(x=event_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='magnitude')

            (f, t, mixed_spec) = signal.spectrogram(x=mixed_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')

            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            # Ideal binary mask: 1 where the event magnitude exceeds
            # ratio times the background magnitude, else 0.
            ratio = 1.7  # ~5 dB
            event_mask = (np.sign(event_spec / (bg_spec * ratio) - 1) + 1) / 2
            bg_mask = 1. - event_mask

            bg_separated_spec = np.abs(mixed_spec) * bg_mask
            event_separated_spec = np.abs(mixed_spec) * event_mask

            # Write out separated background
            s = spectrogram_to_wave.recover_wav(bg_separated_spec,
                                                mixed_spec,
                                                n_overlap=n_overlap,
                                                winfunc=np.hamming,
                                                wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                s, fs)

            # Write out separated event
            s = spectrogram_to_wave.recover_wav(event_separated_spec,
                                                mixed_spec,
                                                n_overlap=n_overlap,
                                                winfunc=np.hamming,
                                                wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_event.wav"), s, fs)

    print("Finished!")
def evaluate_separation(args):
    """Evaluate SDR, SIR and SAR of the separated sources with BSS Eval.
    """
    workspace = args.workspace
    
    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    separated_dir = os.path.join(workspace, "separated_wavs", args.sep_type)
    ix_to_lb = cfg.ix_to_lb
    
    stats = {}
    for e in cfg.events + ['bg']:
        stats[e] = {'sdr_list': [], 'sir_list': [], 'sar_list': []}
    
    names = os.listdir(audio_dir)
    cnt = 0
    for na in names:
        if '.yaml' in na:
            bare_name = os.path.splitext(na)[0]
            
            # Read yaml
            yaml_path = os.path.join(audio_dir, na)
            with open(yaml_path, 'r') as f:
                data = yaml.safe_load(f)
            event_type = data['event_type']
            
            # Read audio
            gt_audio_path = os.path.join(audio_dir, "%s.mix_0db.wav" % bare_name)
            (gt_bg_audio, gt_event_audio, _) = pp_data.read_audio_stereo(gt_audio_path)
            
            sep_bg_audio_path = os.path.join(separated_dir, "%s.sep_bg.wav" % bare_name)
            (sep_bg_audio, _) = pp_data.read_audio_sum_if_stereo(sep_bg_audio_path)
            sep_event_audio_path = os.path.join(separated_dir, "%s.sep_event.wav" % bare_name)
            (sep_event_audio, _) = pp_data.read_audio_sum_if_stereo(sep_event_audio_path)
            
            # Evaluate SDR, SIR and SAR
            gt_array = np.array((gt_bg_audio, gt_event_audio))
            sep_array = np.array((sep_bg_audio, sep_event_audio))
            
            (sdr, sir, sar, perm) = bss_eval_sources(gt_array, sep_array, compute_permutation=False)
            logging.info("%d, %s, %s" % (cnt, na, event_type))
            logging.info((sdr, sir, sar, perm))
            
            # Index 1 is the event, index 0 is the background (the order is
            # fixed by compute_permutation=False above).
            stats[event_type]['sdr_list'].append(sdr[1])
            stats[event_type]['sir_list'].append(sir[1])
            stats[event_type]['sar_list'].append(sar[1])
            stats['bg']['sdr_list'].append(sdr[0])
            stats['bg']['sir_list'].append(sir[0])
            stats['bg']['sar_list'].append(sar[0])
    
            cnt += 1
        
    avg = {}
    for e in ['sdr', 'sir', 'sar']:
        avg[e] = []
        
    for event_type in stats.keys():
        logging.info(event_type)
        for evaluate_type in stats[event_type]:
            tmp = np.mean(stats[event_type][evaluate_type])
            logging.info((evaluate_type, tmp))
            avg[evaluate_type[0:3]].append(tmp)

    logging.info("Average stats:")
    for e in ['sdr', 'sir', 'sar']:
        logging.info("%s, %f" % (e, np.mean(avg[e])))
Example #4
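
`get_inverse_W` below is not defined in this listing either. A minimal sketch, assuming it maps mel-domain masks back to linear STFT bins by normalizing the mel filterbank per frequency bin; `eps` is an illustrative guard, not from the original code:

def get_inverse_W(W, eps=1e-10):
    """Sketch: per-frequency normalization of a (n_mels, n_freq) mel
    filterbank, so np.dot(mel_mask, get_inverse_W(W)) spreads a
    mel-domain mask across the linear frequency bins it covers."""
    return W / np.maximum(W.sum(axis=0, keepdims=True), eps)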
def jsc_separation(args):
    """Joint separation-classification (JSC) source separation.
    """
    workspace = args.workspace

    scaler_path = os.path.join(workspace, "scalers", "logmel",
                               "training.scaler")
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)

    md_path = os.path.join(workspace, "models", "main", args.model_name)
    md = serializations.load(md_path)

    out_dir = os.path.join(workspace, "separated_wavs", "jsc_separation")
    pp_data.create_folder(out_dir)

    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())

    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)

    for na in names:
        if ".mix" in na:
            # Read yaml
            bare_name = os.path.splitext(os.path.splitext(na)[0])[0]
            yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_name)
            with open(yaml_path, 'r') as f:
                data = yaml.safe_load(f)
            event_type = data['event_type']
            print(na, event_type)

            # Read audio
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, _) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            # Spectrogram
            (f, t, bg_spec) = signal.spectrogram(x=bg_audio,
                                                 window=ham_win,
                                                 nperseg=n_window,
                                                 noverlap=n_overlap,
                                                 detrend=False,
                                                 return_onesided=True,
                                                 scaling='density',
                                                 mode='complex')

            (f, t, event_spec) = signal.spectrogram(x=event_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')

            (f, t, mixed_spec) = signal.spectrogram(x=mixed_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')

            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            # Log Mel spectrogram
            mixed_x = pp_data.calc_feat(mixed_audio)
            x3d = pp_data.do_scaler_on_x3d(mixed_x[np.newaxis, ...], scaler)

            # Segmentation masks
            [mel_masks] = md.run_function(f_forward,
                                          x3d,
                                          batch_size=10,
                                          tr_phase=0.)
            mel_masks = mel_masks[0]  # (n_events + 1, n_time, 64)
            spec_masks = np.dot(mel_masks, inverse_melW)  # (n_events + 1, n_time, 513)

            if args.plot_only:
                mixed_mel_spec = np.dot(np.abs(mixed_spec), melW.T)
                bg_mel_spec = np.dot(np.abs(bg_spec), melW.T)
                event_mel_spec = np.dot(np.abs(event_spec), melW.T)
                ratio = 1.7  # 5 dB
                event_mask = (np.sign(event_mel_spec /
                                      (bg_mel_spec * ratio) - 1) + 1) / 2

                fig, axs = plt.subplots(3, 2, sharex=True)
                axs[0, 0].matshow(np.log(mixed_mel_spec.T),
                                  origin='lower',
                                  aspect='auto')
                axs[0, 1].matshow(event_mask.T, origin='lower', aspect='auto')
                axs[1, 0].matshow(spec_masks[0].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[1, 1].matshow(spec_masks[1].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[2, 0].matshow(spec_masks[2].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[2, 1].matshow(spec_masks[3].T,
                                  origin='lower',
                                  aspect='auto',
                                  vmin=0.,
                                  vmax=1.)
                axs[0, 0].set_title('log Mel of mixture')
                axs[0, 1].set_title('IBM of event')
                axs[1, 0].set_title('babycry')
                axs[1, 1].set_title('glassbreak')
                axs[2, 0].set_title('gunshot')
                axs[2, 1].set_title('bg')

                plt.show()

            else:
                # Separated spec
                separated_specs = spec_masks * np.abs(mixed_spec)[None, :, :]

                # Write out all events and bg
                enlarged_events = cfg.events + ['bg']
                for i1 in range(len(enlarged_events)):
                    s = spectrogram_to_wave.recover_wav(
                        separated_specs[i1],
                        mixed_spec,
                        n_overlap=n_overlap,
                        winfunc=np.hamming,
                        wav_len=len(mixed_audio))
                    s *= recover_scaler
                    pp_data.write_audio(
                        os.path.join(
                            out_dir, "%s.sep_%s.wav" %
                            (bare_name, enlarged_events[i1])), s, fs)

                # Write out event
                s = spectrogram_to_wave.recover_wav(
                    separated_specs[cfg.lb_to_ix[event_type]],
                    mixed_spec,
                    n_overlap=n_overlap,
                    winfunc=np.hamming,
                    wav_len=len(mixed_audio))
                s *= recover_scaler
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_event.wav" % bare_name), s,
                    fs)

                # Write out origin mix
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_mix.wav" % bare_name),
                    mixed_audio, fs)
Example #5
def recognize(args):
    """Run audio tagging (AT) and sound event detection (SED) inference
    on the test mixtures and dump predictions and ground truths.
    """
    workspace = args.workspace
    md_path = os.path.join(workspace, "models", pp_data.get_filename(__file__),
                           args.model_name)
    t1 = time.time()

    # Load scaler.
    scaler_path = os.path.join(workspace, "scalers", "logmel",
                               "training.scaler")
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)

    # Load model.
    md = serializations.load(md_path)

    # Observe function.
    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    at_pd_ary = []
    at_gt_ary = []
    sed_pd_ary = []
    sed_gt_ary = []

    # For all audio clips.
    for na in names:
        if '.mix_0db.wav' in na:
            logging.info(na)

            # Load audio.
            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            # Load yaml.
            yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_na)
            with open(yaml_path, 'r') as f:
                data = yaml.safe_load(f)
            event_type = data['event_type']

            # Calculate feature.
            x = pp_data.calc_feat(mixed_audio)
            x3d = pp_data.do_scaler_on_x3d(x[np.newaxis, ...], scaler)

            # Ground truth.
            gt_y = [0, 0, 0, 0]  # (n_events + 1,): 3 events + bg
            gt_y[cfg.lb_to_ix[event_type]] = 1
            at_gt_ary.append(gt_y)

            # Audio tagging (AT) prediction.
            [pred_y] = md.predict(x3d)  # (1, n_events+1)
            pred_y = pred_y[0]  # (n_events+1,)
            at_pd_ary.append(pred_y)

            # Sound event detection (SED) prediction.
            [masks] = md.run_function(
                f_forward, x3d, batch_size=10,
                tr_phase=0.)  # (1, n_events+1, n_time, n_freq)
            masks = masks[0]  # (n_events+1, n_time, n_freq)
            sed_pd = np.mean(masks, axis=-1).T  # (n_time, n_events+1)
            sed_pd_ary.append(sed_pd)
            sed_gt = np.zeros_like(sed_pd)
            [bgn_sec, fin_sec] = data['event_segment']
            bgn_fr = int(bgn_sec * cfg.sample_rate /
                         float(cfg.n_window - cfg.n_overlap))
            fin_fr = int(fin_sec * cfg.sample_rate /
                         float(cfg.n_window - cfg.n_overlap))
            sed_gt[bgn_fr:fin_fr, cfg.lb_to_ix[event_type]] = 1
            sed_gt_ary.append(sed_gt)

    at_pd_ary = np.array(at_pd_ary)
    at_gt_ary = np.array(at_gt_ary)
    sed_pd_ary = np.array(sed_pd_ary)
    sed_gt_ary = np.array(sed_gt_ary)

    # Write out AT and SED presence probabilities.
    logging.info("at_pd_ary.shape: %s" % (at_pd_ary.shape, ))
    logging.info("at_gt_ary.shape: %s" % (at_gt_ary.shape, ))
    logging.info("sed_pd_ary.shape: %s" % (sed_pd_ary.shape, ))
    logging.info("sed_gt_ary.shape: %s" % (sed_gt_ary.shape, ))
    out_dict = {}
    out_dict['at_pd_ary'] = at_pd_ary
    out_dict['at_gt_ary'] = at_gt_ary
    out_dict['sed_pd_ary'] = sed_pd_ary
    out_dict['sed_gt_ary'] = sed_gt_ary
    out_path = os.path.join(workspace, "_tmp", "_at_sed_dict.p")
    pp_data.create_folder(os.path.dirname(out_path))
    with open(out_path, 'wb') as f:
        pickle.dump(out_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info("Recognize time: %s" % (time.time() - t1, ))