Example No. 1
def demo(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate; must equal the
          n_concat used in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "FullyCNN.h5")
    model = load_model(model_path)

    # Load test data.
    if args.online:
        print('recording....')
        recordfile = 'record.wav'
        my_record(recordfile, 16000, 2)
        print('recording end')
        (data, _) = pp_data.read_audio(recordfile, 16000)
    else:
        testfile = 'data_cache/test_speech/1568253725.587787.wav'
        (data, _) = pp_data.read_audio(testfile, 16000)
    mixed_complx_x = pp_data.calc_sp(data, mode='complex')
    mixed_x, mixed_phase = divide_magphase(mixed_complx_x, power=1)

    # Predict.
    pred = model.predict(mixed_x)
    # Recover enhanced wav.
    pred_sp = pred  # np.exp(pred)
    hop_size = n_window - n_overlap
    ham_win = np.sqrt(np.hanning(n_window))
    stft_reconstructed_clean = merge_magphase(pred_sp, mixed_phase)
    stft_reconstructed_clean = stft_reconstructed_clean.T
    signal_reconstructed_clean = librosa.istft(stft_reconstructed_clean, hop_length=hop_size, window=ham_win)
    signal_reconstructed_clean = signal_reconstructed_clean * 32768  # scale float [-1, 1] to the int16 range
    s = signal_reconstructed_clean.astype('int16')

    # Write out enhanced wav.
    # out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na)
    # pp_data.create_folder(os.path.dirname(out_path))
    pp_data.write_audio('1568253725.587787ehs.wav', s, fs)
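
Note: divide_magphase and merge_magphase are not shown in this snippet. A minimal sketch of what they are assumed to do here, namely split a complex STFT into magnitude and unit-phase factors and recombine them:

import numpy as np

def divide_magphase(D, power=1):
    """Split complex spectrogram D into magnitude (raised to `power`) and unit phase."""
    mag = np.abs(D) ** power
    phase = np.exp(1j * np.angle(D))
    return mag, phase

def merge_magphase(mag, phase):
    """Recombine magnitude and unit phase into a complex spectrogram."""
    return mag * phase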
Example No. 2
def predict_file(file_path, model, scaler):

    (a, _) = pp.read_audio(file_path)
    mixed_complex = pp.calc_sp(a, 'complex')

    mixed_x = np.abs(mixed_complex)

    # Process data.
    n_pad = (conf1.n_concat - 1) // 2  # integer number of frames padded on each side
    mixed_x = pp.pad_with_border(mixed_x, n_pad)
    mixed_x = pp.log_sp(mixed_x)
    # speech_x = dnn1_train.log_sp(speech_x)


    # Scale data.
    # if scale:
    mixed_x = pp.scale_on_2d(mixed_x, scaler)
    # speech_x = pp.scale_on_2d(speech_x, scaler)

    # Cut input spectrogram to 3D segments with n_concat.
    mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

    # Predict.
    pred = model.predict(mixed_x_3d)

    if visualize_plot:
        visualize(mixed_x, pred)
    # Inverse scale.
    # if scale:
    mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
    # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
    pred = pp.inverse_scale_on_2d(pred, scaler)


    # Debug plot.

    # Recover enhanced wav.
    pred_sp = np.exp(pred)
    s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())  # compensate for the amplitude
    # change introduced by the analysis window and IFFT.

    # Write out enhanced wav.

    # audio_path = os.path.dirname(file_path)
    # pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_complex, pred, s
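
Note: pp.mat_2d_to_3d is assumed to cut the (n_frames, n_freq) spectrogram into overlapping n_concat-frame segments that the DNN consumes. A minimal sketch of that framing under that assumption:

import numpy as np

def mat_2d_to_3d(x, agg_num, hop):
    """Slide a window of agg_num frames over the time axis with the given hop
    and stack the segments into a (n_segments, agg_num, n_freq) array."""
    n_frames = x.shape[0]
    segs = [x[i:i + agg_num] for i in range(0, n_frames - agg_num + 1, hop)]
    return np.array(segs)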
def plot_fig4(data_type, audio_idx):
    workspace = cfg.workspace
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    events = cfg.events
    te_fold = cfg.te_fold

    # Read audio.
    audio_path = os.path.join(
        workspace, "mixed_audio/n_events=3/%s.mixed_20db.wav" % audio_idx)
    (audio, _) = pp_data.read_audio(audio_path, fs)

    # Calculate log Mel.
    x = _calc_feat(audio)
    sp = _calc_spectrogram(audio)
    print(x.shape)

    # Plot.
    fig, axs = plt.subplots(4, 4, sharex=False)

    # Plot log Mel spectrogram.
    for i2 in range(16):
        axs[i2 // 4, i2 % 4].set_visible(False)

    axs[0, 0].matshow(x.T, origin='lower', aspect='auto', cmap='jet')
    axs[0, 0].xaxis.set_ticks([0, 60, 120, 180, 239])
    axs[0, 0].xaxis.tick_bottom()
    axs[0, 0].xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
    axs[0, 0].set_xlabel("time (s)")
    # axs[0,0].xaxis.set_label_coords(1.12, -0.05)

    axs[0, 0].yaxis.set_ticks([0, 16, 32, 48, 63])
    axs[0, 0].yaxis.set_ticklabels([0, 16, 32, 48, 63])
    axs[0, 0].set_ylabel('Mel freq. bin')

    axs[0, 0].set_title("Log Mel spectrogram")
    axs[0, 0].set_visible(True)

    # Plot spectrogram.
    axs[0, 2].matshow(np.log(sp.T + 1.),
                      origin='lower',
                      aspect='auto',
                      cmap='jet')
    axs[0, 2].xaxis.set_ticks([0, 60, 120, 180, 239])
    axs[0, 2].xaxis.tick_bottom()
    axs[0, 2].xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
    axs[0, 2].set_xlabel("time (s)")
    # axs[0,2].xaxis.set_label_coords(1.12, -0.05)

    axs[0, 2].yaxis.set_ticks([0, 128, 256, 384, 512])
    axs[0, 2].yaxis.set_ticklabels([0, 128, 256, 384, 512])
    axs[0, 2].set_ylabel('FFT freq. bin')

    axs[0, 2].set_title("Spectrogram")
    axs[0, 2].set_visible(True)

    # plt.tight_layout()
    plt.show()

    # Load data.
    snr = 20
    n_events = 3
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)

    if data_type == "train":
        x = tr_x
        at_y = tr_at_y
        sed_y = tr_sed_y
        na_list = tr_na_list
    elif data_type == "test":
        x = te_x
        at_y = te_at_y
        sed_y = te_sed_y
        na_list = te_na_list

    for (i1, na) in enumerate(na_list):
        if audio_idx in na:
            idx = i1
    print(idx)

    # GT mask
    (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
    event_audio = stereo_audio[:, 0]
    noise_audio = stereo_audio[:, 1]
    mixed_audio = event_audio + noise_audio

    ham_win = np.hamming(n_window)
    mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                     n_overlap)
    mixed_sp = np.abs(mixed_cmplx_sp)
    event_sp = np.abs(
        pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
    noise_sp = np.abs(
        pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

    db = -5.
    gt_mask = (np.sign(20 * np.log10(event_sp / noise_sp) - db) +
               1.) / 2.  # (n_time, n_freq)
    fig, axs = plt.subplots(4, 4, sharex=True)
    for i2 in range(16):
        ind_gt_mask = gt_mask * sed_y[idx, :, i2][:, None]
        axs[i2 // 4, i2 % 4].matshow(ind_gt_mask.T,
                                     origin='lower',
                                     aspect='auto',
                                     cmap='jet')
        # axs[i2 // 4, i2 % 4].set_title(events[i2])
        axs[i2 // 4, i2 % 4].xaxis.set_ticks([])
        axs[i2 // 4, i2 % 4].yaxis.set_ticks([])
        axs[i2 // 4, i2 % 4].set_xlabel('time')
        axs[i2 // 4, i2 % 4].set_ylabel('FFT freq. bin')
    plt.show()

    for filename in ["tmp01", "tmp02", "tmp03"]:
        # Plot up sampled seg masks.
        preds_dir = os.path.join(workspace, "preds", filename,
                                 "n_events=%d" % n_events, "fold=%d" % te_fold,
                                 "snr=%d" % snr)

        at_probs_list, seg_masks_list = [], []
        bgn_iter, fin_iter, interval = 2000, 3001, 200
        for iter in range(bgn_iter, fin_iter, interval):
            seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iter,
                                          "seg_masks.p")
            seg_masks = pickle.load(open(seg_masks_path, 'rb'))
            seg_masks_list.append(seg_masks)
        seg_masks = np.mean(seg_masks_list,
                            axis=0)  # (n_clips, n_classes, n_time, n_freq)

        print(at_y[idx])

        melW = librosa.filters.mel(sr=fs,
                                   n_fft=cfg.n_window,
                                   n_mels=64,
                                   fmin=0.,
                                   fmax=fs / 2)
        inverse_melW = get_inverse_W(melW)

        spec_masks = np.dot(seg_masks[idx],
                            inverse_melW)  # (n_classes, n_time, 513)

        fig, axs = plt.subplots(4, 4, sharex=True)
        for i2 in range(16):
            axs[i2 // 4, i2 % 4].matshow(spec_masks[i2].T,
                                         origin='lower',
                                         aspect='auto',
                                         vmin=0,
                                         vmax=1,
                                         cmap='jet')
            # axs[i2 // 4, i2 % 4].set_title(events[i2])
            axs[i2 // 4, i2 % 4].xaxis.set_ticks([])
            axs[i2 // 4, i2 % 4].yaxis.set_ticks([])
            axs[i2 // 4, i2 % 4].set_xlabel('time')
            axs[i2 // 4, i2 % 4].set_ylabel('FFT freq. bin')
        fig.suptitle(filename)
        plt.show()

        # Plot SED probs.
        sed_probs = np.mean(seg_masks[idx], axis=-1)  # (n_classes, n_time)
        fig, axs = plt.subplots(4, 4, sharex=False)
        for i2 in range(16):
            axs[i2 // 4, i2 % 4].set_visible(False)
        axs[0, 0].matshow(sed_probs,
                          origin='lower',
                          aspect='auto',
                          vmin=0,
                          vmax=1,
                          cmap='jet')
        # axs[0, 0].xaxis.set_ticks([0, 60, 120, 180, 239])
        # axs[0, 0].xaxis.tick_bottom()
        # axs[0, 0].xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
        axs[0, 0].xaxis.set_ticks([])
        # axs[0, 0].set_xlabel('time (s)')
        axs[0, 0].yaxis.set_ticks(list(range(len(events))))
        axs[0, 0].yaxis.set_ticklabels(events)
        axs[0, 0].tick_params(axis='y', labelsize=8)
        axs[0, 0].set_visible(True)

        axs[1, 0].matshow(sed_y[idx].T,
                          origin='lower',
                          aspect='auto',
                          vmin=0,
                          vmax=1,
                          cmap='jet')
        # axs[1, 0].xaxis.set_ticks([])
        axs[1, 0].xaxis.set_ticks([0, 60, 120, 180, 239])
        axs[1, 0].xaxis.tick_bottom()
        axs[1, 0].xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
        axs[1, 0].set_xlabel('time (s)')
        axs[1, 0].yaxis.set_ticks(list(range(len(events))))
        axs[1, 0].yaxis.set_ticklabels(events)
        axs[1, 0].tick_params(axis='y', labelsize=8)
        axs[1, 0].set_visible(True)
        fig.suptitle(filename)
        plt.show()
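
Note: the gt_mask computed above is an ideal binary mask: 1 wherever the local event-to-noise ratio exceeds the -5 dB threshold. A toy worked example with made-up magnitudes:

import numpy as np

event_sp = np.array([[1.0, 0.1], [0.5, 2.0]])
noise_sp = np.array([[0.5, 1.0], [0.5, 0.1]])
snr_db = 20 * np.log10(event_sp / noise_sp)     # [[6.02, -20.], [0., 26.02]]
gt_mask = (np.sign(snr_db - (-5.)) + 1.) / 2.
print(gt_mask)  # [[1., 0.], [1., 1.]]; note sign(0) would yield 0.5 exactly at the threshold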
Example No. 4
def prepare_database():

    (noise, _) = pp.read_audio(conf1.noise_path)

    with open('dnn1/dnn1_files_list.txt') as f:
        dnn1_data = f.readlines()

    # generate train spectrograms
    mixed_all = []
    clean_all = []

    snr1_list = []
    mixed_avg = []

    for n in range(conf1.training_number):
        current_file = (random.choice(dnn1_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)

        mixed, noise_new, clean_new, snr = set_microphone_at_distance(
            clean, noise, conf1.fs, dist)

        snr1_list.append(snr)
        mixed_avg.append(np.mean(mixed))

        if n % 10 == 0:
            print(n)

        if conf1.save_single_files and n < conf1.n_files_to_save:

            sr = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(5))

            path_list = current_file.split(os.sep)
            mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                           os.path.basename(current_file))
            clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                             os.path.basename(current_file))

            mixed_path = os.path.join(conf1.train_folder, mixed_name)
            clean_path = os.path.join(conf1.train_folder, clean_name)

            pp.write_audio(mixed_path, mixed, conf1.fs)
            pp.write_audio(clean_path, clean_new, conf1.fs)

        clean_spec = pp.calc_sp(clean_new, mode='magnitude')
        mixed_spec = pp.calc_sp(mixed, mode='complex')

        clean_all.append(clean_spec)
        mixed_all.append(mixed_spec)

    print(len(clean_all), ',', len(mixed_all))
    num_tr = pp.pack_features(mixed_all, clean_all, 'train')

    compute_scaler('train')

    # generate test spectrograms
    mixed_all = []
    clean_all = []

    snr1_list = []
    mixed_avg = []

    for n in range(conf1.test_number):
        current_file = (random.choice(dnn1_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)

        mixed, noise_new, clean_new, snr = set_microphone_at_distance(
            clean, noise, conf1.fs, dist)

        snr1_list.append(snr)
        mixed_avg.append(np.mean(mixed))

        if n % 10 == 0:
            print(n)

        if conf1.save_single_files and n < conf1.n_files_to_save:

            sr = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(5))

            path_list = current_file.split(os.sep)
            mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                           os.path.basename(current_file))
            clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                             os.path.basename(current_file))

            mixed_path = os.path.join(conf1.test_folder, mixed_name)
            clean_path = os.path.join(conf1.test_folder, clean_name)

            pp.write_audio(mixed_path, mixed, conf1.fs)
            pp.write_audio(clean_path, clean_new, conf1.fs)

        clean_spec = pp.calc_sp(clean_new, mode='magnitude')
        mixed_spec = pp.calc_sp(mixed, mode='complex')

        clean_all.append(clean_spec)
        mixed_all.append(mixed_spec)

    print(len(clean_all), ',', len(mixed_all))

    num_te = pp.pack_features(mixed_all, clean_all, 'test')

    compute_scaler('test')

    return num_tr, num_te
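
Note: set_microphone_at_distance is not defined in this snippet; how it maps distance to attenuation and SNR is internal to it. A hedged sketch of the operation it presumably builds on, mixing clean speech with noise at a chosen SNR (mix_at_snr is a hypothetical helper, not the original function):

import numpy as np

def mix_at_snr(clean, noise, snr_db):
    """Mix clean speech with noise at a target SNR in dB (illustrative only)."""
    noise = np.resize(noise, clean.shape)  # loop/trim the noise to the clean length
    clean_power = np.mean(clean ** 2)
    noise_power = np.mean(noise ** 2)
    # Scale so that 10*log10(clean_power / scaled_noise_power) == snr_db.
    scale = np.sqrt(clean_power / (noise_power * 10 ** (snr_db / 10.0)))
    return clean + scale * noise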
def separate(args, bgn_iter, fin_iter, interval):
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)

    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    # Load and sum
    preds_dir = os.path.join(workspace, "preds",
                             pp_data.get_filename(__file__),
                             "n_events=%d" % n_events, "fold=%d" % te_fold,
                             "snr=%d" % snr)

    at_probs_list, seg_masks_list = [], []
    for iter in range(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iter,
                                      "seg_masks.p")
        seg_masks = pickle.load(open(seg_masks_path, 'rb'))
        seg_masks_list.append(seg_masks)
    seg_masks = np.mean(seg_masks_list,
                        axis=0)  # (n_clips, n_classes, n_time, n_freq)

    print(seg_masks.shape)

    #
    audio_dir = os.path.join(workspace, "mixed_audio",
                             "n_events=%d" % n_events)

    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())
    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)

    seg_stats = {}
    for e in events:
        seg_stats[e] = {
            'fvalue': [],
            'auc': [],
            'iou': [],
            'hit': [],
            'fa': [],
            'tp': [],
            'fn': [],
            'fp': []
        }

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                         n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(
            pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(
            pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]  # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)

        print(na)

        # Write out separated events.
        for j1 in range(len(events)):
            if at_y[i1][j1] == 1:
                (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(sm_upsampled[j1],
                                                            event_sp,
                                                            noise_sp,
                                                            sed_y[i1, :, j1],
                                                            seg_thres,
                                                            inside_only=True)
                (hit, fa) = hit_fa(sm_upsampled[j1],
                                   event_sp,
                                   noise_sp,
                                   sed_y[i1, :, j1],
                                   seg_thres,
                                   inside_only=True)
                seg_stats[events[j1]]['fvalue'].append(fvalue)
                seg_stats[events[j1]]['auc'].append(auc)
                seg_stats[events[j1]]['iou'].append(iou)
                seg_stats[events[j1]]['hit'].append(hit)
                seg_stats[events[j1]]['fa'].append(fa)
                seg_stats[events[j1]]['tp'].append(tp)
                seg_stats[events[j1]]['fn'].append(fn)
                seg_stats[events[j1]]['fp'].append(fp)

                sep_event_sp = sm_upsampled[j1] * mixed_sp
                sep_event_s = spectrogram_to_wave.recover_wav(
                    sep_event_sp,
                    mixed_cmplx_sp,
                    n_overlap=n_overlap,
                    winfunc=np.hamming,
                    wav_len=int(fs * clip_duration))
                sep_event_s *= recover_scaler

                out_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise.
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(sep_noise_sp,
                                                      mixed_cmplx_sp,
                                                      n_overlap=n_overlap,
                                                      winfunc=np.hamming,
                                                      wav_len=int(
                                                          fs * clip_duration))
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

        cnt += 1
        # if cnt == 2: break


    fvalues, aucs, ious, hits, fas, tps, fns, fps = [], [], [], [], [], [], [], []
    for e in events:
        fvalues.append(np.mean(seg_stats[e]['fvalue']))
        ious.append(np.mean(seg_stats[e]['iou']))
        aucs.append(np.mean(seg_stats[e]['auc']))
        hits.append(np.mean(seg_stats[e]['hit']))
        fas.append(np.mean(seg_stats[e]['fa']))
        tps.append(np.mean(seg_stats[e]['tp']))
        fns.append(np.mean(seg_stats[e]['fn']))
        fps.append(np.mean(seg_stats[e]['fp']))

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP" %
                 ("".ljust(16)))
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
        ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
         np.mean(ious), np.mean(hits), np.mean(fas), np.mean(hits) -
         np.mean(fas), np.mean(tps), np.mean(fns), np.mean(fps)))
    for i1 in range(len(events)):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
            (events[i1].ljust(16), fvalues[i1], aucs[i1], ious[i1], hits[i1],
             fas[i1], hits[i1] - fas[i1], tps[i1], fns[i1], fps[i1]))
Example No. 6
def inference_wiener(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model.
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % iter)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()

        model_dict[e] = model

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = pickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) // mini_num  # integer stride through the file list
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)

            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                plt.show()
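
Note: the Wiener-filter step above forms a magnitude-domain mask from the two DNN outputs and applies it to the mixture magnitude. Restated as a small helper (the eps term guards against division by zero and is an addition, not in the original):

import numpy as np

def wiener_mask(speech_mag, noise_mag, eps=1e-10):
    """M = S / (S + N): each time-frequency bin keeps the estimated speech fraction."""
    return speech_mag / (speech_mag + noise_mag + eps)

# Equivalent to the inline expression above:
# pred_mag_sp = wiener_mask(pred_dict['speech'], pred_dict['noise']) * np.abs(cmplx_sp)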
Example No. 7
File: tmp01.py Project: zqy1/sednn
def inference(args):
    workspace = args.workspace
    model_name = args.model_name
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model
    model_path = os.path.join(workspace, "models", filename, model_name)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Load scalar
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = pickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) // mini_num  # integer stride through the file list
    else:
        n_every = 1

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)

            audio = pp_data.normalize(audio)
            sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(sp)

            # Process data.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            output = forward(model, x, mean_, std_, cuda)
            output = output.data.cpu().numpy()

            print(output.shape)
            if visualize:
                fig, axs = plt.subplots(2, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(sp)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(np.log(np.abs(output)).T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                plt.show()

            # The original ends here with "import crash" / "pause", a deliberate
            # crash used as a debug breakpoint after the first processed file.
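
Note: forward and move_data_to_gpu are not shown in this snippet. A sketch of the assumed behavior, standardizing the input with the training-set statistics before running the DNN; it is written against modern PyTorch, whereas volatile=True in the snippet indicates the pre-0.4 Variable API:

import torch

def forward(model, x, mean_, std_, cuda):
    # Assumed behavior: standardize with training-set statistics, then run the DNN.
    x = torch.as_tensor(x, dtype=torch.float32)
    if cuda:
        x = x.cuda()
    with torch.no_grad():  # modern replacement for volatile=True
        return model((x - mean_) / std_)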
Example No. 8
def dab_run(snr_list, file_name="dab_out", mode='dab'):

    output_file_folder = os.path.join("data_eval", mode)

    # removing previous enhancements
    for file in os.listdir(os.path.join("data_eval", "dnn1_out")):
        file_path = os.path.join("data_eval", "dnn1_out", file)
        os.remove(file_path)

    dnn1_inputs, dnn1_outputs = dnn1.predict_folder(
        os.path.join("data_eval", "dnn1_in"),
        os.path.join("data_eval", "dnn1_out"))

    names = [
        f for f in sorted(os.listdir(os.path.join("data_eval", "dnn1_out")))
        if f.startswith("enh")
    ]
    dnn1_outputs = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join("data_eval", "dnn1_out", na)
        (a, _) = pp.read_audio(file_path)
        enh_complex = pp.calc_sp(a, 'complex')
        dnn1_outputs.append(enh_complex)

    # s2nrs = dnn2.predict("data_eval/dnn1_in", "data_eval/dnn1_out")

    # snr = np.array([5.62, 1.405, 0.703, 0.281])
    # snr = np.array([5.62, 2.81, 1.875, 1.406])
    s2nrs = list(snr_list)  # copy so snr_list is left untouched
    for i in range(len(snr_list)):
        s2nrs[i] = 1 / (1 + 1 / snr_list[i])  # signal's share of signal-plus-noise power

    ch_rw_outputs = []
    # calculate channel weights
    if mode == 'dab':
        new_weights = channel_weights(s2nrs)
        print(new_weights)
        # multiply enhanced audio for the corresponding weight
        for i, p in zip(dnn1_outputs, new_weights):
            ch_rw_outputs.append(p * i)

    # cancel reweighting if db mode
    if mode == 'db':
        new_weights = s2nrs
        print(new_weights)
        ch_rw_outputs = dnn1_outputs

    # execute mvdr
    final = mvdr(dnn1_inputs, ch_rw_outputs)

    (init,
     _) = pp.read_audio(os.path.join('data_eval', 'test_speech', file_name))
    init_sp = pp.calc_sp(init, mode='complex')

    visualize(dnn1_colors(np.abs(init_sp)), dnn1_colors(np.abs(final)),
              "source amplitude", "final amplitude")

    # Recover and save enhanced wav
    pp.create_folder(output_file_folder)
    s = recover_wav_complex(final, conf1.n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())  # compensate for the amplitude
    # change introduced by the analysis window and IFFT.
    audio_path = os.path.join(output_file_folder, file_name)
    pp.write_audio(audio_path, s, conf1.sample_rate)

    print('%s done' % mode)
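
Note: the per-channel weight computed in dab_run simplifies algebraically: 1 / (1 + 1/snr) == snr / (snr + 1), i.e. the signal's share of signal-plus-noise power. A quick check with the SNR values from the commented-out arrays above:

for snr in [5.62, 1.405, 0.703, 0.281]:
    print(round(1 / (1 + 1 / snr), 4), round(snr / (snr + 1), 4))  # identical columns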
Example No. 9
def predict_folder(input_file_folder, output_file_folder):
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir, "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler.
    # if scale:
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    # names = os.listdir(input_file_folder)

    names = [f for f in sorted(os.listdir(input_file_folder)) if f.startswith("mix")]

    mixed_all = []
    pred_all = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')


        mixed_x = np.abs(mixed_complex)

        # Process data.
        n_pad = (conf1.n_concat - 1) // 2  # integer number of frames padded on each side
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)
        # speech_x = dnn1_train.log_sp(speech_x)

        # Scale data.
        # if scale:
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)


        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        # if scale:
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        # speech_x = dnn1_train.inverse_scale_on_2d(speech_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))


        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())  # compensate for the amplitude
        # change introduced by the analysis window and IFFT.

        # Write out enhanced wav.

        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all
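
Note: the compensation factor applied after recover_wav rescales by the analysis window's energy, on the assumption that the spectrogram/ISTFT round trip in this pipeline attenuates amplitudes by sqrt(sum(w[n]**2)). An illustration of the constant (n_window = 512 is illustrative; the real value comes from conf1.n_window):

import numpy as np

w = np.hamming(512)
print(np.sqrt((w ** 2).sum()))  # the scaler multiplied into s above, about 14.3 for 512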