示例#1
0
def separate(args, bgn_iter, fin_iter, interval):
    """Separate every test clip into per-event and noise audio, then log stats.

    The segmentation masks predicted by several saved model iterations are
    averaged, mapped back to linear-frequency bins with ``inverse_melW`` and
    multiplied with the mixture magnitude spectrogram. Each event that is
    tagged as present in a clip, plus the residual noise, is written to
    ``<workspace>/sep_audio/...``. Segmentation statistics are accumulated
    per event and logged at the end.

    Args:
      args: object providing ``n_events`` and ``snr``.
      bgn_iter: int, first model iteration whose masks are loaded.
      fin_iter: int, end of the iteration range (exclusive).
      interval: int, step between the iterations that are loaded.

    NOTE(review): ``is_scale`` and ``seg_thres`` are read from an enclosing
    scope that is not visible here -- presumably module-level settings.
    """
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    n_events = args.n_events
    snr = args.snr

    # Load ground truth data. Only the test-fold targets and names are used.
    feature_dir = os.path.join(workspace, "features", "logmel", "n_events=%d" % n_events)
    audio_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (_, _, _, _, _, at_y, sed_y, na_list) = pp_data.load_data(
        feature_dir=feature_dir,
        yaml_dir=audio_dir,
        te_fold=te_fold,
        snr=snr,
        is_scale=is_scale)

    # Load the masks of every requested iteration and average them.
    preds_dir = os.path.join(workspace, "preds", pp_data.get_filename(__file__),
                             "n_events=%d" % n_events, "fold=%d" % te_fold, "snr=%d" % snr)

    seg_masks_list = []
    for iteration in range(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iteration, "seg_masks.p")
        # NOTE(review): pickle must only be used on trusted prediction files.
        with open(seg_masks_path, 'rb') as seg_masks_file:
            seg_masks_list.append(cPickle.load(seg_masks_file))
    seg_masks = np.mean(seg_masks_list, axis=0)  # (n_clips, n_classes, n_time, n_freq)

    print(seg_masks.shape)

    sep_dir = os.path.join(workspace, "sep_audio", pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold, "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    # Compensates the amplitude change introduced by the windowed STFT/ISTFT.
    recover_scaler = np.sqrt((ham_win ** 2).sum())
    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)
    wav_len = int(fs * clip_duration)

    stat_names = ('fvalue', 'auc', 'iou', 'hit', 'fa', 'tp', 'fn', 'fp')
    seg_stats = {}
    for e in events:
        seg_stats[e] = dict((key, []) for key in stat_names)

    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
        # Channel 0 holds the events, channel 1 the noise; their sum is the mixture.
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window, n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]  # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)

        print(na)

        # Write out separated events (only those tagged as present in the clip).
        for (j1, event) in enumerate(events):
            if at_y[i1][j1] != 1:
                continue

            (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(sm_upsampled[j1], event_sp, noise_sp, sed_y[i1, :, j1], seg_thres, inside_only=True)
            (hit, fa) = hit_fa(sm_upsampled[j1], event_sp, noise_sp, sed_y[i1, :, j1], seg_thres, inside_only=True)
            event_stats = seg_stats[event]
            for (key, value) in zip(stat_names, (fvalue, auc, iou, hit, fa, tp, fn, fp)):
                event_stats[key].append(value)

            sep_event_sp = sm_upsampled[j1] * mixed_sp
            sep_event_s = spectrogram_to_wave.recover_wav(sep_event_sp, mixed_cmplx_sp, n_overlap=n_overlap, winfunc=np.hamming, wav_len=wav_len)
            sep_event_s *= recover_scaler

            out_event_audio_path = os.path.join(sep_dir, "%s.%s.wav" % (bare_na, event))
            pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise: whatever the event masks leave over.
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(sep_noise_sp, mixed_cmplx_sp, n_overlap=n_overlap, winfunc=np.hamming, wav_len=wav_len)
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

    # Per-event mean of every statistic, ordered like `events`.
    # An event that never occurred yields nan (np.mean of an empty list).
    means = {}
    for key in stat_names:
        means[key] = [np.mean(seg_stats[e][key]) for e in events]
    fvalues, aucs, ious = means['fvalue'], means['auc'], means['iou']
    hits, fas = means['hit'], means['fa']
    tps, fns, fps = means['tp'], means['fn'], means['fp']

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP" % ("".ljust(16)))
    logging.info("%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" % ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs), np.mean(ious), np.mean(hits), np.mean(fas), np.mean(hits) - np.mean(fas), np.mean(tps), np.mean(fns), np.mean(fps)))
    for (i1, event) in enumerate(events):
        logging.info("%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" % (event.ljust(16), fvalues[i1], aucs[i1], ious[i1], hits[i1], fas[i1], hits[i1] - fas[i1], tps[i1], fns[i1], fps[i1]))
示例#2
0
def inference(args):
    """Enhance all test features and write the recovered audio to disk.

    A fixed fully-connected network is rebuilt, its weights are loaded, and
    every feature file of the test SNR is enhanced. For each file the
    enhanced wav is written, converted to 8 kHz s16le PCM with ``./ffmpeg``,
    denoised with ``./ns`` and converted back to wav; the intermediate PCM
    files are removed afterwards.

    Args:
      args: object providing
        workspace: str, path of workspace.
        tr_snr: float, training SNR.
        te_snr: float, testing SNR.
        iteration: int, iteration of model to load (used when
            ``model_file`` is "null").
        calc_log: bool, True if the model works on scaled log spectrograms,
            False if it works on max-normalised linear spectrograms.
        model_file: str, explicit weights file, or "null" to derive the
            path from ``workspace``, ``tr_snr`` and ``iteration``.
        visualize: bool, plot enhanced spectrogram for debug.

    NOTE(review): ``args.n_concat`` is not used; the context size is fixed
    to 7 frames because the architecture below is hard-coded.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    iteration = args.iteration
    calc_log = args.calc_log
    model_file = args.model_file

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate

    # Build model
    n_concat = 7
    n_freq = 257
    n_hid = 2048
    lr = 1e-3

    model = Sequential()
    model.add(Flatten(input_shape=(n_concat, n_freq)))
    model.add(Dropout(0.1))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    # Log-domain targets can be negative, linear-domain targets cannot.
    if calc_log:
        model.add(Dense(n_freq, activation='linear'))
    else:
        model.add(Dense(n_freq, activation='relu'))
    model.summary()

    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr))

    # Load model.
    if (model_file == "null"):
        model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                                  "md_%diters.h5" % iteration)
        model.load_weights(model_path)
    else:
        model.load_weights(model_file)

    # Load scaler (only needed for the log-spectrogram pipeline).
    # NOTE(review): pickle must only be used on trusted workspace files.
    scaler = None
    if calc_log:
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "%ddb" % int(tr_snr), "scaler.p")
        with open(scaler_path, 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    # Integer division: the pad width must stay an int under Python 3 too.
    n_pad = (n_concat - 1) // 2
    # Scaler for compensate the amplitude change after spectrogram and IFFT.
    recover_scaler = np.sqrt((np.hamming(n_window) ** 2).sum())

    enh_dir = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr))
    ns_enh_dir = os.path.join(workspace, "ns_enh_wavs", "test",
                              "%ddb" % int(te_snr))

    for (cnt, feat_name) in enumerate(names):
        # Load feature. The audio name stored in the file is used for output.
        feat_path = os.path.join(feat_dir, feat_name)
        with open(feat_path, 'rb') as feat_file:
            data = cPickle.load(feat_file)
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process and scale data.
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        if calc_log:
            mixed_x = pp_data.log_sp(mixed_x)
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
        else:
            mixed_x_max = np.max(mixed_x)
            print("max of tr_x:", mixed_x_max)
            mixed_x = mixed_x / mixed_x_max

            speech_x_max = np.max(speech_x)
            print("max of speech_x:", speech_x_max)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if calc_log:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        else:
            mixed_x = mixed_x * mixed_x_max
            pred = pred * mixed_x_max

        # Debug plot. The middle axis only gets a title; the clean speech
        # spectrogram is not drawn.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T,
                           origin='lower',
                           aspect='auto',
                           cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for ax in axs:
                ax.xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = np.exp(pred) if calc_log else pred
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= recover_scaler

        # Write out enhanced wav.
        out_path = os.path.join(enh_dir, "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

        # NOTE(review): the commands below are built as shell strings, so a
        # name containing spaces or shell metacharacters will break them.
        # Write out enhanced pcm 8K pcm_s16le.
        out_pcm_path = os.path.join(enh_dir, "%s.enh.pcm" % na)
        cmd = ' '.join([
            "./ffmpeg -y -i ", out_path,
            " -f s16le -ar 8000 -ac 1 -acodec pcm_s16le ", out_pcm_path
        ])
        os.system(cmd)

        # Write out webrtc-denoised enhanced pcm 8K pcm_s16le.
        ns_out_pcm_path = os.path.join(ns_enh_dir, "%s.ns_enh.pcm" % na)
        ns_out_wav_path = os.path.join(ns_enh_dir, "%s.ns_enh.wav" % na)
        pp_data.create_folder(os.path.dirname(ns_out_pcm_path))
        cmd = ' '.join(["./ns", out_pcm_path, ns_out_pcm_path])
        os.system(cmd)
        cmd = ' '.join([
            "./ffmpeg -y -f s16le -ar 8000 -ac 1 -acodec pcm_s16le -i ",
            ns_out_pcm_path, "  ", ns_out_wav_path
        ])
        os.system(cmd)

        # Remove the intermediate PCM files.
        cmd = ' '.join(["rm ", out_pcm_path])
        os.system(cmd)
        cmd = ' '.join(["rm ", ns_out_pcm_path])
        os.system(cmd)
示例#3
0
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      args: object providing
        workspace: str, path of workspace.
        tr_snr: float, training SNR.
        te_snr: float, testing SNR.
        n_concat: int, number of frames to concatenate, should equal to
            n_concat in the training stage.
        iteration: int, iteration of model to load.
        visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iteration = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr), "md_%diters.h5" % iteration)
    model = load_model(model_path)

    # Load scaler.
    # NOTE(review): pickle must only be used on trusted workspace files.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram", "train", "%ddb" % int(tr_snr), "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)

    # Integer division: the pad width must stay an int under Python 3 too.
    n_pad = (n_concat - 1) // 2
    # Scaler for compensate the amplitude change after spectrogram and IFFT.
    recover_scaler = np.sqrt((np.hamming(n_window) ** 2).sum())

    for (cnt, feat_name) in enumerate(names):
        # Load feature. The audio name stored in the file is used for output.
        feat_path = os.path.join(feat_dir, feat_name)
        with open(feat_path, 'rb') as feat_file:
            data = cPickle.load(feat_file)
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)
        speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for ax in axs:
                ax.xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav: back from the log domain, then inverse STFT
        # using the phase of the mixture.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= recover_scaler

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
示例#4
0
def enhance_audio(workspace, speech_to_enhance_dir, train_snr, test_snr,
                  n_concat, iteration):
    """Enhance all prepared spectrogram features and write the wavs to disk.

    Features are read from
    ``<workspace>/data/speech_to_enhance/features/spectrogram/<test_snr>db``
    and the enhanced audio is written to
    ``<workspace>/data/speech_to_enhance/enhanced_wavs/<test_snr>db``.
    The total run time is printed at the end.

    Args:
      workspace: str, path of workspace.
      speech_to_enhance_dir: not used by this function; kept so existing
          callers keep working.
      train_snr: float, training SNR (selects the model and scaler).
      test_snr: float, testing SNR (selects the feature/output folders).
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iteration: int, iteration of model to load.
    """
    begin_time = time.time()

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models",
                              "{}db".format(int(train_snr)),
                              "md_{}iters.h5".format(iteration))
    model = load_model(model_path)

    # Load scaler.
    # NOTE(review): pickle must only be used on trusted workspace files.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "{}db".format(int(train_snr)),
                               "scaler.pickle")
    with open(scaler_path, "rb") as scaler_file:
        scaler = pickle.load(scaler_file)

    # Locations of the test features and of the enhanced output.
    features_dir = os.path.join(workspace, "data", "speech_to_enhance",
                                "features", "spectrogram",
                                "{}db".format(int(test_snr)))
    enhanced_audio_dir = os.path.join(workspace, "data",
                                      "speech_to_enhance", "enhanced_wavs",
                                      "{}db".format(int(test_snr)))

    # Integer division: true division would hand a float pad width to
    # pad_with_border under Python 3.
    n_pad = (n_concat - 1) // 2
    # Scaler for compensate the amplitude change after spectrogram and IFFT.
    recover_scaler = np.sqrt((np.hamming(n_window) ** 2).sum())

    for sample_id, feature_filename in enumerate(os.listdir(features_dir)):
        # Load feature.
        feature_path = os.path.join(features_dir, feature_filename)
        with open(feature_path, "rb") as feature_file:
            feature_data = pickle.load(feature_file)

        mixed_audio_complex_spectrogram, audio_name = feature_data
        mixed_audio_spectrogram = np.abs(mixed_audio_complex_spectrogram)

        # Process data.
        mixed_audio_spectrogram = audio_utils.pad_with_border(
            mixed_audio_spectrogram, n_pad)
        mixed_audio_spectrogram = audio_utils.log_sp(mixed_audio_spectrogram)

        # Scale data.
        if scale:
            mixed_audio_spectrogram = audio_utils.scale_on_2d(
                mixed_audio_spectrogram, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_audio_spectrogram_3d = audio_utils.mat_2d_to_3d(
            mixed_audio_spectrogram, agg_num=n_concat, hop=1)

        # Predict.
        prediction = model.predict(mixed_audio_spectrogram_3d)
        print("Sample id: {}. sample name: {}".format(sample_id, audio_name))

        # Inverse scale.
        if scale:
            prediction = audio_utils.inverse_scale_on_2d(prediction, scaler)

        # Recover enhanced wav: back from the log domain, then inverse STFT
        # using the phase of the mixture.
        prediction_spectrogram = np.exp(prediction)
        recovered_wave = recover_wav(prediction_spectrogram,
                                     mixed_audio_complex_spectrogram,
                                     n_overlap, np.hamming)
        recovered_wave *= recover_scaler

        # Write out enhanced wav.
        enhanced_audio_filename = "{}.enh.wav".format(audio_name)
        create_directory(enhanced_audio_dir)
        audio_utils.write_audio(
            os.path.join(enhanced_audio_dir, enhanced_audio_filename),
            recovered_wave, fs)

    print()
    print("Inference time: {}".format(time.time() - begin_time))
    print()
示例#5
0
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    The latest TensorFlow checkpoint found in
    ``<workspace>/models/<model_name>`` is restored and applied to every
    feature file in ``<workspace>/features/spectrogram/<tr_enh>/<dir_name>``.
    Enhanced audio goes to ``<workspace>/enh_wavs/test/<dir_name>``.

    Args:
      args: object providing
        workspace: str, path of workspace.
        n_concat: int, number of frames to concatenate, should equal to
            n_concat in the training stage.
        dir_name: str, sub-folder of the features to enhance; also used as
            the output sub-folder.
        model_name: str, sub-folder of the model checkpoints.
        tr_enh: str, feature folder that contains ``dir_name``.
        visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    n_concat = args.n_concat
    dir_name = args.dir_name
    model_name = args.model_name
    tr_enh = args.tr_enh

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Number of frequency bins, used for the model shapes and the reshape.
    n_freq = int(n_window / 2 + 1)
    # Integer division: the pad width must stay an int under Python 3 too.
    n_pad = (n_concat - 1) // 2
    # Scaler for compensate the amplitude change after spectrogram and IFFT.
    recover_scaler = np.sqrt((np.hamming(n_window) ** 2).sum())

    # Load model.
    model_dir = os.path.join(workspace, "models", model_name)
    with tf.Session() as sess:

        model = DNN(sess, 0.0, 1, (n_concat, n_freq), n_freq)
        model.build()
        saver = tf.train.Saver()

        ckpt = tf.train.latest_checkpoint(model_dir)
        saver.restore(sess, ckpt)

        # Load scaler.
        # NOTE(review): pickle must only be used on trusted workspace files.
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "REVERB_tr_cut", "scaler.p")
        with open(scaler_path, 'rb') as scaler_file:
            scaler = pickle.load(scaler_file)

        # Load test data.
        feat_dir = os.path.join(workspace, "features", "spectrogram", tr_enh,
                                dir_name)
        names = os.listdir(feat_dir)

        for (cnt, feat_name) in enumerate(names):
            # Load feature. The audio name stored in the file is used for
            # output.
            feat_path = os.path.join(feat_dir, feat_name)
            with open(feat_path, 'rb') as feat_file:
                data = pickle.load(feat_file)
            [mixed_cmplx_x, speech_x, na] = data
            mixed_x = np.abs(mixed_cmplx_x)

            # Process data.
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)

            # Scale data.
            if scale:
                mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.scale_on_2d(speech_x, scaler)

            # Cut input spectrogram to 3D segments with n_concat.
            mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

            # Predict. sess.run returns a one-element list here, so the
            # result is flattened to (frames, n_freq).
            pred = sess.run([model.enhanced_outputs],
                            feed_dict={model.x_noisy: mixed_x_3d})
            pred = np.reshape(pred, (-1, n_freq))
            print(cnt, na)

            # Inverse scale.
            if scale:
                mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
                pred = pp_data.inverse_scale_on_2d(pred, scaler)

            # Debug plot.
            if args.visualize:
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].matshow(speech_x.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[2].matshow(pred.T,
                               origin='lower',
                               aspect='auto',
                               cmap='jet')
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for ax in axs:
                    ax.xaxis.tick_bottom()
                plt.tight_layout()
                plt.show()

            # Recover enhanced wav: back from the log domain, then inverse
            # STFT using the phase of the mixture.
            pred_sp = np.exp(pred)
            s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            s *= recover_scaler

            # Write out enhanced wav.
            out_path = os.path.join(workspace, "enh_wavs", "test", dir_name,
                                    "%s.enh.wav" % na)
            pp_data.create_folder(os.path.dirname(out_path))
            pp_data.write_audio(out_path, s, fs)
    # NOTE(review): this is the tail of a separate snippet whose beginning is
    # missing. `test_spec_norm`, `padding`, `model`, `scaler_label`,
    # `mixed_complx_x`, `n_overlap` and `n_window` are defined in the part
    # that is not visible here -- confirm against the original source.

    # Append the padding rows to the normalised test spectrogram and convert
    # the result to a float tensor for the model.
    test_padding = np.concatenate((test_spec_norm, padding))
    # print(test_padding.shape)
    test_padding_var = torch.from_numpy(test_padding).type(torch.FloatTensor)
    # Time a single forward pass of the model.
    start = timeit.default_timer()
    predictions = model(test_padding_var)
    print(predictions)
    stop = timeit.default_timer()
    print('Time: ', stop - start)
    # print(predictions.shape)
    # Keep only the rows that belong to the unpadded input.
    pre_removed = predictions[:test_spec_norm.shape[0]]
    #  print(pre_removed.shape)
    # Undo the scaling on a detached CPU numpy copy of the prediction.
    pre_inversed = inverse_scale_on_2d(pre_removed.cpu().detach().numpy(),
                                       scaler_label)
    #   print(np.exp(pre_inversed))
    # Exponentiate, rebuild the waveform and apply the Hamming-window
    # amplitude compensation before writing it out.
    pre_inversed = np.exp(pre_inversed)
    s = recover_wav(pre_inversed, mixed_complx_x, n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(n_window)**2).sum())
    write_audio("gpu_enh.wav", s, 16000)
    # Same post-processing for the output read from the text file
    # 'fpga_output', written to a second wav file.
    pre_fpga = np.loadtxt('fpga_output')
    print(pre_fpga)
    pre_fpga_inversed = inverse_scale_on_2d(pre_fpga, scaler_label)
    pre_fpga_inversed = np.exp(pre_fpga_inversed)
    s_fpga = recover_wav(pre_fpga_inversed, mixed_complx_x, n_overlap,
                         np.hamming)
    s_fpga *= np.sqrt((np.hamming(n_window)**2).sum())
    write_audio("fpga_enh.wav", s_fpga, 16000)
    print(pre_fpga.shape)
# pre_inversed = pre_inversed.T

#plt.matshow(pre_inversed)
#plt.savefig('test.png')
示例#7
0
def predict_folder(input_file_folder: object, output_file_folder: object) -> object:
    """Enhance every ``mix*`` audio file of a folder and write the results.

    Files are processed in sorted name order. Each enhanced signal is written
    to ``output_file_folder`` as ``enh_<original name>``.

    Args:
      input_file_folder: folder that contains the mixture audio files.
      output_file_folder: folder the enhanced audio is written to; it is
          created when the first file is written.

    Returns:
      Tuple ``(mixed_all, pred_all)``: the complex spectrogram of every
      mixture, and for every mixture the result of
      ``real_to_complex(pred, mixed_complex)`` for the inverse-scaled
      prediction.

    NOTE(review): ``visualize_plot`` is read from an enclosing scope that is
    not visible here -- presumably a module-level switch.
    """
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir, "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler.
    # NOTE(review): pickle must only be used on trusted feature files.
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)

    # Only mixture files are enhanced.
    names = [f for f in sorted(os.listdir(input_file_folder)) if f.startswith("mix")]

    # Integer division: true division would hand a float pad width to
    # pad_with_border under Python 3.
    n_pad = (conf1.n_concat - 1) // 2
    # Scaler for compensate the amplitude change after spectrogram and IFFT.
    recover_scaler = np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

    mixed_all = []
    pred_all = []
    for (cnt, na) in enumerate(names):
        # Load audio and compute its complex spectrogram.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')

        mixed_x = np.abs(mixed_complex)

        # Process data.
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)

        # Scale data.
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))

        # Recover enhanced wav: back from the log domain, then inverse STFT
        # using the phase of the mixture.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        s *= recover_scaler

        # Write out enhanced wav.
        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all