Example #1
def main_denoising(wav_dir, out_dir, gpu_id, truncate_minutes):

    if not os.path.exists(wav_dir):
        raise RuntimeError("Cannot locate the input directory!")

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # print "Since the some clips in DHAHRD are long, it's better to split the long sentences to several sub-clips, in case of causing GPU memory problem during LSTM inference.\n "

    # loading global MVN statistics
    glo_mean_var = sio.loadmat('./model/global_mvn_stats.mat')
    mean = glo_mean_var['global_mean']
    var = glo_mean_var['global_var']

    wav_files = [os.path.join(wav_dir, line) for line in os.listdir(wav_dir)]

    # feature_extraction
    for wav in wav_files:
        if wav.endswith('.wav'):
            rate, wav_data = wav_io.read(wav)
            sample_length = wav_data.size

            # apply peak-normalization first.
            wav_data = utils.peak_normalization(wav_data)

            chunk_length = int(truncate_minutes * rate * 60)
            total_chunks = int(
                math.ceil(float(sample_length) / float(chunk_length)))
            se_data_total = np.array([], dtype=np.int16)
            for i in range(1, total_chunks + 1):
                # Slice out the i-th chunk; NumPy clips the final slice
                # automatically when it runs past the last sample.
                bi = (i - 1) * chunk_length  # Index of first sample.
                ei = i * chunk_length  # Index of last sample + 1.
                temp = wav_data[bi:ei]
                print("Currently processing wav: %s, segment: %d/%d." %
                      (wav, i, total_chunks))

                # Skip enhancement if the chunk is shorter than half a frame.
                if temp.shape[0] < 256:
                    data_se = temp  # Leave this chunk unprocessed.
                    se_data_total = np.append(se_data_total,
                                              data_se.astype(np.int16))
                    continue

                # Temporary files used during processing.
                noisy_normed_lps = 'temp_normed.lps'
                enhanced_wav = 'temp_se.wav'

                # Extract LPS features from the waveform.
                noisy_htkdata = utils.wav2logspec(temp, window=np.hamming(512))

                # Do MVN before decoding
                normed_noisy = (noisy_htkdata - mean) / var
                utils.writeHtk(noisy_normed_lps,
                               normed_noisy,
                               sampPeriod=160000,
                               parmKind=9)

                # Write the decoding script file in the format CNTK expects.
                cntk_len = noisy_htkdata.shape[0] - 1
                with open('./test_normed.scp', 'w') as flist:
                    flist.write('test.normedlsp=temp_normed.lps[0,%d]\n' %
                                cntk_len)

                # Start CNTK model-decoding
                os.system('python decode_model.py  %d ' % (gpu_id))

                # Read decoded data
                SE_mat = sio.loadmat(
                    'enhanced_norm_fea_mat/test.normedlsp.mat')
                IRM = SE_mat['SE']
                # Directly mask the original feature
                masked_lps = noisy_htkdata + np.log(IRM)

                wave_recon = utils.logspec2wav(masked_lps,
                                               temp,
                                               window=np.hamming(512),
                                               nperseg=512,
                                               noverlap=256)
                wav_io.write(enhanced_wav, 16000, np.asarray(wave_recon))

                # Back to the time domain.
                rate, data_se = wav_io.read(enhanced_wav)
                se_data_total = np.append(se_data_total, data_se)

            output_wav = os.path.join(out_dir, os.path.basename(wav))
            wav_io.write(output_wav, 16000, np.asarray(se_data_total))
            print("Processing wav: %s, done ." % (wav))
Example #2
def denoise_wav(src_wav_file, dest_wav_file, global_mean, global_var, use_gpu,
                gpu_id, truncate_minutes):
    """Apply speech enhancement to audio in WAV file.

    Parameters
    ----------
    src_wav_file : str
        Path to WAV file to denoise.

    dest_wav_file : str
        Output path for denoised WAV.

    global_mean : ndarray, (n_feats,)
        Global mean for LPS features. Used for CMVN.

    global_var : ndarray, (n_feats,)
        Global variances for LPS features. Used for CMVN.

    use_gpu : bool, optional
        If True and GPU is available, perform all processing on GPU.
        (Default: True)

    gpu_id : int, optional
         Id of GPU on which to do computation.
         (Default: 0)

    truncate_minutes : float
        Maximum duration in minutes to process at a time. The enhancement
        will be done on chunks of audio no greater than ``truncate_minutes``
        minutes in duration.
    """
    # Read noisy audio WAV file. As scipy.io.wavfile.read is FAR faster than
    # librosa.load, we use the former.
    rate, wav_data = wav_io.read(src_wav_file)

    # Apply peak-normalization.
    wav_data = utils.peak_normalization(wav_data)

    # Perform denoising in chunks of size chunk_length samples.
    chunk_length = int(truncate_minutes * rate * 60)
    total_chunks = int(math.ceil(wav_data.size / chunk_length))
    data_se = []  # Will hold enhanced audio data for each chunk.
    for i in range(1, total_chunks + 1):
        tmp_dir = tempfile.mkdtemp()
        try:
            # Get samples for this chunk.
            bi = (i - 1) * chunk_length  # Index of first sample of this chunk.
            ei = bi + chunk_length  # Index of last sample of this chunk + 1.
            temp = wav_data[bi:ei]
            print('Processing file: %s, segment: %d/%d.' %
                  (src_wav_file, i, total_chunks))

            # Skip denoising if chunk is too short.
            if temp.shape[0] < WL2:
                data_se.append(temp)
                continue

            # Determine paths to the temporary files to be created.
            noisy_normed_lps_fn = os.path.join(tmp_dir, 'noisy_normed_lps.htk')
            noisy_normed_lps_scp_fn = os.path.join(tmp_dir,
                                                   'noisy_normed_lps.scp')
            irm_fn = os.path.join(tmp_dir, 'irm.mat')

            # Extract LPS features from waveform.
            noisy_htkdata = utils.wav2logspec(temp, window=np.hamming(WL))

            # Do MVN before decoding.
            normed_noisy = (noisy_htkdata - global_mean) / global_var

            # Write features to HTK binary format making sure to also
            # create a script file.
            utils.write_htk(noisy_normed_lps_fn,
                            normed_noisy,
                            samp_period=SR,
                            parm_kind=9)
            cntk_len = noisy_htkdata.shape[0] - 1
            with open(noisy_normed_lps_scp_fn, 'w') as f:
                f.write('irm=%s[0,%d]\n' % (noisy_normed_lps_fn, cntk_len))

            # Apply the CNTK model to estimate the ideal ratio mask (IRM),
            # which is written to the temp directory as irm.mat.
            decode_model(noisy_normed_lps_scp_fn, tmp_dir, NFREQS, use_gpu,
                         gpu_id)

            # Read in IRM and directly mask the original LPS features.
            irm = sio.loadmat(irm_fn)['IRM']
            masked_lps = noisy_htkdata + np.log(irm)

            # Reconstruct audio.
            wave_recon = utils.logspec2wav(masked_lps,
                                           temp,
                                           window=np.hamming(WL),
                                           n_per_seg=WL,
                                           noverlap=WL2)
            data_se.append(wave_recon)
        finally:
            shutil.rmtree(tmp_dir)
    data_se = np.concatenate(data_se)
    wav_io.write(dest_wav_file, SR, data_se)
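
This variant takes the global CMVN statistics as arrays instead of loading them itself. A minimal wiring sketch, assuming the same .mat layout (keys global_mean and global_var) that Example #1 loads; the stats path and WAV file names are placeholders:

import scipy.io as sio

# Load the global CMVN statistics (keys as in Example #1; path is a placeholder).
stats = sio.loadmat('./model/global_mvn_stats.mat')

denoise_wav(src_wav_file='noisy.wav',
            dest_wav_file='enhanced.wav',
            global_mean=stats['global_mean'],
            global_var=stats['global_var'],
            use_gpu=True,
            gpu_id=0,
            truncate_minutes=10)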
Example #3
def denoise_wav(src_wav_file, dest_wav_file, global_mean, global_var, use_gpu,
                truncate_minutes, mode, stage_select):
    """Apply speech enhancement to audio in WAV file.

    Parameters
    ----------
    src_wav_file : str
        Path to WAV file to denoise.

    dest_wav_file : str
        Output path for denoised WAV.

    global_mean : ndarray, (n_feats,)
        Global mean for LPS features. Used for CMVN.

    global_var : ndarray, (n_feats,)
        Global variances for LPS features. Used for CMVN.

    use_gpu : bool, optional
        If True and GPU is available, perform all processing on GPU.
        (Default: True)

    truncate_minutes : float
        Maximum duration in minutes to process at a time. The enhancement
        will be done on chunks of audio no greater than ``truncate_minutes``
        minutes in duration.

    mode : int
        Output selection: 1 uses the estimated IRM (more conservative), 2 the
        estimated LPS (more aggressive), 3 an equal-weight fusion of both.

    stage_select : int
        Index (1-based) of the network stage whose outputs are used.
    """
    # Read noisy audio WAV file. As scipy.io.wavfile.read is FAR faster than
    # librosa.load, we use the former.
    rate, wav_data = wav_io.read(src_wav_file)

    if mode == 1:
        print(
            "###Selecting the estimated ideal-ratio-masks in mode 1 (more conservative).###"
        )
    elif mode == 2:
        print(
            "###Selecting the estimated log-power-spec features in mode 2 (more agressive).###"
        )
    elif mode == 3:
        print(
            "###Selecting both estimated IRM and LPS outputs with equal weights in mode 3 (trade-off).###"
        )

    # Apply peak-normalization.
    wav_data = utils.peak_normalization(wav_data)

    # Perform denoising in chunks of size chunk_length samples.
    chunk_length = int(truncate_minutes * rate * 60)
    total_chunks = int(math.ceil(wav_data.size / chunk_length))
    data_se = []  # Will hold enhanced audio data for each chunk.

    model_pth = os.path.join(HERE, '1000h_se.pth')
    if not os.path.exists(model_pth):
        # Fetch the pre-trained model from the shared location.
        cmd = 'cp {} {}'.format(
            '/export/fs01/jsalt19/leisun/speech_enhancement/speech_denoising_pytorch/model/1000h_se.pth',
            model_pth)
        os.system(cmd)

    nnet = LSTM_SE_PL_Dense_MTL(257, 7, 1024, 3, 257, 'false')
    nnet.load_state_dict(torch.load(model_pth))
    nnet = nnet.cuda()
    nnet.eval()

    for i in range(1, total_chunks + 1):
        # Get samples for this chunk.
        bi = (i - 1) * chunk_length  # Index of first sample of this chunk.
        ei = bi + chunk_length  # Index of last sample of this chunk + 1.
        temp = wav_data[bi:ei]
        print('Processing file: %s, segment: %d/%d.' %
              (src_wav_file, i, total_chunks))

        # Skip denoising if chunk is too short.
        if temp.shape[0] < WL2:
            data_se.append(temp)
            continue

        # Extract LPS features from waveform.
        noisy_htkdata = utils.wav2logspec(temp, window=np.hamming(WL))
        # Expand frames with left/right context before the network input.
        noisy_htkdata_expand = utils.expand_frames(noisy_htkdata, [3, 3])

        inputs = torch.from_numpy(
            (noisy_htkdata_expand - global_mean) / global_var)
        lps_outputs, irm_outputs = nnet(
            torch.unsqueeze(inputs, 1).cuda().float())

        if mode == 1:
            print(" Use the estimated IRM.")
            irm = torch.squeeze(
                irm_outputs[stage_select - 1]).cpu().data.numpy()
            recovered_lps = noisy_htkdata + np.log(irm)
        elif mode == 2:
            print(" Use the estimated LPS.")
            lps = torch.squeeze(
                lps_outputs[stage_select - 1]).cpu().data.numpy()
            recovered_lps = lps * global_var[:257] + global_mean[:257]
        elif mode == 3:
            print(" Use the fusion of estimated LPS and IRM.")
            irm = torch.squeeze(
                irm_outputs[stage_select - 1]).cpu().data.numpy()
            lps = torch.squeeze(
                lps_outputs[stage_select - 1]).cpu().data.numpy()
            recovered_lps = 0.5 * (noisy_htkdata + np.log(irm)) + 0.5 * (
                lps * global_var[:257] + global_mean[:257])

        # Reconstruct audio.
        wave_recon = utils.logspec2wav(recovered_lps,
                                       temp,
                                       window=np.hamming(WL),
                                       n_per_seg=WL,
                                       noverlap=WL2)
        data_se.append(wave_recon)

    data_se = [x.astype(np.int16, copy=False) for x in data_se]
    data_se = np.concatenate(data_se)
    wav_io.write(dest_wav_file, SR, data_se)
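
The utils.expand_frames(noisy_htkdata, [3, 3]) call splices each 257-bin LPS frame with 3 left and 3 right context frames, which matches the LSTM_SE_PL_Dense_MTL(257, 7, ...) constructor (7 spliced frames of 257 bins each). The real utils.expand_frames is not shown here, so the following is only a sketch of that kind of context splicing, with boundary frames repeated as padding:

import numpy as np

def expand_frames_sketch(feats, context=(3, 3)):
    # Splice each frame with `context[0]` left and `context[1]` right
    # neighbors; boundary frames are repeated via edge padding. This is a
    # guess at what utils.expand_frames does, not its actual implementation.
    left, right = context
    padded = np.pad(feats, ((left, right), (0, 0)), mode='edge')
    n = len(feats)
    return np.hstack([padded[i:i + n] for i in range(left + right + 1)])

# 100 frames of 257-dim LPS -> (100, 1799) spliced network input.
spliced = expand_frames_sketch(np.zeros((100, 257)), (3, 3))
assert spliced.shape == (100, 257 * 7)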
Example #4
def denoise_wav(src_wav_file, dest_wav_file, global_mean, global_var, use_gpu,
                gpu_id, truncate_minutes, mode, model_select, stage_select):
    """Apply speech enhancement to audio in WAV file.

    Parameters
    ----------
    src_wav_file : str
        Path to WAV file to denoise.

    dest_wav_file : str
        Output path for denoised WAV.

    global_mean : ndarray, (n_feats,)
        Global mean for LPS features. Used for CMVN.

    global_var : ndarray, (n_feats,)
        Global variances for LPS features. Used for CMVN.

    use_gpu : bool, optional
        If True and GPU is available, perform all processing on GPU.
        (Default: True)

    gpu_id : int, optional
         Id of GPU on which to do computation.
         (Default: 0)

    truncate_minutes : float
        Maximum duration in minutes to process at a time. The enhancement
        will be done on chunks of audio no greater than ``truncate_minutes``
        minutes in duration.

    mode : int
        Output selection: 1 uses the estimated IRM (more conservative), 2 the
        estimated LPS (more aggressive), 3 an equal-weight fusion of both.

    model_select : str
        Which pre-trained model to use: '400h' or '1000h'.

    stage_select : int
        Index (1-based) of the network stage whose outputs are used.
    """
    # Read noisy audio WAV file. As scipy.io.wavfile.read is FAR faster than
    # librosa.load, we use the former.
    rate, wav_data = wav_io.read(src_wav_file)

    if mode == 1:
        print(
            "###Selecting the estimated ideal-ratio-masks in mode 1 (more conservative).###"
        )
    elif mode == 2:
        print(
            "###Selecting the estimated log-power-spec features in mode 2 (more agressive).###"
        )
    elif mode == 3:
        print(
            "###Selecting both estimated IRM and LPS outputs with equal weights in mode 3 (trade-off).###"
        )

    print("Using the pre-trained {} speech enhancement model.".format(
        model_select))

    # Apply peak-normalization.
    wav_data = utils.peak_normalization(wav_data)

    # Perform denoising in chunks of size chunk_length samples.
    chunk_length = int(truncate_minutes * rate * 60)
    total_chunks = int(math.ceil(wav_data.size / chunk_length))
    data_se = []  # Will hold enhanced audio data for each chunk.
    for i in range(1, total_chunks + 1):
        tmp_dir = tempfile.mkdtemp()
        try:
            # Get samples for this chunk.
            bi = (i - 1) * chunk_length  # Index of first sample of this chunk.
            ei = bi + chunk_length  # Index of last sample of this chunk + 1.
            temp = wav_data[bi:ei]
            print('Processing file: %s, segment: %d/%d.' %
                  (src_wav_file, i, total_chunks))

            # Skip denoising if chunk is too short.
            if temp.shape[0] < WL2:
                data_se.append(temp)
                continue

            # Determine paths to the temporary files to be created.
            noisy_normed_lps_fn = os.path.join(tmp_dir, 'noisy_normed_lps.htk')
            noisy_normed_lps_scp_fn = os.path.join(tmp_dir,
                                                   'noisy_normed_lps.scp')
            outputs_fn = os.path.join(tmp_dir, 'irm.mat')

            # Extract LPS features from waveform.
            noisy_htkdata = utils.wav2logspec(temp, window=np.hamming(WL))

            # Do MVN before decoding.
            normed_noisy = (noisy_htkdata - global_mean) / global_var

            # Write features to HTK binary format making sure to also
            # create a script file.

            if model_select.lower() == '400h':
                utils.write_htk(noisy_normed_lps_fn,
                                normed_noisy,
                                samp_period=SR,
                                parm_kind=9)
            elif model_select.lower() == '1000h':
                # The 1000h model applies MVN internally, so write the raw
                # (un-normalized) LPS features.
                utils.write_htk(noisy_normed_lps_fn,
                                noisy_htkdata,
                                samp_period=SR,
                                parm_kind=9)

            cntk_len = noisy_htkdata.shape[0] - 1
            with open(noisy_normed_lps_scp_fn, 'w') as f:
                f.write('irm=%s[0,%d]\n' % (noisy_normed_lps_fn, cntk_len))

            # Apply the CNTK model to estimate the ideal ratio mask (IRM) and
            # enhanced LPS, which are written to the temp directory as
            # irm.mat. To avoid a memory leak, run the decoding in a separate
            # process and join it.

            p = Process(target=decode_model,
                        args=(noisy_normed_lps_scp_fn, tmp_dir, NFREQS,
                              use_gpu, gpu_id, mode, model_select,
                              stage_select))
            p.start()
            p.join()
            if p.exception:  # Assumes a Process subclass that records exceptions.
                e, tb = p.exception
                raise type(e)(tb)

            # Read in the estimated IRM and LPS, then restore the LPS
            # features according to the selected mode.
            outputs = sio.loadmat(outputs_fn)
            irm = outputs['IRM']
            lps = outputs['LPS']

            if mode == 1:
                recovered_lps = noisy_htkdata + np.log(irm)
            elif mode == 2:
                recovered_lps = (lps * global_var) + global_mean
            elif mode == 3:
                recovered_lps = 0.5 * (noisy_htkdata + np.log(irm)) + 0.5 * (
                    (lps * global_var) + global_mean)

            # Reconstruct audio.
            wave_recon = utils.logspec2wav(recovered_lps,
                                           temp,
                                           window=np.hamming(WL),
                                           n_per_seg=WL,
                                           noverlap=WL2)
            data_se.append(wave_recon)
        finally:
            shutil.rmtree(tmp_dir)
    data_se = [x.astype(np.int16, copy=False) for x in data_se]
    data_se = np.concatenate(data_se)
    wav_io.write(dest_wav_file, SR, data_se)
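
Independent of the CNTK decoding, the three modes reduce to simple array arithmetic on the noisy LPS, the estimated IRM, and the estimated (normalized) LPS. A toy sketch of that arithmetic; every array below is a fabricated stand-in with the right shape only:

import numpy as np

noisy_lps = np.random.randn(10, 257)                # Noisy log-power spectrum.
irm = np.clip(np.random.rand(10, 257), 1e-3, 1.0)   # Estimated ideal ratio mask.
lps = np.random.randn(10, 257)                      # Estimated (normalized) LPS.
global_mean, global_var = np.zeros(257), np.ones(257)

# Mode 1 (conservative): attenuate the noisy spectrum by the mask.
mode1 = noisy_lps + np.log(irm)
# Mode 2 (aggressive): trust the network's LPS estimate, de-normalized.
mode2 = lps * global_var + global_mean
# Mode 3 (trade-off): equal-weight fusion of the two.
mode3 = 0.5 * mode1 + 0.5 * mode2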