Пример #1
0
def main(argv):
    fbank_mx = features.mel_fbank_mx(winlen_nfft=WINDOWSIZE / SOURCERATE,
                                     fs=fs,
                                     NUMCHANS=NUMCHANS,
                                     LOFREQ=LOFREQ,
                                     HIFREQ=HIFREQ)

    scp_list = sys.argv[1]
    vad_dir = sys.argv[2]
    wav_dir = sys.argv[3]
    ubm_file = sys.argv[4]
    v_file = sys.argv[5]
    out_dir = sys.argv[6]

    print 'Loading UBM from', ubm_file
    ubm_weights, ubm_means, ubm_covs = load_ubm(ubm_file)
    GMM = gmm.gmm_eval_prep(ubm_weights, ubm_means, ubm_covs)

    numG = ubm_means.shape[0]
    dimF = ubm_means.shape[1]

    # normalization of statistics - precomputing matrices
    if ubm_covs.shape[1] == dimF:
        ubm_norm = 1 / np.sqrt(ubm_covs);

    print 'Loading T matrix from ', v_file, '...'
    v = np.loadtxt(v_file, dtype=np.float32)

    print 'Computing MVVT ...'
    MVVT = iv.compute_VtV(v, numG)

    print 'Loading list of files to process from ' + scp_list
    seg_list = np.atleast_1d(np.loadtxt(scp_list, dtype=object))

    # extract all sub-dir names
    for dir in set(map(os.path.dirname, seg_list)):
        mkdir_p(out_dir + '/' + dir)

    # go over the scp and process the audio files
    for ii, fn in enumerate(seg_list, 1):
        try:
            print 'Processing ', ii, '/', len(seg_list), fn
            np.random.seed(777)

            wav_file = wav_dir + '/' + fn + '.wav'
            raw_file = wav_dir + '/' + fn + '.raw'
            lab_file = vad_dir + '/' + fn + '.lab.gz'
            ivec_out_file = out_dir + '/' + fn + '.ivec'

            if os.path.isfile(wav_file):
                print '  Reading wave file from ' + wav_file,
                rate, sig = spiowav.read(wav_file)

                if rate != 8000:
                    raise Exception(
                        'The input file ' + wav_file + ' is expected to be in 8000 Hz sampling rate, but ' + repr(
                            rate) + ' Hz detected')

            else:
                print '  Reading raw 8000Hz, 16bit-s, 1c,  file from ' + raw_file,
                sig = np.fromfile(raw_file, dtype='int16')

            print '[t=' + repr(len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr(len(sig)) + ' samples]'

            if ADDDITHER > 0.0:
                print '  Adding dither'
                sig = features.add_dither(sig, ADDDITHER)

            print '  Extracting features',
            fea = features.mfcc_htk(sig,
                                    window=WINDOWSIZE / SOURCERATE,
                                    noverlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE,
                                    fbank_mx=fbank_mx,
                                    _0='first',
                                    NUMCEPS=NUMCEPS,
                                    RAWENERGY=RAWENERGY,
                                    PREEMCOEF=PREEMCOEF,
                                    CEPLIFTER=CEPLIFTER,
                                    ZMEANSOURCE=ZMEANSOURCE,
                                    ENORMALISE=ENORMALISE,
                                    ESCALE=0.1,
                                    SILFLOOR=50.0,
                                    USEHAMMING=True)

            print '[n=' + repr(len(fea)) + ' frames]'

            print '  Adding derivatives'
            # [add_deriv] step
            fea = features.add_deriv(fea, (deltawindow, accwindow))

            print '  Reshaping to SFeaCat convention'
            # [reshape] step
            fea = fea.reshape(fea.shape[0], 3, -1).transpose((0, 2, 1)).reshape(fea.shape[0],
                                                                                -1)  # re-order coeffs like SFeaCut

            if vad_dir == "auto":
                print '  Computing VAD '
                vad, n_regions, n_frames = compute_vad(sig, win_length=WINDOWSIZE / SOURCERATE,
                                                       win_overlap=(WINDOWSIZE - TARGETRATE) / SOURCERATE)[:len(fea)]
            else:
                print '  Loading VAD definition from ' + lab_file
                vad, n_regions, n_frames = load_vad_lab_as_bool_vec(lab_file)[:len(fea)]

            print '  Applying VAD [#frames=' + repr(n_frames) + ', #regions=' + repr(n_regions) + ']'
            fea = fea[vad, ...]

            if len(fea) < 3:
                raise NoVadException('Too few frames left: ' + str(len(fea)))

            print '  Applying floating CMVN'
            fea = features.cmvn_floating(fea, cmvn_lc, cmvn_rc, unbiased=True)

            n_data, d_data = fea.shape

            l = 0;
            lc = 0
            n = np.zeros((numG), dtype=np.float32)
            f = np.zeros((numG, dimF), dtype=np.float32)

            print '  Computing stats ...',
            # Note that we compute the stats in in sub-chunks due to memory optimization
            #
            seq_data = split_seq(range(n_data), 1000)
            for i in range(len(seq_data)):
                dd = fea[seq_data[i], :]
                l1, n1, f1 = gmm.gmm_eval(dd, GMM, return_accums=1)
                l = l + l1.sum()
                lc = lc + l1.shape[0]
                n = n + n1;
                f = f + f1;

            print '[avg llh=' + repr(l / lc) + ', #frames=' + repr(n_data) + ']'

            n, f = normalize_stats(n, f, ubm_means, ubm_norm)

            f = row(f.astype(v.dtype))
            n = row(n.astype(v.dtype))

            print '  Computing i-vector'
            w = iv.estimate_i(n, f, v, MVVT).T

            # write it to the disk
            print '  Saving ivec to:', ivec_out_file
            # np.savetxt(ivec_out_file, w.ravel(), newline=' ', fmt='%f')
            ivio.write_binary_ivector(ivec_out_file, w.ravel(), n_data / 100.0)

        except NoVadException as e:
            print e
            print "Warning: No features generated for segment: " + fn

        except:
            raise
Пример #2
0
                raise Exception(
                    'The input file ' + wav_file +
                    ' is expected to be in 8000 Hz sampling rate, but ' +
                    repr(rate) + ' Hz detected')

        else:
            print '  Reading raw 8000Hz, 16bit-s, 1c,  file from ' + raw_file,
            sig = np.fromfile(raw_file, dtype='int16')

        print '[t=' + repr(
            len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr(
                len(sig)) + ' samples]'

        if ADDDITHER > 0.0:
            print '  Adding dither'
            sig = features.add_dither(sig, ADDDITHER)

        print '  Extracting features',
        fea = features.mfcc_htk(sig,
                                window=WINDOWSIZE / SOURCERATE,
                                noverlap=(WINDOWSIZE - TARGETRATE) /
                                SOURCERATE,
                                fbank_mx=fbank_mx,
                                _0='first',
                                NUMCEPS=NUMCEPS,
                                RAWENERGY=RAWENERGY,
                                PREEMCOEF=PREEMCOEF,
                                CEPLIFTER=CEPLIFTER,
                                ZMEANSOURCE=ZMEANSOURCE,
                                ENORMALISE=ENORMALISE,
                                ESCALE=0.1,
Пример #3
0
                            fbank_mx = features.mel_fbank_mx(winlen,
                                                             samplerate,
                                                             NUMCHANS=64,
                                                             LOFREQ=20.0,
                                                             HIFREQ=7600,
                                                             htk_bug=False)
                        else:
                            raise ValueError(
                                f'Only 8kHz and 16kHz are supported. Got {samplerate} instead.'
                            )

                        LC = 150
                        RC = 149

                        np.random.seed(3)  # for reproducibility
                        signal = features.add_dither(
                            (signal * 2**15).astype(int))

                        for segnum in range(len(labs)):
                            seg = signal[labs[segnum, 0]:labs[segnum, 1]]
                            if seg.shape[
                                    0] > 0.01 * samplerate:  # process segment only if longer than 0.01s
                                # Mirror noverlap//2 initial and final samples
                                seg = np.r_[seg[noverlap // 2 - 1::-1], seg,
                                            seg[-1:-winlen // 2 - 1:-1]]
                                fea = features.fbank_htk(seg,
                                                         window,
                                                         noverlap,
                                                         fbank_mx,
                                                         USEPOWER=True,
                                                         ZMEANSOURCE=True)
                                fea = features.cmvn_floating_kaldi(
Пример #4
0
    def process_wav(self, wav_file, mode="ivector", vad_dir="auto"):
        if mode not in ["ivector", "statistics", "mfcc"]:
            return False

        else:
            # all constans are initialized in __init__() method
            # READ WAVE AND COMPUTE IVECTOR
            sig, rate = librosa.load(wav_file)
            #print(librosa.get_duration(sig, rate))
            # wav conversion
            sig, rate = self.wav_conversion(sig, rate)
            #import sounddevice as sd
            #sd.play(sig, rate)

            if rate != 8000:
                raise Exception(
                    'The input file ' + wav_file +
                    ' is expected to be in 8000 Hz sampling rate, but ' +
                    repr(rate) + ' Hz detected')

            # info about singnal printed
            print '[t=' + repr(
                len(sig) / fs) + ' seconds, fs=' + repr(fs) + 'Hz, n=' + repr(
                    len(sig)) + ' samples]'

            if ADDDITHER > 0.0:
                print '  Adding dither'
                sig = features.add_dither(sig, ADDDITHER)

            print '  Extracting features',
            fea = features.mfcc_htk(sig,
                                    window=WINDOWSIZE / SOURCERATE,
                                    noverlap=(WINDOWSIZE - TARGETRATE) /
                                    SOURCERATE,
                                    fbank_mx=fbank_mx,
                                    _0='first',
                                    NUMCEPS=NUMCEPS,
                                    RAWENERGY=RAWENERGY,
                                    PREEMCOEF=PREEMCOEF,
                                    CEPLIFTER=CEPLIFTER,
                                    ZMEANSOURCE=ZMEANSOURCE,
                                    ENORMALISE=ENORMALISE,
                                    ESCALE=0.1,
                                    SILFLOOR=50.0,
                                    USEHAMMING=True)

            print '[n=' + repr(len(fea)) + ' frames]'

            print '  Adding derivatives'
            # [add_deriv] step
            fea = features.add_deriv(fea, (deltawindow, accwindow))

            print '  Reshaping to SFeaCat convention'
            # [reshape] step
            fea = fea.reshape(fea.shape[0], 3, -1).transpose(
                (0, 2, 1)).reshape(fea.shape[0],
                                   -1)  # re-order coeffs like SFeaCut
            if vad_dir == "auto":
                print '  Computing VAD '
                vad, n_regions, n_frames = self.compute_vad(
                    sig,
                    win_length=WINDOWSIZE / SOURCERATE,
                    win_overlap=(WINDOWSIZE - TARGETRATE) /
                    SOURCERATE)[:len(fea)]

                print '  Applying VAD [#frames=' + repr(
                    n_frames) + ', #regions=' + repr(n_regions) + ']'
                fea = fea[0:len(vad), ...]
                fea = fea[vad, ...]

                if len(fea) < 3:
                    raise NoVadException('Too few frames left: ' +
                                         str(len(fea)))

                print '  Applying floating CMVN'
                fea = features.cmvn_floating(fea,
                                             cmvn_lc,
                                             cmvn_rc,
                                             unbiased=True)

                if mode == "mfcc":
                    return fea

                n_data, d_data = fea.shape

                l = 0
                lc = 0
                n = np.zeros((self.numG), dtype=np.float32)
                f = np.zeros((self.numG, self.dimF), dtype=np.float32)

                print '  Computing stats ...',
                # Note that we compute the stats in in sub-chunks due to memory optimization
                #
                seq_data = self.split_seq(range(n_data), 1000)
                for i in range(len(seq_data)):
                    dd = fea[seq_data[i], :]
                    l1, n1, f1 = gmm.gmm_eval(dd, self.GMM, return_accums=1)
                    l = l + l1.sum()
                    lc = lc + l1.shape[0]
                    n = n + n1
                    f = f + f1

                print '[avg llh=' + repr(
                    l / lc) + ', #frames=' + repr(n_data) + ']'

                n, f = self.normalize_stats(n, f, self.ubm_means,
                                            self.ubm_norm)

                f = self.row(f.astype(self.v.dtype))
                n = self.row(n.astype(self.v.dtype))

                if mode == "statistics":
                    return f, n

                print '  Computing i-vector'
                w = iv.estimate_i(n, f, self.v, self.MVVT).T

                print "IVECTOR", w

                if mode == "ivector":
                    return w
Пример #5
0
                                 fs,
                                 NUMCHANS=40,
                                 LOFREQ=20.0,
                                 HIFREQ=7600,
                                 htk_bug=False)
LC = 150
RC = 149

with open(out_seg_fn, "w") as seg_file:
    with open(out_ark_fn, "wb") as ark_file:
        for fn in file_names:
            labs = (
                np.loadtxt(in_lab_dir + "/" + fn + ".lab", usecols=(0, 1)) *
                16000).astype(int)
            signal, samplerate = sf.read(in_flac_dir + "/" + fn + ".flac")
            signal = features.add_dither(
                (signal * 2**(samplerate / 1000 - 1)).astype(int))
            for segnum in range(len(labs)):
                seg = signal[labs[segnum, 0]:labs[segnum, 1]]
                seg = np.r_[
                    seg[noverlap // 2 - 1::-1], seg,
                    seg[-1:-winlen // 2 -
                        1:-1]]  # Mirror noverlap//2 initial and final samples
                fea = features.fbank_htk(seg,
                                         window,
                                         noverlap,
                                         fbank_mx,
                                         USEPOWER=True,
                                         ZMEANSOURCE=True)
                fea = features.cmvn_floating_kaldi(fea,
                                                   LC,
                                                   RC,