Example #1
def extract_features(fname, bdir, sox, htk_mfc, mfc_extension, stereo_wav,
        gammatones, spectrograms, filterbanks):
#def extract_features(fname, bdir):
    if fname[-4:] != '.wav':
        return
    rawfname = bdir+'/'+fname[:-4]+'.rawaudio'
    wavfname = bdir+'/'+fname
    tempfname = bdir+'/'+fname[:-4]+'_temp.wav'
    # temp fname with .wav for sox
    mfccfname = bdir+'/'+fname[:-4]+mfc_extension
    if sox:
        shutil.move(wavfname, tempfname)
        call(['sox', tempfname, wavfname])
        #call(['sox', '-G', tempfname, '-r 16k', wavfname])
        # w/o headers, sox uses extension
        shutil.move(tempfname, rawfname)
    if htk_mfc:
        call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
    srate = 16000
    #srate, sound = wavfile.read(wavfname)
    sound, srate = readwav(wavfname)
    if stereo_wav and len(sound.shape) == 2: # in mono sound is a list
        sound = 0.5 * (sound[:, 0] + sound[:, 1])
        # for stereo wav, sum both channels
    if gammatones:
        gammatonefname = bdir+'/'+fname[:-4]+'_gamma.npy'
        tmp_snd = loadsound(wavfname)
        gamma_cf = erbspace(20*Hz, 20*kHz, N_GAMMATONES_FILTERS)
        gamma_fb = Gammatone(tmp_snd, gamma_cf)
        with open(gammatonefname, 'w') as o_f:
            npsave(o_f, gamma_fb.process())
    if spectrograms:
        powerspec, _, _, _ = specgram(
            sound,
            NFFT=int(srate * SPECGRAM_WINDOW),
            Fs=srate,
            noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
        specgramfname = bdir+'/'+fname[:-4]+'_specgram.npy'
        with open(specgramfname, 'w') as o_f:
            npsave(o_f, powerspec.T)
    if filterbanks:
        # convert to Mel filterbanks
        fbanks = Spectral(nfilt=N_FBANKS,      # nb of filters in mel bank
                     alpha=0.97,               # pre-emphasis
                     do_dct=False,             # we do not want MFCCs
                     compression='log',
                     fs=srate,                 # sampling rate
                     lowerf=50,                # lower frequency
                     frate=FBANKS_RATE,        # frame rate
                     wlen=FBANKS_WINDOW,       # window length
                     nfft=1024,                # length of dft
                     do_deltas=False,          # speed
                     do_deltasdeltas=False     # acceleration
                     )
        sound /= np.abs(sound).max(axis=0)  # TODO put that as option
        fbank = fbanks.transform(sound)
        fbanksfname = bdir+'/'+fname[:-4]+'_fbanks.npy'
        with open(fbanksfname, 'w') as o_f:
            npsave(o_f, fbank)
    # TODO wavelets scattergrams / scalograms
    print "dealt with file", wavfname
Example #2
def main(args, samplegraph):
    t1 = time.time()
    g = Graph()
    print("Reading...")
    g.read_edgelist(samplegraph)
    model = Spectral(graph=g, dim=args.representation_size)
    t2 = time.time()
    print(t2 - t1)
    print("Saving embeddings...")
    model.save_embeddings(args.output)
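A minimal invocation sketch for the function above (not from the source): the edge-list path, output path, and embedding dimension are illustrative assumptions; only the two attributes actually read from args are set.

# Hypothetical invocation -- paths and representation_size are assumed for illustration
from argparse import Namespace

args = Namespace(representation_size=128, output='spectral_embeddings.txt')
main(args, 'graph.edgelist')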
Example #3
def extract_features(df, label2ix, spec_kwargs, vad_kwargs,
                     stacksize=1, frate=100, return_y=False):
    if return_y:
        return_y = 'label' in df
    X = {}
    if return_y:
        y = {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)
    for ix, fname in enumerate(df.filename.unique()):
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs))
        spec = spectrum_encoder.transform(sig)
        spec = (spec - spec.mean(0)) / spec.std(0)
        if stacksize > 1:
            spec = roll_array(spec, stacksize)
        vad = vad_encoder.activations(sig)
        vad = vad.reshape(vad.shape[0], -1)
        if stacksize > 1:
            vad = roll_array(vad, stacksize)

        X_curr = []
        if return_y:
            y_curr = []

        rows_iter = df[df.filename == fname].iterrows()
        for _, row in rows_iter:
            start = row.start
            end = row.end

            start_fr = int(start * frate)
            end_fr = int(end * frate)

            feat = np.hstack(
                (spec[start_fr: end_fr],
                 vad[start_fr: end_fr])
            )
            X_curr.append(
                feat.astype(np.float32)
            )
            if return_y:
                y_curr.append(
                    np.ones(feat.shape[0], dtype=np.uint8) * \
                    label2ix[row.label]
                )
        X[fname] = np.vstack(X_curr)
        if return_y:
            y[fname] = np.hstack(y_curr)
    if return_y:
        return X, y
    else:
        return X
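A hedged usage sketch for extract_features above: the DataFrame columns follow the fields read in the loop (filename, start, end, label), while the wav path and the Spectral/VAD keyword names are assumptions for illustration.

# Hypothetical usage sketch -- file path and encoder kwargs are assumed, not from the source
import pandas as pd

df = pd.DataFrame([
    {'filename': 'utt1.wav', 'start': 0.0, 'end': 1.2, 'label': 'speech'},
    {'filename': 'utt1.wav', 'start': 1.2, 'end': 2.0, 'label': 'noise'},
])
label2ix = {'speech': 0, 'noise': 1}
spec_kwargs = dict(fs=16000)   # assumed: Spectral accepts at least fs
vad_kwargs = dict(fs=16000)    # assumed: VAD accepts at least fs
X, y = extract_features(df, label2ix, spec_kwargs, vad_kwargs,
                        stacksize=1, frate=100, return_y=True)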
Example #4
def extract_features(df,
                     label2ix,
                     spec_kwargs,
                     vad_kwargs,
                     stacksize=1,
                     frate=100,
                     return_y=False):
    if return_y:
        return_y = 'label' in df
    X = {}
    if return_y:
        y = {}
    spectrum_encoder = Spectral(**spec_kwargs)
    vad_encoder = VAD(**vad_kwargs)
    for ix, fname in enumerate(df.filename.unique()):
        sig, fs = wavread(fname)
        if fs != spec_kwargs['fs']:
            raise ValueError('expected samplerate {}, got {}'.format(
                spec_kwargs['fs'], fs))
        spec = spectrum_encoder.transform(sig)
        spec = (spec - spec.mean(0)) / spec.std(0)
        if stacksize > 1:
            spec = roll_array(spec, stacksize)
        vad = vad_encoder.activations(sig)
        vad = vad.reshape(vad.shape[0], -1)
        if stacksize > 1:
            vad = roll_array(vad, stacksize)

        X_curr = []
        if return_y:
            y_curr = []

        rows_iter = df[df.filename == fname].iterrows()
        for _, row in rows_iter:
            start = row.start
            end = row.end

            start_fr = int(start * frate)
            end_fr = int(end * frate)

            feat = np.hstack((spec[start_fr:end_fr], vad[start_fr:end_fr]))
            X_curr.append(feat.astype(np.float32))
            if return_y:
                y_curr.append(
                    np.ones(feat.shape[0], dtype=np.uint8) * \
                    label2ix[row.label]
                )
        X[fname] = np.vstack(X_curr)
        if return_y:
            y[fname] = np.hstack(y_curr)
    if return_y:
        return X, y
    else:
        return X
Example #5
def do_fbank(fname):
    srate, sound = wavfile.read(fname)
    fbanks = Spectral(nfilt=N_FBANKS,    # nb of filters in mel bank
                 alpha=0.97,             # pre-emphasis
                 do_dct=False,           # we do not want MFCCs
                 fs=srate,               # sampling rate
                 frate=FBANKS_RATE,      # frame rate
                 wlen=FBANKS_WINDOW,     # window length
                 nfft=1024,              # length of dft
                 do_deltas=False,       # speed
                 do_deltasdeltas=False  # acceleration
                 )
    fb = fbanks.transform(sound)
    return fb
Example #6
def do_fbank(fname):
    srate, sound = wavfile.read(fname)
    fbanks = Spectral(
        nfilt=40,               # nb of filters in mel bank
        alpha=0.97,             # pre-emphasis
        do_dct=False,           # we do not want MFCCs
        fs=srate,               # sampling rate
        frate=100,              # frame rate
        wlen=0.025,             # window length
        nfft=1024,              # length of dft
        do_deltas=False,        # speed
        do_deltasdeltas=False   # acceleration
    )
    fb = np.array(fbanks.transform(sound), dtype='float32')
    return fb
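For context, a hedged usage sketch (the wav path is an assumption): with frate=100 and wlen=0.025, one second of 16 kHz mono audio yields roughly 100 frames of 40 log-mel filterbank values.

# Hypothetical usage sketch -- 'utterance.wav' is an assumed 16 kHz mono file
fb = do_fbank('utterance.wav')
print(fb.shape)   # roughly (n_frames, 40), ~100 frames per second of audio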
Example #7
def do_fbank(fname):
    srate, sound = wavfile.read(fname)
    fbanks = Spectral(
        nfilt=N_FBANKS,  # nb of filters in mel bank
        alpha=0.97,  # pre-emphasis
        do_dct=False,  # we do not want MFCCs
        fs=srate,  # sampling rate
        frate=FBANKS_RATE,  # frame rate
        wlen=FBANKS_WINDOW,  # window length
        nfft=1024,  # length of dft
        do_deltas=False,  # speed
        do_deltasdeltas=False  # acceleration
    )
    fb = fbanks.transform(sound)
    print "did:", fname
    #print fbnk.shape
    return fb
Example #8
def do_mfccs(fname):
    """Compute standard mfccs from a wav file"""
    srate, sound = wavfile.read(fname)
    fbanks = Spectral(
        nfilt=40,               # nb of filters in mel bank
        alpha=0.97,             # pre-emphasis
        fs=srate,               # sampling rate
        frate=100,              # frame rate
        wlen=0.025,             # window length
        nfft=512,               # length of dft
        ncep=13,                # nb of cepstral coefficients
        lowerf=100,
        upperf=6855.4976,
        do_deltas=False,        # speed
        do_deltasdeltas=False   # acceleration
    )
    fb = np.array(fbanks.transform(sound), dtype='float32')
    return fb
Example #9
def do_mfccs(fname):
    """Compute standard mfccs from a wav file"""
    srate, sound = wavfile.read(fname)
    fbanks = Spectral(
        nfilt=40,  # nb of filters in mel bank
        alpha=0.97,  # pre-emphasis
        fs=srate,  # sampling rate
        frate=100,  # frame rate
        wlen=0.025,  # window length
        nfft=512,  # length of dft
        ncep=13,  # nb of cepstral coefficients
        lowerf=100,
        upperf=6855.4976,
        do_deltas=False,  # speed
        do_deltasdeltas=False  # acceleration
    )
    fb = np.array(fbanks.transform(sound), dtype='float32')
    return fb
def do_fbank(fname):
    """Compute standard filterbanks from a wav file"""
    sound, srate = sf.read(fname)
    #f = Sndfile(fname,'r')
    #srate = f.samplerate
    #nf = f.nframes
    #sound = f.read_frames(nf)
    fbanks = Spectral(
        nfilt=40,  # nb of filters in mel bank
        alpha=0.97,  # pre-emphasis
        do_dct=False,  # we do not want MFCCs
        fs=srate,  # sampling rate
        frate=100,  # frame rate
        wlen=0.025,  # window length
        nfft=1024,  # length of dft
        do_deltas=False,  # speed
        do_deltasdeltas=False  # acceleration
    )
    fb = np.array(fbanks.transform(sound), dtype='float32')
    return fb
Example #11
def do_fbank(fname):
    fn = bdir + fname + '.wav'
    try:
        with open(fn[:-3] + 'npy', 'rb') as rfb:
            fb = np.load(rfb)
    except IOError:
        srate, sound = wavfile.read(fn)
        fbanks = Spectral(nfilt=N_FBANKS,    # nb of filters in mel bank
                     alpha=0.97,             # pre-emphasis
                     do_dct=False,           # we do not want MFCCs
                     fs=srate,               # sampling rate
                     frate=FBANKS_RATE,      # frame rate
                     wlen=FBANKS_WINDOW,     # window length
                     nfft=1024,              # length of dft
                     do_deltas=False,       # speed
                     do_deltasdeltas=False  # acceleration
                     )
        fb = np.array(fbanks.transform(sound), dtype='float32')
    print "did:", fn
    #print fb.shape
    return fb
Example #12
def run_all_spectral_clustering(
        rad,
        date_range,
        boxcox=False,
        norm=True,
        params=["bmnum", "v", "p_l", "w_l", "slist", "elv", "time_index"],
        methods=["spc", "spcb", "spcc"],
        m_params={
            "spc": {},
            "spcb": {},
            "spcc": {}
        },
        n_clusters=20):
    """
    Invoke all spectral clustering algorithm
    rad: Radar code
    date_range: Date range
    """
    fd = FetchData(rad, date_range)
    beams, _ = fd.fetch_data(
        v_params=["elv", "v", "w_l", "gflg", "p_l", "slist", "v_e"])
    rec = fd.convert_to_pandas(beams)
    rec["time_index"] = utils.time_days_to_index(
        [x.to_pydatetime() for x in rec["time"].tolist()])
    if boxcox: rec = utils.boxcox_tx(rec)
    if norm: rec = utils.normalize(rec, params)
    print("\n", rec.head())
    for method in methods:
        print("\n >> Running {c} clustering".format(c=method))
        model = Spectral(method, rec[params].values, n_clusters)
        model.setup(m_params[method])
        model.run()

        print("\n Estimating model skills.")
        skill = Skills(model.data, model.obj.labels_)
    return
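An illustrative call of the routine above; the radar code, the date-range format expected by FetchData, and the reduced method list are assumptions, not from the source.

# Hypothetical call -- radar code and date range are assumed for illustration
import datetime as dt

run_all_spectral_clustering(
    "bks",
    [dt.datetime(2015, 3, 17), dt.datetime(2015, 3, 18)],
    methods=["spc"],
    n_clusters=10,
)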
Example #13
class FeatureLoader(TransformerMixin, BaseEstimator):  # class header as in examples #14 and #19
    def __init__(self,
                 stacksize=40,
                 normalize='mvn',
                 n_noise_fr=0,
                 fs=16000,
                 window_length=0.050,
                 window_shift=0.010,
                 nfft=1024,
                 scale='mel',
                 lowerf=120,
                 upperf=7000,
                 nfilt=40,
                 taper_filt=True,
                 compression='log',
                 dct=False,
                 nceps=13,
                 log_e=True,
                 lifter=22,
                 deltas=False,
                 remove_dc=False,
                 medfilt_t=0,
                 medfilt_s=(0, 0),
                 noise_fr=0,
                 pre_emph=0.97,
                 feat_cache=None,
                 noise_cache=None,
                 wav_cache=None,
                 n_jobs=1,
                 verbose=False):
        self.stacksize = stacksize
        self.normalize = normalize
        if self.normalize == 'mvn':
            self.normalizer = StandardScaler()
        elif self.normalize == 'zca':
            self.normalizer = ZCA()
        elif self.normalize == 'minmax':
            self.normalizer = MinMaxScaler()
        else:
            self.normalizer = IdentityTransform()
        self.n_noise_fr = n_noise_fr
        self.fs = fs
        self.window_length = window_length
        self.window_shift = window_shift
        self.nfft = nfft
        self.scale = scale
        self.lowerf = lowerf
        self.upperf = upperf
        self.nfilt = nfilt
        self.taper_filt = taper_filt
        self.compression = compression
        self.dct = dct
        self.nceps = nceps
        self.log_e = log_e
        self.lifter = lifter
        self.deltas = deltas
        self.remove_dc = remove_dc
        self.medfilt_t = medfilt_t
        self.medfilt_s = medfilt_s
        self.noise_fr = noise_fr
        self.pre_emph = pre_emph

        self.n_jobs = n_jobs
        self.verbose = verbose

        self.encoder = Spectral(fs=fs,
                                window_length=window_length,
                                window_shift=window_shift,
                                nfft=nfft,
                                scale=scale,
                                lowerf=lowerf,
                                upperf=upperf,
                                nfilt=nfilt,
                                taper_filt=taper_filt,
                                compression=compression,
                                dct=dct,
                                nceps=nceps,
                                log_e=log_e,
                                lifter=lifter,
                                deltas=deltas,
                                remove_dc=remove_dc,
                                medfilt_t=medfilt_t,
                                medfilt_s=medfilt_s,
                                noise_fr=noise_fr,
                                pre_emph=pre_emph)
        self.D = self.encoder.n_features * self.stacksize
        self.wav_cache = wav_cache if wav_cache else {}
        self.noise_cache = noise_cache if noise_cache else {}
        self.feat_cache = feat_cache if feat_cache else {}
Example #14
class FeatureLoader(TransformerMixin, BaseEstimator):
    def __init__(self,
                 stacksize=40,
                 normalize='mvn',
                 n_noise_fr=0,
                 fs=16000,
                 window_length=0.050,
                 window_shift=0.010,
                 nfft=1024,
                 scale='mel',
                 lowerf=120,
                 upperf=7000,
                 nfilt=40,
                 taper_filt=True,
                 compression='log',
                 dct=False,
                 nceps=13,
                 log_e=True,
                 lifter=22,
                 deltas=False,
                 remove_dc=False,
                 medfilt_t=0,
                 medfilt_s=(0, 0),
                 noise_fr=0,
                 pre_emph=0.97,
                 feat_cache=None,
                 noise_cache=None,
                 wav_cache=None,
                 n_jobs=1,
                 verbose=False):
        self.stacksize = stacksize
        self.normalize = normalize
        if self.normalize == 'mvn':
            self.normalizer = StandardScaler()
        elif self.normalize == 'zca':
            self.normalizer = ZCA()
        elif self.normalize == 'minmax':
            self.normalizer = MinMaxScaler()
        else:
            self.normalizer = IdentityTransform()
        self.n_noise_fr = n_noise_fr
        self.fs = fs
        self.window_length = window_length
        self.window_shift = window_shift
        self.nfft = nfft
        self.scale = scale
        self.lowerf = lowerf
        self.upperf = upperf
        self.nfilt = nfilt
        self.taper_filt = taper_filt
        self.compression = compression
        self.dct = dct
        self.nceps = nceps
        self.log_e = log_e
        self.lifter = lifter
        self.deltas = deltas
        self.remove_dc = remove_dc
        self.medfilt_t = medfilt_t
        self.medfilt_s = medfilt_s
        self.noise_fr = noise_fr
        self.pre_emph = pre_emph

        self.n_jobs = n_jobs
        self.verbose = verbose

        self.encoder = Spectral(fs=fs,
                                window_length=window_length,
                                window_shift=window_shift,
                                nfft=nfft,
                                scale=scale,
                                lowerf=lowerf,
                                upperf=upperf,
                                nfilt=nfilt,
                                taper_filt=taper_filt,
                                compression=compression,
                                dct=dct,
                                nceps=nceps,
                                log_e=log_e,
                                lifter=lifter,
                                deltas=deltas,
                                remove_dc=remove_dc,
                                medfilt_t=medfilt_t,
                                medfilt_s=medfilt_s,
                                noise_fr=noise_fr,
                                pre_emph=pre_emph)
        self.D = self.encoder.n_features * self.stacksize
        self.wav_cache = wav_cache if wav_cache else {}
        self.noise_cache = noise_cache if noise_cache else {}
        self.feat_cache = feat_cache if feat_cache else {}

    def clear_cache(self):
        self.wav_cache = {}
        self.noise_cache = {}
        self.feat_cache = {}

    def get_params(self, deep=True):
        p = super(FeatureLoader, self).get_params()
        del p['n_jobs']
        del p['verbose']
        return p

    def get_key(self):
        """'Frozen' dictionary representation of this object's parameters.
        Used as key in caching.
        """
        p = self.get_params()
        del p['wav_cache']
        del p['noise_cache']
        del p['feat_cache']
        return tuple(sorted(p.items()))

    def _load_wav(self, fname):
        """
        Memoized audio loader.
        """
        key = fname
        if not key in self.wav_cache:
            sig, fs_ = wavread(fname)
            if self.fs != fs_:
                raise ValueError('sampling rate should be {0}, not {1}. '
                                 'please resample.'.format(self.fs, fs_))
            if len(sig.shape) > 1:
                warnings.warn('stereo audio: merging channels')
                sig = (sig[:, 0] + sig[:, 1]) / 2
            self.wav_cache[key] = sig
        return self.wav_cache[key]

    def _fill_noise_cache(self, X):
        for fname in X[:, 0]:
            self._extract_noise(fname)

    def _extract_noise(self, fname):
        cfg = (('fs', self.fs), ('window_length', self.window_length),
               ('window_shift', self.window_shift), ('nfft', self.nfft),
               ('remove_dc', self.remove_dc), ('medfilt_t', self.medfilt_t),
               ('medfilt_s', self.medfilt_s), ('pre_emph', self.pre_emph))
        key = (fname, cfg)
        if not key in self.noise_cache:
            if self.n_noise_fr == 0:
                self.noise_cache[key] = None
            else:
                sig = self._load_wav(fname)
                nsamples = (self.n_noise_fr + 2) * self.encoder.fshift
                spec = self.encoder.get_spectrogram(sig[:nsamples])[2:, :]
                noise = spec.mean(axis=0)
                noise = np.clip(noise, 1e-4, np.inf)
                self.noise_cache[key] = noise
        return self.noise_cache[key]

    def _fill_feat_cache(self, X_keys):
        sigs = [self._load_wav(fname) for fname, _ in X_keys]
        noises = [self._extract_noise(fname) for fname, _ in X_keys]
        p = Parallel(n_jobs=self.n_jobs, verbose=0)(
            delayed(extract_features_at)(sig, noise, start, self.stacksize,
                                         self.encoder)
            for (fname, start), sig, noise in izip(X_keys, sigs, noises))
        r = {x_key: feat for x_key, feat in izip(X_keys, p)}
        key = self.get_key()
        self.feat_cache[key].update(r)

    def get_specs(self, X):
        key = self.get_key()
        # list of [(filename, start)]
        X_keys = [(X[ix, 0], X[ix, 1]) for ix in xrange(X.shape[0])]
        if key in self.feat_cache:
            # check for missing keys
            missing_X_keys = [
                x_key for x_key in X_keys if not x_key in self.feat_cache[key]
            ]
            self._fill_feat_cache(missing_X_keys)
        else:
            self.feat_cache[key] = {}
            self._fill_feat_cache(X_keys)
        return np.vstack((self.feat_cache[key][x_key] for x_key in X_keys))

    def fit(self, X, y=None):
        """Load audio and optionally estimate mean and covar

        Parameters
        ----------
        X : ndarray with columns
            filename, start, end
        y :
        """
        r = self.get_specs(X)
        self.normalizer.fit(r)
        return self

    def transform(self, X, y=None):
        """Load audio and perform feature extraction.

        Parameters
        ----------
        X : ndarray
        """
        r = self.get_specs(X)
        r = self.normalizer.transform(r)
        return as_strided(r,
                          shape=(r.shape[0] // self.stacksize,
                                 r.shape[1] * self.stacksize),
                          strides=(r.strides[0] * self.stacksize,
                                   r.strides[1]))

    def fit_transform(self, X, y=None):
        r = self.get_specs(X)
        r = self.normalizer.fit_transform(r)
        return as_strided(r,
                          shape=(r.shape[0] // self.stacksize,
                                 r.shape[1] * self.stacksize),
                          strides=(r.strides[0] * self.stacksize,
                                   r.strides[1]))
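A rough usage sketch for FeatureLoader, under assumptions: each row of X is a (filename, start) pair as consumed by get_specs, the unit of start matches what extract_features_at expects, and the wav files exist at 16 kHz.

# Hypothetical usage sketch -- file names, start values and their units are assumed
import numpy as np

loader = FeatureLoader(stacksize=40, normalize='mvn', fs=16000, n_jobs=1)
X = np.array([['utt1.wav', 0.0],
              ['utt2.wav', 0.5]], dtype=object)
feats = loader.fit_transform(X)   # stacked, normalized spectral frames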
Example #15
def process(folder,
        debug=False,
        htk_mfc=False,
        forcemfcext=False,
        stereo_wav=False,
        gammatones=False,
        spectrograms=False,
        filterbanks=False,
        sox=True):
    """ applies to all *.wav in folder """

    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    wcfg = open('wav_config', 'r')
    for line in wcfg:
        if "ENORMALISE" in line:
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print "MFC extension:", mfc_extension
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            print >> sys.stderr, "You need Brian Hears"
            print >> sys.stderr, "http://www.briansimulator.org/docs/\
                    hears.html"
            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            print >> sys.stderr, "You need Pylab"
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            print >> sys.stderr, "You need spectral (in the parent folder)"
            print >> sys.stderr, "https://github.com/mwv/spectral"
            sys.exit(-1)

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.wav':
                continue
            rawfname = bdir+'/'+fname[:-4]+'.rawaudio'
            wavfname = bdir+'/'+fname
            tempfname = bdir+'/'+fname[:-4]+'_temp.wav'
            # temp fname with .wav for sox
            mfccfname = bdir+'/'+fname[:-4]+mfc_extension
            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                # w/o headers, sox uses extension
                shutil.move(tempfname, rawfname)
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            srate = 16000
            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2: # in mono sound is a list
                sound = sound[:, 0] + sound[:, 1]
                # for stereo wav, sum both channels
            if gammatones:
                gammatonefname = bdir+'/'+fname[:-4]+'_gamma.npy'
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20*Hz, 20*kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                with open(gammatonefname, 'w') as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * SPECGRAM_WINDOW),
                    Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                specgramfname = bdir+'/'+fname[:-4]+'_specgram.npy'
                with open(specgramfname, 'w') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                # convert to Mel filterbanks
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Spectral(nfilt=N_FBANKS,    # nb of filters in mel bank
                                 alpha=0.97,             # pre-emphasis
                                 do_dct=False,           # we do not want MFCCs
                                 fs=srate,               # sampling rate
                                 frate=FBANKS_RATE,      # frame rate
                                 wlen=FBANKS_WINDOW,     # window length
                                 nfft=1024,              # length of dft
                                 do_deltas=False,       # speed
                                 do_deltasdeltas=False  # acceleration
                                 )
                fbank = fbanks.transform(sound)[0]  # first dimension is for
                                                    # deltas & deltasdeltas
                fbanksfname = bdir+'/'+fname[:-4]+'_fbanks.npy'
                with open(fbanksfname, 'w') as o_f:
                    npsave(o_f, fbank)
            # TODO wavelets scattergrams / scalograms
            print "dealt with file", wavfname
Example #16
import struct
import sys
import wave

import numpy as np
from spectral import Spectral


def readwav(fname):
    # imports and function header reconstructed from the calls below (cf. example #21)
    fid = wave.open(fname, 'r')
    _, _, fs, nframes, _, _ = fid.getparams()
    sig = np.array(struct.unpack_from("%dh" % nframes,
                                      fid.readframes(nframes)))
    fid.close()
    return sig, fs


FBANKS_WINDOW = 0.025  # 25ms
FBANKS_RATE = 100  # 10ms
N_FBANKS = 40

for wavfname in sys.argv[1:]:
    sound, srate = readwav(wavfname)
    fbanks = Spectral(
        nfilt=N_FBANKS,  # nb of filters in mel bank
        alpha=0.97,  # pre-emphasis
        do_dct=False,  # we do not want MFCCs
        compression='log',
        fs=srate,  # sampling rate
        #lowerf=50,                # lower frequency
        frate=FBANKS_RATE,  # frame rate
        wlen=FBANKS_WINDOW,  # window length
        nfft=1024,  # length of dft
        do_deltas=False,  # speed
        do_deltasdeltas=False  # acceleration
    )
    fbank = fbanks.transform(sound)
    fbanksfname = wavfname[:-4] + '_fbanks.npy'
    with open(fbanksfname, 'w') as o_f:
        np.save(o_f, fbank)
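This snippet is a command-line script; a hedged invocation example follows (the script name is an assumption).

# Hypothetical invocation (script name assumed):
#   python wav_to_fbanks.py utt1.wav utt2.wav
# writes utt1_fbanks.npy and utt2_fbanks.npy next to the input files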
Example #17
def process(folder,
            debug=False,
            htk_mfc=False,
            forcemfcext=False,
            stereo_wav=False,
            gammatones=False,
            spectrograms=False,
            filterbanks=False,
            sox=True):
    """ applies to all *.wav in folder """

    # first find if we produce normalized MFCC, otherwise note it in the ext
    # because we can then normalize on the whole corpus with another py script
    mfc_extension = '.mfc_unnorm'
    wcfg = open('wav_config', 'r')
    for line in wcfg:
        if "ENORMALISE" in line:
            mfc_extension = '.mfc'
    if forcemfcext:
        mfc_extension = '.mfc'
    print "MFC extension:", mfc_extension
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone
        except ImportError:
            print >> sys.stderr, "You need Brian Hears"
            print >> sys.stderr, "http://www.briansimulator.org/docs/\
                    hears.html"

            sys.exit(-1)
    if spectrograms:
        try:
            from pylab import specgram
        except ImportError:
            print >> sys.stderr, "You need Pylab"
            sys.exit(-1)
    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral
        except ImportError:
            print >> sys.stderr, "You need spectral (in the parent folder)"
            print >> sys.stderr, "https://github.com/mwv/spectral"
            sys.exit(-1)

    # run through all the folders and files in the path "folder"
    # and put a header to the waves, save the originals as .rawaudio
    # use HCopy to produce MFCC files according to "wav_config" file
    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.wav':
                continue
            rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
            wavfname = bdir + '/' + fname
            tempfname = bdir + '/' + fname[:-4] + '_temp.wav'
            # temp fname with .wav for sox
            mfccfname = bdir + '/' + fname[:-4] + mfc_extension
            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                # w/o headers, sox uses extension
                shutil.move(tempfname, rawfname)
            if htk_mfc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            srate = 16000
            srate, sound = wavfile.read(wavfname)
            if stereo_wav and len(sound.shape) == 2:  # in mono sound is a list
                sound = sound[:, 0] + sound[:, 1]
                # for stereo wav, sum both channels
            if gammatones:
                gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20 * Hz, 20 * kHz, N_GAMMATONES_FILTERS)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                with open(gammatonefname, 'w') as o_f:
                    npsave(o_f, gamma_fb.process())
            if spectrograms:
                powerspec, _, _, _ = specgram(
                    sound,
                    NFFT=int(srate * SPECGRAM_WINDOW),
                    Fs=srate,
                    noverlap=int(srate * SPECGRAM_OVERLAP))  # TODO
                specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
                with open(specgramfname, 'w') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                # convert to Mel filterbanks
                if fbanks is None:  # assume parameters are fixed
                    fbanks = Spectral(
                        nfilt=N_FBANKS,  # nb of filters in mel bank
                        alpha=0.97,  # pre-emphasis
                        do_dct=False,  # we do not want MFCCs
                        fs=srate,  # sampling rate
                        frate=FBANKS_RATE,  # frame rate
                        wlen=FBANKS_WINDOW,  # window length
                        nfft=1024,  # length of dft
                        do_deltas=False,  # speed
                        do_deltasdeltas=False  # acceleration
                    )
                fbank = fbanks.transform(sound)[0]  # first dimension is for
                # deltas & deltasdeltas
                fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
                with open(fbanksfname, 'w') as o_f:
                    npsave(o_f, fbank)
            # TODO wavelets scattergrams / scalograms
            print "dealt with file", wavfname
Example #18
def process(folder, debug=False, htk_mfcc=False, forcemfcext=False,
            stereo_wave=False, gammatones=False, spectograms=False,
            filterbanks=False, sox=True):
    mfc_extension = '.mfc_unnorm'
    wcfg = open('wav_config','r')
    for line in wcfg:
        if "ENORMALISE" in line:
            mfc_extension = '.mfc'

    if forcemfcext:
        mfc_extension = '.mfc'
    print "MFC Extension is", mfc_extension
    if gammatones:
        try:
            from brian import Hz, kHz
            from brian.hears import loadsound, erbspace, Gammatone

        except ImportError:
            print >> sys.stderr, "You need Brian Hears"

            sys.exit(-1)

    if spectograms:
        try:
            from pylab import specgram

        except ImportError:
            print >> sys.stderr,'You need Pylab'
            sys.exit(-1)

    fbanks = None
    if filterbanks:
        try:
            sys.path.append('../spectral')
            from spectral import Spectral

        except ImportError:
            print >> sys.stderr, 'you need spectral (in the parent folder)'
            sys.exit(-1)

    for bdir, _, files in os.walk(folder):
        for fname in files:
            if fname[-4:] != '.WAV':
                continue
            rawfname = bdir + '/' + fname[:-4] + '.rawaudio'
            wavfname = bdir + '/' + fname
            tempfname = bdir + '/' + fname[:-4] + '_temp.wav'
            mfccfname = bdir + '/' + fname[:-4] + '.txt'
            if sox:
                shutil.move(wavfname, tempfname)
                call(['sox', tempfname, wavfname])
                shutil.move(tempfname, rawfname)  # keep the headerless original

            if htk_mfcc:
                call(['HCopy', '-C', 'wav_config', wavfname, mfccfname])
            srate = 16000

            srate, sound = wavfile.read(wavfname)
            if stereo_wave and len(sound.shape) == 2:
                sound = sound[:, 0] + sound[:, 1]
            if gammatones:
                gammatonefname = bdir + '/' + fname[:-4] + '_gamma.npy'
                tmp_snd = loadsound(wavfname)
                gamma_cf = erbspace(20*Hz, 20*kHz, n_gammatones_filters)
                gamma_fb = Gammatone(tmp_snd, gamma_cf)
                with open(gammatonefname, 'w') as o_f:
                    npsave(o_f, gamma_fb.process())

            if spectograms:
                powerspec, _, _, _ = specgram(
                    sound, NFFT=int(srate * specgram_window), Fs=srate,
                    noverlap=int(srate * specgram_window))
                specgramfname = bdir + '/' + fname[:-4] + '_specgram.npy'
                with open(specgramfname, 'w') as o_f:
                    npsave(o_f, powerspec.T)
            if filterbanks:
                if fbanks is None:
                    fbanks = Spectral(nfilt=n_fbanks, alpha=0.97, do_dct=False,
                                      fs=srate, frate=fbanks_rate,
                                      wlen=fbanks_window, nfft=1024,
                                      do_deltas=False, do_deltasdeltas=False)
                fbank = fbanks.transform(sound)[0]
                fbanksfname = bdir + '/' + fname[:-4] + '_fbanks.npy'
                with open(fbanksfname, 'w') as o_f:
                    npsave(o_f, fbank)
            print "Dealt with the file ", wavfname
Example #19
class FeatureLoader(TransformerMixin, BaseEstimator):
    def __init__(self,
                 stacksize=40,
                 normalize='mvn',
                 n_noise_fr=0,
                 fs=16000,
                 window_length=0.050,
                 window_shift=0.010,
                 nfft=1024,
                 scale='mel',
                 lowerf=120,
                 upperf=7000,
                 nfilt=40,
                 taper_filt=True,
                 compression='log',
                 dct=False,
                 nceps=13,
                 log_e=True,
                 lifter=22,
                 deltas=False,
                 remove_dc=False,
                 medfilt_t=0,
                 medfilt_s=(0, 0),
                 noise_fr=0,
                 pre_emph=0.97,
                 feat_cache=None, noise_cache=None, wav_cache=None,
                 n_jobs=1, verbose=False):
        self.stacksize = stacksize
        self.normalize = normalize
        if self.normalize == 'mvn':
            self.normalizer = StandardScaler()
        elif self.normalize == 'zca':
            self.normalizer = ZCA()
        elif self.normalize == 'minmax':
            self.normalizer = MinMaxScaler()
        else:
            self.normalizer = IdentityTransform()
        self.n_noise_fr = n_noise_fr
        self.fs = fs
        self.window_length = window_length
        self.window_shift = window_shift
        self.nfft = nfft
        self.scale = scale
        self.lowerf = lowerf
        self.upperf = upperf
        self.nfilt = nfilt
        self.taper_filt = taper_filt
        self.compression = compression
        self.dct = dct
        self.nceps = nceps
        self.log_e = log_e
        self.lifter = lifter
        self.deltas = deltas
        self.remove_dc = remove_dc
        self.medfilt_t = medfilt_t
        self.medfilt_s = medfilt_s
        self.noise_fr = noise_fr
        self.pre_emph = pre_emph

        self.n_jobs = n_jobs
        self.verbose = verbose

        self.encoder = Spectral(
            fs=fs,
            window_length=window_length,
            window_shift=window_shift,
            nfft=nfft,
            scale=scale,
            lowerf=lowerf,
            upperf=upperf,
            nfilt=nfilt,
            taper_filt=taper_filt,
            compression=compression,
            dct=dct,
            nceps=nceps,
            log_e=log_e,
            lifter=lifter,
            deltas=deltas,
            remove_dc=remove_dc,
            medfilt_t=medfilt_t,
            medfilt_s=medfilt_s,
            noise_fr=noise_fr,
            pre_emph=pre_emph
        )
        self.D = self.encoder.n_features * self.stacksize
        self.wav_cache = wav_cache if wav_cache else {}
        self.noise_cache = noise_cache if noise_cache else {}
        self.feat_cache = feat_cache if feat_cache else {}

    def clear_cache(self):
        self.wav_cache = {}
        self.noise_cache = {}
        self.feat_cache = {}

    def get_params(self, deep=True):
        p = super(FeatureLoader, self).get_params()
        del p['n_jobs']
        del p['verbose']
        return p

    def get_key(self):
        """'Frozen' dictionary representation of this object's parameters.
        Used as key in caching.
        """
        p = self.get_params()
        del p['wav_cache']
        del p['noise_cache']
        del p['feat_cache']
        return tuple(sorted(p.items()))

    def _load_wav(self, fname):
        """
        Memoized audio loader.
        """
        key = fname
        if not key in self.wav_cache:
            sig, fs_ = wavread(fname)
            if self.fs != fs_:
                raise ValueError('sampling rate should be {0}, not {1}. '
                                 'please resample.'.format(self.fs, fs_))
            if len(sig.shape) > 1:
                warnings.warn('stereo audio: merging channels')
                sig = (sig[:, 0] + sig[:, 1]) / 2
            self.wav_cache[key] = sig
        return self.wav_cache[key]

    def _fill_noise_cache(self, X):
        for fname in X[:, 0]:
            self._extract_noise(fname)

    def _extract_noise(self, fname):
        cfg = (
            ('fs', self.fs),
            ('window_length', self.window_length),
            ('window_shift', self.window_shift),
            ('nfft', self.nfft),
            ('remove_dc', self.remove_dc),
            ('medfilt_t', self.medfilt_t),
            ('medfilt_s', self.medfilt_s),
            ('pre_emph', self.pre_emph)
        )
        key = (fname, cfg)
        if not key in self.noise_cache:
            if self.n_noise_fr == 0:
                self.noise_cache[key] = None
            else:
                sig = self._load_wav(fname)
                nsamples = (self.n_noise_fr + 2) * self.encoder.fshift
                spec = self.encoder.get_spectrogram(sig[:nsamples])[2:, :]
                noise = spec.mean(axis=0)
                noise = np.clip(noise, 1e-4, np.inf)
                self.noise_cache[key] = noise
        return self.noise_cache[key]

    def _fill_feat_cache(self, X_keys):
        sigs = [self._load_wav(fname) for fname, _ in X_keys]
        noises = [self._extract_noise(fname) for fname, _ in X_keys]
        p = Parallel(n_jobs=self.n_jobs, verbose=0)(
            delayed(extract_features_at)(
                sig, noise, start, self.stacksize, self.encoder)
            for (fname, start), sig, noise in izip(X_keys, sigs, noises)
        )
        r = {x_key: feat
             for x_key, feat in izip(X_keys, p)}
        key = self.get_key()
        self.feat_cache[key].update(r)

    def get_specs(self, X):
        key = self.get_key()
        # list of [(filename, start)]
        X_keys = [(X[ix, 0], X[ix, 1]) for ix in xrange(X.shape[0])]
        if key in self.feat_cache:
            # check for missing keys
            missing_X_keys = [
                x_key
                for x_key in X_keys
                if not x_key in self.feat_cache[key]
            ]
            self._fill_feat_cache(missing_X_keys)
        else:
            self.feat_cache[key] = {}
            self._fill_feat_cache(X_keys)
        return np.vstack((self.feat_cache[key][x_key] for x_key in X_keys))

    def fit(self, X, y=None):
        """Load audio and optionally estimate mean and covar

        Parameters
        ----------
        X : ndarray with columns
            filename, start, end
        y :
        """
        r = self.get_specs(X)
        self.normalizer.fit(r)
        return self

    def transform(self, X, y=None):
        """Load audio and perform feature extraction.

        Parameters
        ----------
        X : ndarray
        """
        r = self.get_specs(X)
        r = self.normalizer.transform(r)
        return as_strided(
            r,
            shape=(r.shape[0]//self.stacksize, r.shape[1]*self.stacksize),
            strides=(r.strides[0]*self.stacksize, r.strides[1])
        )

    def fit_transform(self, X, y=None):
        r = self.get_specs(X)
        r = self.normalizer.fit_transform(r)
        return as_strided(
            r,
            shape=(r.shape[0]//self.stacksize, r.shape[1]*self.stacksize),
            strides=(r.strides[0]*self.stacksize, r.strides[1])
        )
Example #20
class FeatureLoader(TransformerMixin, BaseEstimator):  # class header as in examples #14 and #19
    def __init__(self,
                 stacksize=40,
                 normalize='mvn',
                 n_noise_fr=0,
                 fs=16000,
                 window_length=0.050,
                 window_shift=0.010,
                 nfft=1024,
                 scale='mel',
                 lowerf=120,
                 upperf=7000,
                 nfilt=40,
                 taper_filt=True,
                 compression='log',
                 dct=False,
                 nceps=13,
                 log_e=True,
                 lifter=22,
                 deltas=False,
                 remove_dc=False,
                 medfilt_t=0,
                 medfilt_s=(0, 0),
                 noise_fr=0,
                 pre_emph=0.97,
                 feat_cache=None, noise_cache=None, wav_cache=None,
                 n_jobs=1, verbose=False):
        self.stacksize = stacksize
        self.normalize = normalize
        if self.normalize == 'mvn':
            self.normalizer = StandardScaler()
        elif self.normalize == 'zca':
            self.normalizer = ZCA()
        elif self.normalize == 'minmax':
            self.normalizer = MinMaxScaler()
        else:
            self.normalizer = IdentityTransform()
        self.n_noise_fr = n_noise_fr
        self.fs = fs
        self.window_length = window_length
        self.window_shift = window_shift
        self.nfft = nfft
        self.scale = scale
        self.lowerf = lowerf
        self.upperf = upperf
        self.nfilt = nfilt
        self.taper_filt = taper_filt
        self.compression = compression
        self.dct = dct
        self.nceps = nceps
        self.log_e = log_e
        self.lifter = lifter
        self.deltas = deltas
        self.remove_dc = remove_dc
        self.medfilt_t = medfilt_t
        self.medfilt_s = medfilt_s
        self.noise_fr = noise_fr
        self.pre_emph = pre_emph

        self.n_jobs = n_jobs
        self.verbose = verbose

        self.encoder = Spectral(
            fs=fs,
            window_length=window_length,
            window_shift=window_shift,
            nfft=nfft,
            scale=scale,
            lowerf=lowerf,
            upperf=upperf,
            nfilt=nfilt,
            taper_filt=taper_filt,
            compression=compression,
            dct=dct,
            nceps=nceps,
            log_e=log_e,
            lifter=lifter,
            deltas=deltas,
            remove_dc=remove_dc,
            medfilt_t=medfilt_t,
            medfilt_s=medfilt_s,
            noise_fr=noise_fr,
            pre_emph=pre_emph
        )
        self.D = self.encoder.n_features * self.stacksize
        self.wav_cache = wav_cache if wav_cache else {}
        self.noise_cache = noise_cache if noise_cache else {}
        self.feat_cache = feat_cache if feat_cache else {}
Example #21
import struct
import sys
import wave

import numpy as np
from spectral import Spectral


def readwav(fname):
    # imports and function header reconstructed from the calls below
    fid = wave.open(fname, 'r')
    _, _, fs, nframes, _, _ = fid.getparams()
    sig = np.array(struct.unpack_from("%dh" % nframes,
                                      fid.readframes(nframes)))
    fid.close()
    return sig, fs

FBANKS_WINDOW = 0.025 # 25ms
FBANKS_RATE = 100 # 10ms
N_FBANKS = 40

for wavfname in sys.argv[1:]:
    sound, srate = readwav(wavfname)
    fbanks = Spectral(nfilt=N_FBANKS,      # nb of filters in mel bank
            alpha=0.97,               # pre-emphasis
            do_dct=False,             # we do not want MFCCs
            compression='log',
            fs=srate,                 # sampling rate
            #lowerf=50,                # lower frequency
            frate=FBANKS_RATE,        # frame rate
            wlen=FBANKS_WINDOW,       # window length
            nfft=1024,                # length of dft
            do_deltas=False,          # speed
            do_deltasdeltas=False     # acceleration
            )
    fbank = fbanks.transform(sound)
    fbanksfname = wavfname[:-4]+'_fbanks.npy'
    with open(fbanksfname, 'w') as o_f:
        np.save(o_f, fbank)