-
Notifications
You must be signed in to change notification settings - Fork 0
/
mfcc.py
119 lines (98 loc) · 3.76 KB
/
mfcc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import numpy as np
from scipy.io import loadmat
from scipy.signal import lfilter, hamming
from scipy.fftpack import fft
from scipy.fftpack.realtransforms import dct
from segmentaxis import segment_axis
import math
from mel import hz2mel
def trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfilt, nlogfilt):
    """Compute triangular filterbank for MFCC computation.

    Parameters
    ----------
    fs : float
        Sampling rate in Hz; used to convert frequencies to FFT bin indices.
    nfft : int
        FFT length. Every filter is described over ``nfft`` bins.
    lowfreq : float
        Frequency (Hz) of the first filter edge.
    linsc : float
        Step (Hz) between consecutive linearly spaced edges.
    logsc : float
        Ratio between consecutive logarithmically spaced edges.
    nlinfilt : int
        Number of linearly spaced edge points at the start of the bank.
    nlogfilt : int
        Number of logarithmically spaced filters.

    Returns
    -------
    fbank : ndarray, shape (nlinfilt + nlogfilt, nfft)
        One row per triangular filter, zero outside the filter support.
    freqs : ndarray, shape (nlinfilt + nlogfilt + 2,)
        Edge frequencies (Hz); filter ``i`` spans ``freqs[i]`` to
        ``freqs[i + 2]`` and peaks near ``freqs[i + 1]``.

    Notes
    -----
    Bin indices are not clipped: an edge frequency that maps to a bin
    index >= ``nfft`` raises ``IndexError``.
    """
    # Total number of filters
    nfilt = nlinfilt + nlogfilt

    # Start/middle/end points of the triangular filters in the spectral
    # domain: linear spacing first, then geometric spacing that continues
    # from the last linear edge.
    freqs = np.zeros(nfilt + 2)
    freqs[:nlinfilt] = lowfreq + np.arange(nlinfilt) * linsc
    freqs[nlinfilt:] = freqs[nlinfilt - 1] * logsc ** np.arange(1, nlogfilt + 3)
    # Peak height is 2 / bandwidth for each filter.
    heights = 2. / (freqs[2:] - freqs[0:-2])

    # Filterbank coefficients (in fft domain, in bins)
    fbank = np.zeros((nfilt, nfft))
    # FFT bins (in Hz)
    nfreqs = np.arange(nfft) / (1. * nfft) * fs

    for i in range(nfilt):
        low = freqs[i]
        cen = freqs[i + 1]
        hi = freqs[i + 2]

        # Builtin ``int`` instead of the ``np.int`` alias, which was removed
        # from NumPy; the resulting dtype is the same.
        lid = np.arange(np.floor(low * nfft / fs) + 1,
                        np.floor(cen * nfft / fs) + 1, dtype=int)
        lslope = heights[i] / (cen - low)
        rid = np.arange(np.floor(cen * nfft / fs) + 1,
                        np.floor(hi * nfft / fs) + 1, dtype=int)
        rslope = heights[i] / (hi - cen)

        # Rising edge from `low` to `cen`, falling edge from `cen` to `hi`.
        fbank[i][lid] = lslope * (nfreqs[lid] - low)
        fbank[i][rid] = rslope * (hi - nfreqs[rid])

    return fbank, freqs
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients.

    Parameters
    ----------
    input: ndarray
        input from which the coefficients are computed
    nwin: int
        length of the analysis window (and of each frame)
    nfft: int
        FFT length used for the spectrum and the filterbank
    fs: int
        sampling rate in Hz, forwarded to the filterbank computation
    nceps: int
        number of leading DCT coefficients kept per frame

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients (first `nceps` DCT coefficients of
        `mspec` along the last axis)
    mspec: ndarray
        Log-spectrum (base 10) in the mel-domain.
    spec: ndarray
        Magnitude of the `nfft`-point FFT of the windowed frames.

    Notes
    -----
    MFCC are computed as follows:
        * Pre-processing in time-domain (pre-emphasizing)
        * Compute the spectrum amplitude by windowing with a Hamming window
        * Filter the signal in the spectral domain with a triangular
          filter-bank, whose filters are approximately linearly spaced on
          the mel scale, and have equal bandwidth in the mel scale
        * Compute the DCT of the log-spectrum

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""
    # MFCC parameters: taken from auditory toolbox
    # Value handed to segment_axis together with nwin
    # (presumably the overlap between consecutive frames -- confirm against
    # the segment_axis helper).
    over = 170
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.97

    # Filterbank layout: 13 linearly spaced points starting at ~400/3 Hz with
    # a 200/3 Hz step, followed by 27 log-spaced filters.
    lowfreq = 133.33
    linsc = 200 / 3.
    logsc = 1.0711703
    nlinfil = 13
    nlogfil = 27

    # NOTE(review): `hamming` is taken from the file-level scipy.signal
    # import; newer SciPy exposes it as scipy.signal.windows.hamming --
    # confirm against the installed version.
    w = hamming(nwin, sym=0)

    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    #------------------
    # Compute the MFCC
    #------------------
    extract = preemp(input, prefac)
    framed = segment_axis(extract, nwin, over) * w

    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))
    # Filter the spectrum through the triangle filterbank; the clip keeps the
    # log finite when a filter output is zero.
    mspec = np.log10(np.clip(np.dot(spec, fbank.T), 1e-9, np.inf))
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, :nceps]

    return ceps, mspec, spec
def preemp(input, p):
    """Apply a first-order pre-emphasis FIR filter to `input`.

    The filter has numerator ``[1, -p]`` and unit denominator, i.e. each
    output sample is ``input[n] - p * input[n - 1]``.
    """
    numerator = [1., -p]
    denominator = 1
    return lfilter(numerator, denominator, input)
if __name__ == '__main__':
    # Demo run: load the 'extract' variable from extract.mat and compute its
    # MFCCs. mfcc returns a 3-tuple, so unpack it rather than binding the
    # whole tuple to `ceps`.
    extract = loadmat('extract.mat')['extract']
    ceps, mspec, spec = mfcc(extract)