Пример #1
0
def get_mel_scale(nfilt=20, samplerate=16000, lowfreq=20, highfreq=8000):
    """Return ``nfilt + 2`` band-edge points evenly spaced between two mel values.

    :param nfilt: number of filters; two extra points are generated for the outer edges.
    :param samplerate: sample rate of the signal; ``highfreq`` may not exceed half of it.
    :param lowfreq: lowest band edge, converted with ``hz2mel``.
    :param highfreq: highest band edge, converted with ``hz2mel``; a falsy value
        selects ``samplerate / 2``.
    :returns: the ``np.linspace`` result between the two converted edges (not converted back).
    """
    if not highfreq:
        highfreq = samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    # The spacing is uniform in the mel domain, so convert both ends first.
    mel_low, mel_high = hz2mel(lowfreq), hz2mel(highfreq)
    return np.linspace(mel_low, mel_high, nfilt + 2)
Пример #2
0
    def __init__(self, input_dim, sr, num_filter, exp=False, filter_fix=False):
        """Create the frequency grid and the mel-spaced band parameters.

        :param input_dim: number of points in the frequency grid (0 .. sr / 2).
        :param sr: sample rate; the grid and the band edges span up to ``sr / 2``.
        :param num_filter: number of bands; ``num_filter + 2`` edges are generated.
        :param exp: stored on the instance; not used inside this constructor.
        :param filter_fix: when true, the band parameters are created with
            ``requires_grad=False``.
        """
        super(fBPLayer, self).__init__()
        self.input_dim = input_dim
        self.num_filter = num_filter
        self.sr = sr
        self.exp = exp
        self.filter_fix = filter_fix

        trainable = not filter_fix

        # Frequency grid, repeated for every filter: shape (num_filter, input_dim), never trained.
        freq_axis = torch.from_numpy(np.linspace(0, self.sr / 2, input_dim))
        self.input_freq = nn.Parameter(freq_axis.expand(num_filter, input_dim).float(),
                                       requires_grad=False)

        # Edges are evenly spaced after hz2mel, then mapped back with mel2hz.
        edges = mel2hz(np.linspace(0, hz2mel(sr / 2), num_filter + 2))
        lower_edges = edges[:-2]
        # Each band spans from edge i to edge i + 2.
        band_widths = edges[2:] - edges[:-2]

        self.bandwidth_low = nn.Parameter(torch.from_numpy(lower_edges).float().reshape(num_filter, 1),
                                          requires_grad=trainable)

        self.bandwidth = nn.Parameter(torch.from_numpy(band_widths).float().reshape(num_filter, 1),
                                      requires_grad=trainable)
Пример #3
0
    def __init__(self, input_dim, sr, num_filter, exp=False, filter_fix=False):
        """Create the frequency grid plus per-filter centers and side bandwidths.

        :param input_dim: number of points in the frequency grid (0 .. sr / 2).
        :param sr: sample rate; the grid and the center points span up to ``sr / 2``.
        :param num_filter: number of filters; ``num_filter + 2`` points are generated
            and the two outermost ones only serve as neighbours.
        :param exp: stored on the instance; not used inside this constructor.
        :param filter_fix: when true, centers and bandwidths are created with
            ``requires_grad=False``.
        """
        super(fBLayer, self).__init__()
        self.input_dim = input_dim
        self.num_filter = num_filter
        self.sr = sr
        self.exp = exp
        self.filter_fix = filter_fix

        trainable = not filter_fix

        # Frequency grid, repeated for every filter: shape (num_filter, input_dim), never trained.
        freq_axis = torch.from_numpy(np.linspace(0, self.sr / 2, input_dim))
        self.input_freq = nn.Parameter(freq_axis.expand(num_filter, input_dim).float(),
                                       requires_grad=False)

        # Points are evenly spaced after hz2mel, then mapped back with mel2hz.
        points = mel2hz(np.linspace(0, hz2mel(sr / 2), num_filter + 2))
        gaps = np.diff(points)  # distance between consecutive points
        inner_points = points[1:-1]

        self.frequency_center = nn.Parameter(torch.from_numpy(inner_points).float().reshape(num_filter, 1),
                                             requires_grad=trainable)

        # Gap to the previous point is the left width, gap to the next point the right width.
        self.bandwidth_left = nn.Parameter(torch.from_numpy(gaps[:-1]).float().reshape(num_filter, 1),
                                           requires_grad=trainable)
        self.bandwidth_right = nn.Parameter(torch.from_numpy(gaps[1:]).float().reshape(num_filter, 1),
                                            requires_grad=trainable)
def read_wav(wav_path, feature_type='logmelfbank', batch_size=1):
    """Read a wav file and convert it to MFCC or log mel filterbank features.

    First- and second-order deltas are appended to the base features, the
    result is replicated ``batch_size`` times and normalised with the global
    mean and standard deviation.

    Args:
        wav_path: path to a wav file
        feature_type: `'logmelfbank'` or `'mfcc'`
        batch_size: number of copies of the utterance stacked along axis 0
    Returns:
        inputs: array of shape `[batch_size, max_time, feature_dim]`
        inputs_seq_len: list of length `batch_size`, each entry the frame count
    Raises:
        ValueError: if `feature_type` is not one of the supported values
    """
    # Validate before touching the file so a typo fails fast and clearly,
    # instead of surfacing later as an unbound `features` variable.
    if feature_type not in ('mfcc', 'logmelfbank'):
        raise ValueError(
            "feature_type must be 'mfcc' or 'logmelfbank', got %r" % (feature_type,))

    # Load wav file
    fs, audio = scipy.io.wavfile.read(wav_path)

    if feature_type == 'mfcc':
        features = mfcc(audio, samplerate=fs)
    else:
        # Pass the real sample rate, as the mfcc branch does; otherwise the
        # callee's default rate is silently assumed.
        fbank_features, energy = fbank(audio, samplerate=fs, nfilt=40)
        logfbank = np.log(fbank_features)
        logenergy = np.log(energy)
        # NOTE(review): hz2mel is applied to log energies, not to frequencies.
        # This looks unintended, but removing it changes the feature values,
        # so it is kept until confirmed against any trained models.
        logmelfbank = hz2mel(logfbank)
        features = np.c_[logmelfbank, logenergy]  # 40 filterbank columns + 1 energy column

    delta1 = delta(features, N=2)
    delta2 = delta(delta1, N=2)
    input_data = np.c_[features, delta1, delta2]  # static + delta + delta-delta

    # Transform to a 3D array by broadcasting the utterance over the batch axis.
    inputs = np.zeros((batch_size,) + input_data.shape)
    inputs[:] = input_data
    inputs_seq_len = [inputs.shape[1]] * batch_size

    # Normalization (global mean / std over the whole batch)
    inputs = (inputs - np.mean(inputs)) / np.std(inputs)

    return inputs, inputs_seq_len
Пример #5
0
def get_filterbanks(nfilt=20,
                    nfft=512,
                    samplerate=16000,
                    lowfreq=0,
                    highfreq=None,
                    filtertype='mel',
                    multi_weight=False):
    """Compute a triangular filterbank. The filters are stored in the rows, the columns
    correspond to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)

    :param nfilt: the number of filters in the filterbank, default 20.
    :param nfft: the FFT size. Default is 512.
    :param samplerate: the samplerate of the signal we are working with. Affects filter spacing.
    :param lowfreq: lowest band edge of the filters, default 0 Hz
    :param highfreq: highest band edge of the filters, default samplerate/2
    :param filtertype: how the band edges are placed: 'mel', 'amel', 'linear', or a
        'dnn...' variant ('dnn.vox1' or a name ending in 'timit.fix', 'timit.var',
        'timit.mdv', 'libri.fix', 'libri.var', 'vox1.soft') that places the edges at
        equal-mass points of a stored weight curve.
    :param multi_weight: if true, scale the filterbank columns by ``c.TIMIT_FIlTER_VAR``
        normalised by its maximum (assumes that curve has nfft/2 + 1 entries -- TODO confirm).
    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
    :raises ValueError: if ``filtertype`` is not one of the supported values.
    """

    highfreq = highfreq or samplerate / 2
    assert highfreq <= samplerate / 2, "highfreq is greater than samplerate/2"

    n_bins = nfft // 2 + 1

    if filtertype == 'mel':
        # compute points evenly spaced in mels
        melpoints = np.linspace(hz2mel(lowfreq), hz2mel(highfreq), nfilt + 2)
        # our points are in Hz, but we use fft bins, so we have to convert from Hz to fft bin number
        edges = np.floor((nfft + 1) * mel2hz(melpoints) / samplerate)
    elif filtertype == 'amel':
        # same construction with the hz2amel / amel2hz pair
        melpoints = np.linspace(hz2amel(lowfreq), hz2amel(highfreq), nfilt + 2)
        edges = np.floor((nfft + 1) * amel2hz(melpoints) / samplerate)
    elif filtertype == 'linear':
        linearpoints = np.linspace(lowfreq, highfreq, nfilt + 2)
        edges = np.floor((nfft + 1) * linearpoints / samplerate)
    elif filtertype.startswith('dnn'):
        if filtertype.endswith('timit.fix'):
            y = np.array(c.TIMIT_FIlTER_FIX)
        elif filtertype.endswith('timit.var'):
            y = np.array(c.TIMIT_FIlTER_VAR)
        elif filtertype.endswith('timit.mdv'):
            y = np.array(c.TIMIT_FIlTER_MDV)
        elif filtertype.endswith('libri.fix'):
            y = np.array(c.LIBRI_FILTER_FIX)
        elif filtertype.endswith('libri.var'):
            y = np.array(c.LIBRI_FILTER_VAR)
        elif filtertype.endswith('vox1.soft'):
            y = np.array(c.VOX_FILTER_SOFT)
        elif filtertype == 'dnn.vox1':
            y = np.array(c.VOX_FILTER)
        else:
            raise ValueError("unknown dnn filtertype: %r" % (filtertype,))

        # The stored curves are sampled on 161 points over 0..samplerate/2;
        # resample them onto this call's fft-bin grid.
        x = np.arange(0, 161) * samplerate / 2 / 160
        f = interpolate.interp1d(x, y)
        x_new = np.arange(n_bins) * samplerate / 2 / (nfft // 2)
        lowfreq_idx = np.where(x_new >= lowfreq)[0]
        highfreq_idx = np.where(x_new <= highfreq)[0]
        ynew = f(x_new)  # interpolated curve

        # Zero the curve outside [lowfreq, highfreq].
        ynew[:int(lowfreq_idx[0])] = 0
        if highfreq_idx[-1] < len(x_new) - 1:
            # Index with the bin index array; `highfreq` itself is a scalar.
            ynew[int(highfreq_idx[-1]) + 1:] = 0

        weight = ynew / np.sum(ynew)
        # Running sum computed once instead of being re-accumulated per filter.
        cumulative = np.cumsum(weight)

        edges = [lowfreq_idx[0]]
        for j in range(nfilt):
            # Edge j sits one bin before the cumulative weight first exceeds
            # the (j + 1)-th of nfilt + 1 equal shares.
            above = np.flatnonzero(cumulative > (j + 1) / (nfilt + 1))
            if above.size:
                edges.append(int(above[0]) - 1)
        edges.append(highfreq_idx[-1])
    else:
        raise ValueError("unknown filtertype: %r" % (filtertype,))

    fbank = np.zeros([nfilt, n_bins])
    for j in range(nfilt):
        left, center, right = edges[j], edges[j + 1], edges[j + 2]
        # rising slope from the left edge up to the center
        for i in range(int(left), int(center)):
            fbank[j, i] = (i - left) / (center - left)
        # falling slope from the center down to the right edge
        for i in range(int(center), int(right)):
            fbank[j, i] = (right - i) / (right - center)

    if multi_weight:
        y = np.array(c.TIMIT_FIlTER_VAR)
        fbank = fbank * (y / y.max())

    return fbank
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from python_speech_features import get_filterbanks, hz2mel
# Render figure text through LaTeX and use a serif font family.
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Build a 7-filter bank for a 512-point FFT, sample rate 16000, band edges 0..8000.
nfilt, nfft, samplerate, lowfreq, highfreq = 7, 512, 16000, 0, 8000
fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)

# Two side-by-side axes and a 7-colour cubehelix palette.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
colors = sns.cubehelix_palette(7, start=2, rot=0, dark=0.1, light=.7)

# hz2mel evaluated at every integer from 0 to 8000 inclusive.
x = np.arange(0, 8001, 1)
y = [hz2mel(i) for i in x]

# Red marker at (1000, 1000) on the left axes, with dashed guide lines running
# from the marker to each axis.
# NOTE(review): presumably this highlights the 1000 Hz ~ 1000 mel point on an
# x-vs-y curve drawn on ax1; that plot call is not visible in this excerpt -- confirm.
ax1.scatter(1000, 1000, s=30, color='red', alpha=0.9)
ax1.vlines(1000,
           ymin=0,
           ymax=1000,
           alpha=0.8,
           color='red',
           linestyle='--',
           linewidth=1)
ax1.hlines(1000,
           xmin=0,
           xmax=1000,
           alpha=0.8,
           color='red',
           linestyle='--',
           linewidth=1)
Пример #7
0
# Cross-check of two implementations, `psf` and `csf`, on the same `audio`.
# `psf_feat`, `error2d`, `get_error` and `acceptable_error` come from earlier
# in the script (outside this excerpt).
#
# print is called with a single parenthesised argument so the script produces
# the same output on Python 2 and Python 3.

# logfbank: shapes must match, then report the element-wise error.
csf_feat = csf.logfbank(audio)
assert (np.shape(psf_feat) == np.shape(csf_feat))
error2d(psf_feat, csf_feat)

print('')
print('ssc')
print('===')
psf_ssc = psf.ssc(audio)
csf_ssc = csf.ssc(audio)
assert (np.shape(psf_ssc) == np.shape(csf_ssc))
error2d(psf_ssc, csf_ssc)

print('')
print('hz2mel')
print('======')
# Two psf-vs-csf comparisons, then a csf round trip hz -> mel -> hz.
assert (get_error(psf.hz2mel(8000), csf.hz2mel(8000)) <= acceptable_error)
assert (get_error(psf.hz2mel(16000), csf.hz2mel(16000)) <= acceptable_error)
assert (get_error(csf.mel2hz(csf.hz2mel(8000)), 8000) <= acceptable_error)
print(' ✓')

print('')
print('mel2hz')
print('======')
# Two psf-vs-csf comparisons, then a csf round trip mel -> hz -> mel.
assert (get_error(psf.mel2hz(2595), csf.mel2hz(2595)) <= acceptable_error)
# Compare psf against csf here as well; comparing csf with itself can never fail.
assert (get_error(psf.mel2hz(5190), csf.mel2hz(5190)) <= acceptable_error)
assert (get_error(csf.hz2mel(csf.mel2hz(2595)), 2595) <= acceptable_error)
print(' ✓')

print('')
print('get_filterbanks')
print('===============')
Пример #8
0
def getmelpoint(_n_filt=N_FILT):
    """Return ``_n_filt`` mel-spaced points up to ``SAMPLING_RATE / 2``, converted with ``mel2hz``.

    ``_n_filt + 1`` points are spaced evenly between ``hz2mel(0)`` and
    ``hz2mel(SAMPLING_RATE / 2)``; the first one is dropped from the result.
    """
    mel_start = hz2mel(0)
    mel_stop = hz2mel(SAMPLING_RATE / 2)
    mel_grid = np.linspace(mel_start, mel_stop, _n_filt + 1)
    hz_grid = mel2hz(mel_grid)
    # Skip index 0, the point for hz2mel(0).
    return hz_grid[1:_n_filt + 1]
Пример #9
0
        self.input_dim = input_dim
        self.num_filter = num_filter
        self.sr = sr
<<<<<<< HEAD

=======
        self.exp = exp
        self.filter_fix = filter_fix

        requires_grad = not filter_fix
>>>>>>> Server/Server
        input_freq = np.linspace(0, self.sr / 2, input_dim)
        self.input_freq = nn.Parameter(torch.from_numpy(input_freq).expand(num_filter, input_dim).float(),
                                       requires_grad=False)

        centers = np.linspace(0, hz2mel(sr / 2), num_filter + 2)
        centers = mel2hz(centers)
<<<<<<< HEAD
        self.frequency_center = nn.Parameter(torch.from_numpy(centers[1:-1]).float().reshape(num_filter, 1))
=======
        self.frequency_center = nn.Parameter(torch.from_numpy(centers[1:-1]).float().reshape(num_filter, 1),
                                             requires_grad=requires_grad)
>>>>>>> Server/Server

        bandwidth = []
        for i in range(2, len(centers)):
            bandwidth.append(centers[i] - centers[i - 1])
<<<<<<< HEAD
        self.bandwidth = nn.Parameter(torch.tensor(bandwidth).reshape(num_filter, 1).float())
        self.gain = nn.Parameter(torch.ones(num_filter, dtype=torch.float32).reshape(num_filter, 1))