Exemplo n.º 1
0
 def __init__(self, n_fft=2048, hop_length=1024, n_mels=128, n_mfcc=40,
              norm='ortho', sample_rate=16000, f_min=40, f_max=7600,
              pad_end=True, center=False):
     """
     Set up the mel-spectrogram front end and the DCT matrix buffer.

     A ``MelSpec`` (constructed with ``power=2``) is stored on
     ``self.melspec`` and the matrix returned by ``create_dct`` is
     registered as the ``dct_mat`` buffer.

     The original note on this constructor reads "uses log mels"; the
     log step itself is not performed here.
     """
     super().__init__()
     self.norm = norm
     self.n_mfcc = n_mfcc

     # Options MelSpec receives by keyword; the first four stay positional.
     mel_options = dict(power=2,
                        f_min=f_min,
                        f_max=f_max,
                        pad_end=pad_end,
                        center=center)
     self.melspec = MelSpec(n_fft, hop_length, n_mels, sample_rate,
                            **mel_options)

     # The mel-bin count is read back from the MelSpec instance.
     self.register_buffer(
         'dct_mat',
         create_dct(self.n_mfcc, self.melspec.n_mels, self.norm))
Exemplo n.º 2
0
    def __init__(self,
                 sample_rate: int = 16000,
                 n_mfcc: int = 40,
                 dct_type: int = 2,
                 norm: str = 'ortho',
                 log_mels: bool = False,
                 melkwargs: Optional[dict] = None) -> None:
        """Configure the MFCC transform.

        Stores the settings, builds the ``AmplitudeToDB`` and
        ``MelSpectrogram`` sub-transforms, and registers the DCT matrix
        as the ``dct_mat`` buffer.

        Args:
            sample_rate: Forwarded to ``MelSpectrogram``.
            n_mfcc: Number of coefficients; must not exceed the
                ``n_mels`` of the mel-spectrogram transform.
            dct_type: Only ``2`` is accepted.
            norm: Passed to ``F.create_dct``.
            log_mels: Stored on the instance as ``self.log_mels``.
            melkwargs: Extra keyword arguments for ``MelSpectrogram``.

        Raises:
            ValueError: If ``dct_type`` is unsupported, or ``n_mfcc`` is
                larger than the number of mel bins.
        """
        super(MFCC, self).__init__()
        if dct_type not in (2,):
            raise ValueError('DCT type not supported: {}'.format(dct_type))

        self.sample_rate, self.n_mfcc = sample_rate, n_mfcc
        self.dct_type, self.norm = dct_type, norm
        self.top_db = 80.0
        self.amplitude_to_DB = AmplitudeToDB('power', self.top_db)

        # No melkwargs means "no extra options" for MelSpectrogram.
        extra_mel_options = {} if melkwargs is None else melkwargs
        self.MelSpectrogram = MelSpectrogram(sample_rate=self.sample_rate,
                                             **extra_mel_options)

        mel_bins = self.MelSpectrogram.n_mels
        if self.n_mfcc > mel_bins:
            raise ValueError(
                'Cannot select more MFCC coefficients than # mel bins')
        self.register_buffer(
            'dct_mat', F.create_dct(self.n_mfcc, mel_bins, self.norm))
        self.log_mels = log_mels
Exemplo n.º 3
0
 def __init__(self, sample_rate: int, mel_size: int, n_fft: int, win_length: int, n_mfcc: int,
              hop_length: int, min_db: float, max_db: float,
              mel_min: float = 0., mel_max: float = None, norm: str = 'ortho'):
     """
     Build the log-mel front end and register the transposed DCT matrix.

     ``self.mel_func`` is a ``LogMelSpectrogram`` constructed from the
     spectrogram settings; ``dct_mat`` is the result of
     ``audio_func.create_dct(n_mfcc, mel_size, norm)`` with its two
     dimensions swapped.
     """
     super().__init__()
     self.n_mfcc = n_mfcc

     # Positional order expected by LogMelSpectrogram.
     mel_args = (sample_rate, mel_size, n_fft, win_length, hop_length,
                 min_db, max_db, mel_min, mel_max)
     self.mel_func = LogMelSpectrogram(*mel_args)

     dct = audio_func.create_dct(n_mfcc, mel_size, norm)
     self.register_buffer('dct_mat', dct.transpose(0, 1))
Exemplo n.º 4
0
    def test_torchscript_create_dct(self):
        """Scripted ``F.create_dct`` must match the eager call for (40, 128, 'ortho')."""
        @torch.jit.script
        def jit_method(n_mfcc, n_mels, norm):
            # type: (int, int, Optional[str]) -> Tensor
            return F.create_dct(n_mfcc, n_mels, norm)

        # (n_mfcc, n_mels, norm) shared by both code paths.
        dct_args = (40, 128, 'ortho')

        scripted = jit_method(*dct_args)
        eager = F.create_dct(*dct_args)

        self.assertTrue(torch.allclose(scripted, eager))
Exemplo n.º 5
0
def mfcc(signal,
         samplerate=16000,
         winlen=0.025,
         hoplen=0.01,
         numcep=13,
         nfilt=26,
         nfft=None,
         lowfreq=0,
         highfreq=None,
         preemph=0.97,
         ceplifter=22,
         plusEnergy=True,
         dct=None,
         winfunc=lambda x: torch.ones((x, ), device=device)):
    """
    Compute MFCC from an audio signal.

    The filterbank features from ``fbank`` are log-compressed, multiplied
    by a DCT matrix, passed through ``lifter`` and, optionally, have their
    first column replaced by the log frame energy.

    :param signal: (time,)
    :param samplerate: forwarded to ``calculate_nfft`` and ``fbank``.
    :param winlen: forwarded to ``calculate_nfft`` and ``fbank``
        (presumably the window length in seconds -- confirm).
    :param hoplen: forwarded to ``fbank`` (presumably the hop length in
        seconds -- confirm).
    :param numcep: The number of cepstrum to retain.
    :param nfilt: number of filters; also the input size of the default
        DCT matrix.
    :param nfft: FFT size; when falsy it is derived with
        ``calculate_nfft(samplerate, winlen)``.
    :param lowfreq: forwarded to ``fbank``.
    :param highfreq: forwarded to ``fbank``.
    :param preemph: forwarded to ``fbank``.
    :param ceplifter: forwarded to ``lifter``.
    :param plusEnergy: when true, column 0 of the result is overwritten
        with ``log(energy)``.
    :param dct: optional precomputed DCT matrix that the log filterbank
        features are right-multiplied by. When ``None`` it is built with
        ``AF.create_dct(numcep, nfilt, norm='ortho')`` and moved to
        ``device``.
    :param winfunc: callable given a window length and returning the
        window; forwarded to ``fbank``.
    :return: (nframes, numcep)
    """
    nfft = nfft or calculate_nfft(samplerate, winlen)
    feat, energy = fbank(signal, samplerate, winlen, hoplen, nfilt, nfft,
                         lowfreq, highfreq, preemph, winfunc)
    feat = torch.log(feat)
    # Compare against None explicitly: truth-testing a tensor with more
    # than one element raises, so ``not dct`` rejected any real matrix.
    if dct is None:
        dct = AF.create_dct(numcep, nfilt, norm='ortho').to(device)
    feat = feat.mm(dct)
    feat = lifter(feat, ceplifter)
    if plusEnergy:
        feat[:, 0] = torch.log(energy)
    return feat
 def func(_):
     """Return ``F.create_dct`` for 40 coefficients, 128 mel bins, ortho norm.

     The single positional argument is ignored.
     """
     return F.create_dct(40, 128, "ortho")
Exemplo n.º 7
0
# Flat script that sets up the pieces of an MFCC computation by hand.
# Settings used below.
sample_rate = 16000
n_mfcc = 40
dct_type = 2
norm = 'ortho'
log_mels = False

# NOTE(review): the next four lines assign each name to itself and have no
# effect; they look like leftovers from adapting a constructor -- confirm.
sample_rate = sample_rate
n_mfcc = n_mfcc
dct_type = dct_type
norm = norm
top_db = 80.0
amplitude_to_DB = torchaudio.transforms.AmplitudeToDB('power', top_db)
MelSpectrogram = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)

# DCT matrix sized from the mel transform's own n_mels.
dct_mat = F.create_dct(n_mfcc, MelSpectrogram.n_mels, norm)

# Reference result: a default-configured MFCC transform applied to
# sin(1), ..., sin(999) as a float tensor.
tm = torchaudio.transforms.MFCC()
a = tm(torch.tensor(np.sin(np.arange(1, 1000)), dtype=torch.float))

# NOTE(review): `r` is not defined in this snippet; presumably an external
# session object the result is exported to -- confirm.
r.a = a

# pack batch: flatten every leading dimension so the input is 2-D, keeping
# the original shape in `shape`.
# NOTE(review): `waveform` is not defined in this snippet.
shape = waveform.size()
waveform = waveform.reshape(-1, shape[-1])

mel_specgram = MelSpectrogram(waveform)
if log_mels:
    log_offset = 1e-6
    mel_specgram = torch.log(mel_specgram + log_offset)
else:
Exemplo n.º 8
0
 # Thin wrapper around F.create_dct; the `# type:` comment inside the body
 # declares the signature (int, int, Optional[str]) -> Tensor.
 def jit_method(n_mfcc, n_mels, norm):
     # type: (int, int, Optional[str]) -> Tensor
     return F.create_dct(n_mfcc, n_mels, norm)
Exemplo n.º 9
0
    # Stack groups of m consecutive rows of `inputs`, advancing n rows per
    # output frame; the result has ceil(T / n) rows.
    LFR_inputs = []
    T = inputs.shape[0]  # number of input frames (rows)
    T_lfr = int(np.ceil(T / n))  # number of output frames
    for i in range(T_lfr):
        # Enough rows remain from position i * n to take a full group of m.
        if m <= T - i * n:
            LFR_inputs.append(np.hstack(inputs[i * n:i * n + m]))
        else:  # process last LFR frame
            # Fewer than m rows are left: take what remains and repeat the
            # final input row until the group holds m rows.
            num_padding = m - (T - i * n)
            frame = np.hstack(inputs[i * n:])
            for _ in range(num_padding):
                frame = np.hstack((frame, inputs[-1]))
            LFR_inputs.append(frame)
    # One row per output frame.
    return np.vstack(LFR_inputs)


# Module-level DCT matrix from F.create_dct(48, 80, 'ortho'); the printed
# banner describes it as MFCC 80 -> 48.
dct_mat = F.create_dct(48, 80, 'ortho')
print('MFCC 80 -> 48 LFR')


def build_LFR_features(inputs, m, n):
    """
    Stack and skip frames.

    If m = 1 and n = 1, the original features are returned.
    If m = 1 and n > 1, it works like frame skipping.
    If m > 1 and n = 1, it works like frame stacking, but only supports
    right-context frames.
    If m > 1 and n > 1, it works like LFR.

    Args:
        inputs: T x D np.ndarray
        m: number of frames to stack
        n: number of frames to skip
    """
Exemplo n.º 10
0
 def __init__(self, n_mfcc: int, mel_size: int, norm: str = 'ortho'):
     """
     Store ``n_mfcc`` and register the transposed DCT matrix as ``dct_mat``.

     The matrix comes from ``audio_func.create_dct(n_mfcc, mel_size, norm)``
     with its two dimensions swapped.
     """
     super().__init__()
     self.n_mfcc = n_mfcc
     self.register_buffer(
         'dct_mat',
         audio_func.create_dct(n_mfcc, mel_size, norm).transpose(0, 1),
     )