def centerlized_mfcc(lowFreq=0, middleFreq=4800, highFreq=8000):
    """Build a 26-filter bank from a mirrored low bank and a shifted high bank.

    Two banks are requested from ``get_filterbanks`` (nfft=512, 16 kHz):

    * 15 filters spanning 0..``middleFreq``; the first 150 bins of each filter
      are reversed and the remaining bins are zero.
    * 11 filters spanning 0..(``highFreq`` - ``middleFreq``); the first 111
      bins of each filter are placed after 146 leading zero bins.

    Args:
        lowFreq: Accepted for interface compatibility; not used by the body.
        middleFreq: Upper edge (Hz) passed to the first bank.
        highFreq: Together with ``middleFreq`` sets the span of the second bank.

    Returns:
        Array with 26 rows (15 mirrored + 11 shifted filters) and 257 columns.
    """
    # TODO: Make the numbers more general instead of the hardcoded that they are
    low_bank = get_filterbanks(nfilt=15, nfft=512, samplerate=16000,
                               lowfreq=0, highfreq=middleFreq)
    high_bank = get_filterbanks(nfilt=11, nfft=512, samplerate=16000,
                                lowfreq=0, highfreq=(highFreq - middleFreq))

    # Mirror bins 0..149 of every low filter; bins 150..256 stay zero.
    mirrored = numpy.zeros((len(low_bank), 257))
    mirrored[:, :150] = low_bank[:, 149::-1]

    # Shift bins 0..110 of every high filter to start at bin 146.
    shifted = numpy.zeros((len(high_bank), 257))
    shifted[:, 146:] = high_bank[:, :111]

    return numpy.vstack((mirrored, shifted))
def mel_filterbank_callback(testing, map, iteration, context):
    """Check a mel filterbank map against python_speech_features.

    The reference output is the dot product of a ramp signal (0, 1, 2, ...)
    with the transposed filterbank returned by ``get_filterbanks``. Both the
    interpreted map and its compiled form are compared against it.

    Args:
        testing: Object whose ``ProcessTest(name, passed)`` records results.
        map: Map under test; must provide ``Compute`` and ``Compile``.
        iteration: Iteration index, used only in the reported test names.
        context: Tuple ``(size, num_filters, sample_rate)``.

    Returns:
        The compiled map's output, or ``None`` when python_speech_features
        cannot be imported (the test is skipped with a printed notice).
    """
    size, num_filters, sample_rate = context
    try:
        from python_speech_features import get_filterbanks
    except ImportError:
        # Only a missing dependency should skip the test; anything else is a
        # real failure and must propagate.
        print("### skiping test_mel_filterbank because 'python_speech_features' module is not available")
        return None

    fbanks = get_filterbanks(num_filters, size, sample_rate)

    # np.float was removed from NumPy; float64 is the dtype it aliased.
    signal = np.arange(size, dtype=np.float64)
    # The filterbank has fewer columns than the signal has samples, so only
    # the leading samples take part in the reference computation.
    chopped = signal[0:fbanks.shape[1]]
    expected = np.dot(chopped, fbanks.T)

    output = map.Compute(signal)
    testing.ProcessTest("test_mel_filterbank compute iteration {}".format(iteration),
                        np.allclose(output, expected))

    compiler_settings = ell.model.MapCompilerOptions()
    compiler_settings.useBlas = False  # not resolvable on our Linux test machines...
    optimizer_options = ell.model.ModelOptimizerOptions()
    compiled_map = map.Compile("host", "hammingtest", "predict", compiler_settings, optimizer_options)

    compiled_output = compiled_map.Compute(signal)
    testing.ProcessTest("test_mel_filterbank compiled iteration {}".format(iteration),
                        np.allclose(compiled_output, expected))
    return compiled_output
def mel_bankm(fs, nfft, mel_num, fmin=0.0, fmax=None):
    """Return the filterbank produced by ``get_filterbanks`` for these settings.

    Args:
        fs: Sample rate, forwarded as ``samplerate``.
        nfft: FFT size, forwarded as ``nfft``.
        mel_num: Number of filters, forwarded as ``nfilt``.
        fmin: Lower band edge, forwarded as ``lowfreq``.
        fmax: Upper band edge, forwarded as ``highfreq`` (``None`` is passed
            through unchanged).

    Returns:
        Whatever ``get_filterbanks`` returns for the given arguments.
    """
    # Alternative kept for reference:
    # filters.mel(sr=fs, n_fft=nfft, n_mels=mel_num, fmin=fmin, fmax=fmax, norm=None)
    return get_filterbanks(
        nfilt=mel_num,
        nfft=nfft,
        samplerate=fs,
        lowfreq=fmin,
        highfreq=fmax,
    )
def filterbanks(sample_rate, nfilt, nfft, fft_bins_2_freq):
    """Plot every filter of a filterbank on one figure and show it.

    Args:
        sample_rate: Sample rate passed to ``get_filterbanks``.
        nfilt: Number of filters to build.
        nfft: FFT size passed to ``get_filterbanks``.
        fft_bins_2_freq: X coordinates used for every filter curve
            (presumably the frequency of each FFT bin - confirm with caller).
    """
    bank = get_filterbanks(nfilt, nfft, sample_rate)

    # One curve per filter, all drawn on the current axes.
    for response in bank:
        plt.plot(fft_bins_2_freq, response)

    plt.ylabel('Skaalausarvo')
    plt.xlabel('Taajuus (Hz)')
    plt.show()
def fbank_from_complex_spec(complex_spec, nfilt=64, nfft=512, sample_rate=16000):
    """Compute filterbank energies from a complex spectrum.

    Args:
        complex_spec: Complex spectrum whose last axis matches the number of
            filterbank columns (it is dotted with the transposed filterbank).
        nfilt: Number of filters requested from ``get_filterbanks``.
        nfft: FFT size; also used to scale the power spectrum by ``1 / nfft``.
        sample_rate: Sample rate passed to ``get_filterbanks``.

    Returns:
        float32 array of filterbank energies. Exact zeros are replaced by the
        float64 machine epsilon so that a later ``log`` stays finite.
    """
    import python_speech_features

    # Power is the squared magnitude |X|^2. Squaring the complex value and
    # taking the real part yields Re(X)^2 - Im(X)^2, which is not a power and
    # can be negative.
    power = (1.0 / nfft) * np.square(np.abs(complex_spec))

    fb = python_speech_features.get_filterbanks(nfilt, nfft, sample_rate)
    feat = np.dot(power, fb.T)
    feat = np.where(feat == 0, np.finfo(float).eps, feat)
    return feat.astype('float32')
def mfcc(frames, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13,
         nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97, ceplifter=22,
         appendEnergy=True):
    """Compute MFCC features from already-framed audio.

    Args:
        frames: Framed signal handed directly to ``sigproc.powspec``.
        samplerate: Sample rate used to build the filterbank.
        winlen, winstep, preemph: Accepted for interface compatibility; the
            body does not use them.
        numcep: Number of leading cepstral coefficients to keep.
        nfilt: Number of filters in the filterbank.
        nfft: FFT size for the power spectrum and the filterbank.
        lowfreq, highfreq: Band edges forwarded to ``get_filterbanks``.
        ceplifter: Lifter coefficient forwarded to ``lifter``.
        appendEnergy: When true, column 0 is overwritten with the log of the
            total frame energy.

    Returns:
        2-D array with one row per frame and the first ``numcep`` cepstral
        coefficients as columns.
    """
    eps = numpy.finfo(float).eps
    power_spec = sigproc.powspec(frames, nfft)

    # Total energy per frame; zeros are replaced so the log below is finite.
    frame_energy = numpy.sum(power_spec, 1)
    frame_energy = numpy.where(frame_energy == 0, eps, frame_energy)

    # Filterbank energies, guarded against zeros for the same reason.
    bank = python_speech_features.get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power_spec, bank.T)
    band_energy = numpy.where(band_energy == 0, eps, band_energy)

    cepstra = dct(numpy.log(band_energy), type=2, axis=1, norm='ortho')[:, :numcep]
    cepstra = python_speech_features.lifter(cepstra, ceplifter)

    if appendEnergy:
        # Replace the first cepstral coefficient with the log frame energy.
        cepstra[:, 0] = numpy.log(frame_energy)
    return cepstra
def _transform(self, spectrogram):
    """Transform STFT features into log mel-frequency filterbank features.

    Freely adapted from python_speech_features ``logfbank``.

    Args:
        spectrogram: STFT array with time on axis 0 and frequency on axis 1.
            Any further axes (e.g. several spectrograms at once) are carried
            through unchanged.

    Returns:
        Log filterbank energies in the same axis order as the input. When
        ``self.diff_features`` is set, the first and second differences along
        axis 0 (zero-padded at the start) are concatenated onto axis 1.
    """
    spec_dim = len(spectrogram.shape)

    # np.dot contracts the last axis, so time and frequency are moved to the
    # end of the axis list when there are extra leading dimensions.
    if spec_dim > 2:
        new_shape = [*range(2, spec_dim), 0, 1]
        spectrogram = np.transpose(spectrogram, new_shape)

    fb = psf.get_filterbanks(self.num_filters, self.num_fft,
                             self.sample_rate, 0, self.sample_rate / 2)
    mag_spec = np.absolute(spectrogram)
    pow_spec = 1.0 / self.num_fft * np.square(mag_spec)
    energies = np.dot(pow_spec, fb.T)  # compute the filterbank energies
    # Exact zeros would give -inf under the log.
    energies = np.where(energies == 0, np.finfo(float).eps, energies)
    log_energies = np.log(energies)

    # Move time and frequency back to the start of the axis list.
    if spec_dim > 2:
        old_shape = [spec_dim - 2, spec_dim - 1, *range(spec_dim - 2)]
        log_energies = np.transpose(log_energies, old_shape)

    if self.diff_features:
        # First difference along axis 0, zero-padded by one leading entry so
        # its length matches log_energies.
        first_diff = np.diff(log_energies, 1, axis=0)
        first_diff = np.concatenate(
            (np.zeros((1, *first_diff.shape[1:]), dtype=log_energies.dtype),
             first_diff),
            axis=0)

        # Second difference, zero-padded by two leading entries.
        second_diff = np.diff(log_energies, 2, axis=0)
        second_diff = np.concatenate(
            (np.zeros((2, *second_diff.shape[1:]), dtype=log_energies.dtype),
             second_diff),
            axis=0)

        # Concatenate in frequency.
        # This is a weird thing to do, not sure if it would be better to
        # have the differences on their own dimension.
        log_energies = np.concatenate(
            (log_energies, first_diff, second_diff), axis=1)

    return log_energies
from audio import get_features from audio import pitch_conversion from audio import mfe2sp from constants import MODEL from constants import SAMPLE_RATE from constants import EMB_FRAMES from constants import NUM_FBANKS from constants import embedder_model from constants import PREEMPH from constants import FRAME_PERIOD from constants import NFFT import embedder.embedding_model as embedding_model import embedder.embedder_utils as ut from audio import quantize fb = get_filterbanks(NUM_FBANKS, NFFT, SAMPLE_RATE, lowfreq=0, highfreq=None) filter_centers = fb.argmax(axis=-1) preemph_transform = np.abs(fft([1., -PREEMPH] + [0] * (NFFT - 2)))[:NFFT // 2 + 1]**2 def convert_voice(model, wav_s, wav_t, emb_s, emb_t): """Arguments: cvae - ACVAE model embedder - DeepSpeakerModel wav_s - source voice wav_t - target voice Returns: wav file with words from source voice, voice from target voice """ pic_dir = "../figure/"
# test power spectrum (the first one at least) powspec = psf.sigproc.powspec(frames, 2048) results['powspec'] = powspec[0].tolist() # test filterbank hz2mel = lambda hz: 2595 * np.log10(1 + hz / 700.) mel2hz = lambda mel: 700 * (10**(mel / 2595.0) - 1) highMel = hz2mel(sampleRate / 2) #print("highMel = %s" % highMel) melpoints = np.linspace(0, highMel, 26 + 2) results['bins'] = np.floor( (2048 + 1) * mel2hz(melpoints) / sampleRate).tolist() filterbank = psf.get_filterbanks(nfilt=26, nfft=2048, samplerate=sampleRate, lowfreq=0, highfreq=None) results['filters'] = filterbank.tolist() feat, energy = psf.fbank(pcm, sampleRate, mfccWinlen, mfccStepT, 26, 2048, 0, None, 0.97, winfunc) results['feat'] = feat.tolist() results['energy'] = energy.tolist() # test dct results['dct'] = dct(np.log(feat), type=2, axis=1, norm='ortho')[:, :mfccNceps].tolist() # test mfcc ceps = psf.mfcc(pcm, samplerate=sampleRate,
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from python_speech_features import get_filterbanks, hz2mel

# Render figure text through LaTeX, in a serif font.
plt.rc('text', usetex=True)
plt.rc('font', family='serif')

# Filterbank settings used for the figure.
nfilt = 7
nfft = 512
samplerate = 16000
lowfreq = 0
highfreq = 8000
fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)

# Two side-by-side panels and a seven-colour palette.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))
colors = sns.cubehelix_palette(7, start=2, rot=0, dark=0.1, light=.7)

# hz2mel evaluated at every integer from 0 to 8000 inclusive.
x = np.arange(8001)
y = list(map(hz2mel, x))

# Mark the point (1000, 1000) and drop dashed guide lines to both axes.
ax1.scatter(1000, 1000, s=30, color='red', alpha=0.9)
ax1.vlines(1000, ymin=0, ymax=1000,
           alpha=0.8, color='red', linestyle='--', linewidth=1)
ax1.hlines(1000, xmin=0, xmax=1000,
           alpha=0.8, color='red', linestyle='--', linewidth=1)
# Compare the reference implementation (psf) with the implementation under
# test (csf), section by section. Each section checks cross-implementation
# agreement and, where applicable, a round trip through the inverse function.
# print(...) with a single string argument behaves the same on Python 2 and 3.
assert (get_error(psf.hz2mel(16000), csf.hz2mel(16000)) <= acceptable_error)
assert (get_error(csf.mel2hz(csf.hz2mel(8000)), 8000) <= acceptable_error)
print(' ✓')
print('')

print('mel2hz')
print('======')
assert (get_error(psf.mel2hz(2595), csf.mel2hz(2595)) <= acceptable_error)
# Compare psf against csf: checking csf against itself could never fail.
assert (get_error(psf.mel2hz(5190), csf.mel2hz(5190)) <= acceptable_error)
assert (get_error(csf.hz2mel(csf.mel2hz(2595)), 2595) <= acceptable_error)
print(' ✓')
print('')

print('get_filterbanks')
print('===============')
psf_filterbanks = psf.get_filterbanks()
csf_filterbanks = csf.get_filterbanks()
assert (np.shape(psf_filterbanks) == np.shape(csf_filterbanks))
error2d(psf_filterbanks, csf_filterbanks)
print('')

print('lifter')
print('======')
psf_lifter = psf.lifter(psf_feat)
# csf receives the features as float32.
csf_lifter = csf.lifter(np.array(psf_feat, dtype=np.float32))
assert (np.shape(psf_lifter) == np.shape(csf_lifter))
error2d(psf_lifter, csf_lifter)
print('')

print('delta')
print('=====')
import argparse
import librosa
import numpy as np
from tqdm import tqdm
from os.path import join, isfile
from joblib import Parallel, delayed
from python_speech_features import get_filterbanks, sigproc

# Analysis settings shared by every job.
samplerate = 16000
nfft = 512
winlen = 0.025 * samplerate   # frame length in samples
winstep = 0.01 * samplerate   # frame step in samples
# 40-filter bank, transposed so a power spectrum can be right-multiplied by it.
banks = get_filterbanks(40, nfft, samplerate).transpose()


def job(input_name, output_name):
    """Extract log filterbank energies from one audio file and save them.

    Args:
        input_name: Path of the audio file, loaded as mono at ``samplerate``.
        output_name: Destination passed to ``np.save``.

    Returns:
        ``True`` when features were written; ``False`` when the audio or the
        framed signal is empty, or when the features sum to NaN (nothing is
        written in those cases).
    """
    waveform, _ = librosa.load(input_name, mono=True, sr=samplerate)
    if len(waveform) == 0:
        return False

    emphasized = sigproc.preemphasis(waveform, 0.97)
    frames = sigproc.framesig(emphasized, winlen, winstep, np.hanning)
    if len(frames) == 0:
        return False

    energies = np.dot(sigproc.powspec(frames, nfft), banks)
    # Exact zeros would give -inf under the log.
    energies = np.where(energies == 0, np.finfo(float).eps, energies)
    log_energies = np.log(energies).astype(dtype=np.float32)

    # A NaN sum means at least one feature value is unusable.
    if np.isnan(np.sum(log_energies)):
        return False

    np.save(output_name, log_energies)
    return True