def test_frame_sig(self):
    n = 10000124
    frame_len = 37
    frame_step = 13
    x = np.random.rand(n)
    t0 = time.time()
    y_old = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=False)
    t1 = time.time()
    y_new = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=True)
    t_new = time.time() - t1
    t_old = t1 - t0
    self.assertTupleEqual(y_old.shape, y_new.shape)
    np.testing.assert_array_equal(y_old, y_new)
    self.assertLess(t_new, t_old)
    print('new run time %3.2f < %3.2f sec' % (t_new, t_old))
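# A minimal sketch (an addition, not part of the test above) of what
# stride_trick=True does inside sigproc.framesig: rather than copying every
# frame, it builds an overlapping view with numpy's as_strided. Assumes the
# signal is a 1-D numpy array already long enough for at least one frame.
import numpy as np

def frames_via_strides(x, frame_len, frame_step):
    n_frames = 1 + (len(x) - frame_len) // frame_step
    stride = x.strides[0]
    return np.lib.stride_tricks.as_strided(
        x, shape=(n_frames, frame_len), strides=(frame_step * stride, stride))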
def get_fft_spectrum(filename, buckets):
    signal = load_wav(filename, c.SAMPLE_RATE)
    signal *= 2**15
    # print(filename)
    print(buckets)

    # pad with zeros until at least 101 frames fit
    while (len(signal) / (c.FRAME_STEP * c.SAMPLE_RATE) < 101):
        signal = np.append(signal, 0)

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    # print(len(fft))
    fft_norm = normalize_frames(fft.T)
    # print(len(fft_norm.T))

    # truncate to max bucket sizes
    rsize = max(k for k in buckets if k <= len(fft_norm.T))
    # print(rsize)
    rstart = int((len(fft_norm.T) - rsize) / 2)
    # print(rstart)
    out = fft_norm[:, rstart:rstart + rsize]
    # print(len(out))

    return out
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01, nfilt=26,
        nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: np.ones((x, ))):
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate, winfunc)
    pspec = sigproc.powspec(frames, nfft)
    pspec = np.where(pspec == 0, np.finfo(float).eps, pspec)  # if things are all zeros we get problems
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = np.dot(pspec, fb.T)  # compute the filterbank energies
    R = np.tile(np.linspace(1, samplerate / 2, np.size(pspec, 1)), (np.size(pspec, 0), 1))
    return np.dot(pspec * R, fb.T) / feat
def read_and_process_audio(filename, buckets):
    signal = read_audio(filename, c.SAMPLE_RATE)

    # # Filter out non-speech frequencies
    # lowcut, highcut = c.FILTER_RANGE
    # signal = butter_bandpass_filter(signal, lowcut, highcut, c.SAMPLE_RATE, 1)

    # # Normalize signal
    # signal = normalize(signal)

    signal *= 2**15

    # Process signal to get FFT spectrum
    signal = rm_dc_n_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)

    # Truncate to middle MAX_SEC seconds
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]

    return out
def get_fft_spectrum(filename, buckets):
    signal = load_wav(filename, c.SAMPLE_RATE)
    signal *= 2**15

    # get FFT spectrum (applying hamming)
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)  # TODO may remove variance normalization to see other poorer results

    # truncate to max bucket sizes
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]
    # print("fft_spectrum shape{}".format(out.shape))
    # if(out.shape[1] == c.MAX_SEC*100):
    #     save_fft_spectrum(filename,out)
    return out
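# Hedged usage sketch (an addition): how get_fft_spectrum above is typically
# driven. build_buckets and the c.* constants are assumed to come from the same
# project as the function; the wav path is hypothetical.
buckets = build_buckets(c.MAX_SEC, c.BUCKET_STEP, c.FRAME_STEP)
spec = get_fft_spectrum("enroll/speaker01.wav", buckets)  # (frequency bins, time frames)
batch = spec.reshape(1, *spec.shape, 1)                   # add batch and channel dims for a CNN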
def logspec(signal, samplerate, conf):
    '''
    Compute log magnitude spectrogram features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by numfreq) containing features. Each
        row holds 1 feature vector, a numpy vector containing the log magnitude
        spectrum of the corresponding frame
    '''
    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    winfunc = _get_winfunc(conf['winfunc'])
    frames = sigproc.framesig(signal, float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate, winfunc)
    logspec = sigproc.logmagspec(frames, int(conf['nfft']))

    return logspec
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01, nfilt=26,
          nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: np.ones((x, ))):
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate, winfunc)
    pspec = sigproc.powspec(frames, nfft)
    energy = np.sum(pspec, 1)  # this stores the total energy in each frame
    energy = np.where(energy == 0, np.finfo(float).eps, energy)  # if energy is zero, we get problems with log
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = np.dot(pspec, fb.T)  # compute the filterbank energies
    feat = np.where(feat == 0, np.finfo(float).eps, feat)  # if feat is zero, we get problems with log
    return feat, energy
def angspec(signal, samplerate, conf):
    """
    Compute angular spectrogram features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by numfreq) containing features. Each
        row holds 1 feature vector, a numpy vector containing the angular
        spectrum of the corresponding frame
    """
    raise BaseException('Not yet implemented')

    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    winfunc = _get_winfunc(conf['winfunc'])
    frames = sigproc.framesig(signal, float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate, winfunc)
    angspec = sigproc.angspec(frames, int(conf['nfft']))

    return angspec
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate)
    pspec = sigproc.powspec(frames, nfft)
    energy = numpy.sum(pspec, 1)  # this stores the total energy in each frame
    fb = get_filterbanks(nfilt, nfft, samplerate)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    return feat, energy
def spec_sub(signal):
    NFFT = 1024
    frames = sigproc.framesig(signal, 256, 128)
    print(frames.shape)
    cspec = np.fft.fft(frames, NFFT)
    pspec = abs(cspec)
    print(pspec.shape)
    pspec *= pspec
    phase = np.angle(cspec)
    noise_est = np.mean(pspec[40:50])  # noise power estimated from frames 40-50
    print(noise_est)
    clean_spec = pspec - noise_est
    print("1")
    # print(clean_spec)
    clean_spec[clean_spec < 0] = 0
    print("2")
    # print(clean_spec)
    clean_spec **= 0.5
    clean_spec = clean_spec * np.exp(1j * phase)  # reattach the noisy phase to the cleaned magnitude
    print("3")
    # print(clean_spec)
    reconstructed_frames = np.fft.ifft(clean_spec, NFFT)
    reconstructed_frames = np.real(reconstructed_frames)
    print(reconstructed_frames.shape)
    reconstructed_frames = reconstructed_frames[:, 0:256]  # keep only the original frame length
    print(reconstructed_frames.shape)
    # print(reconstructed_frames)
    enhanced_signal = sigproc.deframesig(reconstructed_frames, len(signal), 256, 128)
    # print(enhanced_signal)
    return enhanced_signal
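# Hedged usage sketch (an addition) for spec_sub() above: denoise a mono 16-bit
# wav read with scipy; the file names are hypothetical.
from scipy.io import wavfile
rate, noisy = wavfile.read("noisy.wav")
clean = spec_sub(noisy.astype(np.float64))
wavfile.write("clean.wav", rate, clean.astype(np.int16))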
def extract(filtered_audio, Fs):
    nfft = 256
    num_bins = 40
    start_frequency = 150
    end_frequency = 3200
    c = filtered_audio.shape[0]
    features = []
    for i in range(c):
        framed_audio = sigproc.framesig(filtered_audio[i], 256, 128)
        features.append([])
        j = framed_audio.shape[0]
        if j > 10:
            req_frames = framed_audio[int(j / 2) - 5:int(j / 2) + 5]
            print(req_frames)
            for k in range(len(req_frames)):
                peak_amp, peak_freq = sp_peak_amp_freq.peakFreq(req_frames[k], 50)
                pitch_periods = sp_pitch_period.pitch_period(req_frames[k], Fs)
                form = formants.formant(req_frames[k])
                cep = LPCC.lpcc(req_frames[k])
                real_cc = RCC.rcc(req_frames[k])
                lsfs = lsf.LSF(req_frames[k])
                hjorth_parameters = hjorth.params(req_frames[k])
                wavelet = dwt.wenergy(req_frames[k], 'db7', 5)
                features[i].extend(lsfs)
                features[i].extend(hjorth_parameters)
                features[i].extend(wavelet)
    return features
def RT_CNN():
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()  # Creates a VGGVox model
    model.load_weights(c.WEIGHTS_FILE)  # Load the weights of the trained models
    model.summary()  # Print a summary of the loaded model

    print("Loading embeddings from enroll")
    toLoad = load("data/model/RTSP_CNN.out")
    enroll_embs = []
    speakers = []
    for spk, embs in toLoad.items():
        for e in embs:
            enroll_embs.append(e)
            speakers.append(spk)
            print(spk)

    count = 0
    buffer = AudioBuffer()
    start_time = time.time()
    while count < 3:
        count += 1
        buffer.record(chunk_size=c.SAMPLE_RATE)
        data = buffer.get_data()
        data = np.frombuffer(data, 'int16').astype(np.float64)  # copy to a writable float array
        buckets = build_buckets(c.MAX_SEC, c.BUCKET_STEP, c.FRAME_STEP)
        data *= 2**15

        while (len(data) / (c.FRAME_STEP * c.SAMPLE_RATE) < 101):
            data = np.append(data, 0)

        # get FFT spectrum
        data = remove_dc_and_dither(data, c.SAMPLE_RATE)
        data = sigproc.preemphasis(data, coeff=c.PREEMPHASIS_ALPHA)
        frames = sigproc.framesig(data, frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                                  frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                                  winfunc=np.hamming)
        fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
        fft_norm = normalize_frames(fft.T)

        # truncate to max bucket sizes
        rsize = max(k for k in buckets if k <= len(fft_norm.T))
        rstart = int((len(fft_norm.T) - rsize) / 2)
        x = fft_norm[:, rstart:rstart + rsize]

        test_embs = np.squeeze(model.predict(x.reshape(1, *x.shape, 1)))
        distances = []
        for embs in enroll_embs:
            distances.append(euclidean(test_embs, embs))
        print(len(speakers))
        idx = np.argmin(distances)
        print(speakers[idx])
        print("Ok, ", time.time() - start_time - 3, " seconds")
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate)
    pspec = sigproc.powspec(frames, nfft)
    pspec = numpy.where(pspec == 0, numpy.finfo(float).eps, pspec)  # if things are all zeros we get problems
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1, samplerate / 2, numpy.size(pspec, 1)), (numpy.size(pspec, 0), 1))
    return numpy.dot(pspec * R, fb.T) / feat
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
        winfunc=lambda x: np.ones((x,))):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate, winfunc)
    pspec = sigproc.powspec(frames, nfft)
    pspec = np.where(pspec == 0, np.finfo(float).eps, pspec)  # if things are all zeros we get problems
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = np.dot(pspec, fb.T)  # compute the filterbank energies
    R = np.tile(np.linspace(1, samplerate / 2, np.size(pspec, 1)), (np.size(pspec, 0), 1))
    return np.dot(pspec * R, fb.T) / feat
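# Hedged usage sketch (an addition) for the ssc variant above: each row holds
# nfilt subband centroid frequencies, in Hz, for one 25 ms frame. sig is
# assumed to be a loaded 16 kHz waveform.
centroids = ssc(sig, samplerate=16000, winfunc=np.hamming)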
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate)
    pspec = sigproc.powspec(frames, nfft)
    fb = get_filterbanks(nfilt, nfft, samplerate)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1, samplerate / 2, numpy.size(pspec, 1)), (numpy.size(pspec, 0), 1))
    return numpy.dot(pspec * R, fb.T) / feat
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
          winfunc=lambda x: np.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the sample rate of the signal we are working with, in Hz.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate, winfunc)
    pspec = sigproc.powspec(frames, nfft)
    energy = np.sum(pspec, 1)  # this stores the total energy in each frame
    energy = np.where(energy == 0, np.finfo(float).eps, energy)  # if energy is zero, we get problems with log
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = np.dot(pspec, fb.T)  # compute the filterbank energies
    feat = np.where(feat == 0, np.finfo(float).eps, feat)  # if feat is zero, we get problems with log
    return feat, energy
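# Hedged usage sketch (an addition) for the fbank variant above: log-Mel
# features for a mono wav. scipy.io.wavfile and the file path are assumptions.
from scipy.io import wavfile
rate, sig = wavfile.read("speech.wav")  # hypothetical 16 kHz mono file
feat, energy = fbank(sig, samplerate=rate, winfunc=np.hamming)
log_fbank = np.log(feat)  # log filterbank energies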
def get_fft_spectrum(filename, start, end):
    signal = load_wav(filename, c.SAMPLE_RATE)
    signal *= 2**15

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)  # digital filter: remove DC offset and dither
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)  # apply pre-emphasis to the input signal
    frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)  # split the signal into overlapping frames
    # print("===================")
    # print(frames.shape)
    # print("===================")
    # exit(0)
    spem = sigproc.logpowspec(frames, c.NUM_FFT)  # compute the log power spectrogram
    # print("===================")
    # print(spem)
    # print("===================")
    # print(spem.shape)
    # print("===================")
    # exit(0)
    spem_norm = normalize_frames(spem.T)  # subtract the mean and divide by the standard deviation
    length = spem_norm.shape[1]
    reserve_length = length - (length % 100)
    # out = fft_norm[:,0:reserve_length]  # test
    out = spem_norm[:, start:end]  # train
    return out
def get_fft_spectrum(filename, buckets):
    # load the signal with librosa
    signal = load_wav(filename, c.SAMPLE_RATE)
    # scale the float signal up to the 16-bit integer range
    signal *= 2**15

    # get FFT spectrum
    # remove DC offset and dithering, then apply pre-emphasis
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    # split the signal into overlapping, Hamming-windowed frames for the FFT
    frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    # get fft spectrum
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    # normalize each frame by mean and std
    fft_norm = normalize_frames(fft.T)

    # truncate to max bucket sizes: pick the largest bucket that fits and
    # center-crop the spectrogram to that many frames
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]

    return out
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate)
    pspec = sigproc.powspec(frames, nfft)
    energy = numpy.sum(pspec, 1)  # this stores the total energy in each frame
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)  # if energy is zero, we get problems with log
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)  # if feat is zero, we get problems with log
    return feat, energy
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    print("preemph %s" % (preemph,))
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate)
    matchframes(frames[0], frames[1])
    pspec = sigproc.powspec(frames, nfft)
    energy = pylab.sum(pspec, 1)  # this stores the total energy in each frame
    energy = pylab.where(energy == 0, pylab.finfo(float).eps, energy)  # if energy is zero, we get problems with log
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    print("len(fb) %s" % (len(fb),))
    colour = "k-"
    for i in range(len(fb)):
        if colour == "k-":
            colour = "r-"
        else:
            colour = "k-"
        startedplot = False
        midpoint = 0
        for j in range(len(fb[i])):
            if fb[i][j] > 0:
                if startedplot == False:
                    startedplot = j
                if j > 0:
                    pylab.plot([j - 1, j], [fb[i][j - 1], fb[i][j]], colour)
                if fb[i][j] == 1.0:
                    midpoint = j
            else:
                if not startedplot == False:
                    pylab.plot([j - 1, j], [fb[i][j - 1], 0], colour)
                    try:
                        print("slope to midpoint %.3f, slope from midpoint %.3f" % (1.0 / float(midpoint - startedplot), 1.0 / float(midpoint - j + 1)))
                    except:
                        pass
                    break
    pylab.show()
    feat = pylab.dot(pspec, fb.T)  # compute the filterbank energies
    feat = pylab.where(feat == 0, pylab.finfo(float).eps, feat)  # if feat is zero, we get problems with log
    return feat, energy
def vad(sig, rate, winlen, winstep):
    '''do voice activity detection

    args:
        sig: the input signal as a numpy array
        rate: the sampling rate
        winlen: the window length
        winstep: the window step

    Returns:
        a numpy array of indices containing speech frames
    '''

    # apply preemphasis
    sig = sigproc.preemphasis(sig, 0.97)

    # do windowing
    frames = sigproc.framesig(sig, winlen * rate, winstep * rate)

    # compute the squared frames and center them around zero mean
    sqframes = np.square(frames)
    sqframes = sqframes - sqframes.mean(1, keepdims=True)

    # compute the cross correlation between the frames and their square
    corr = np.array(list(map(partial(np.correlate, mode='same'), frames, sqframes)))

    # compute the mel power spectrum of the correlated signal
    corrfft = np.fft.rfft(corr, 512)
    fb = base.get_filterbanks(26, 512, rate, 0, rate / 2)
    E = np.absolute(np.square(corrfft).dot(fb.T))

    # do noise sniffing at the front and the back and select the lowest energy
    Efront = E[:20, :].mean(0)
    Eback = E[-20:, :].mean(0)
    if Efront.sum() < Eback.sum():
        Enoise = Efront
    else:
        Enoise = Eback

    # at every interval compute the mean ratio between the maximal energy in
    # that interval and the noise energy
    width = 12

    # apply max pooling to the energy
    Emax = maximum_filter(E, size=[width, 1], mode='constant')

    # compute the ratio between the smoothed energy and the noise energy
    ratio = np.log((Emax / Enoise).mean(axis=1))
    ratio = ratio / np.max(ratio)

    speechframes = np.where(ratio > 0.2)[0]

    return speechframes
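# Hedged usage sketch (an addition) for vad() above: keep only the frames that
# were flagged as speech. sig and rate are assumed to hold a loaded waveform
# and its sample rate; the 25 ms / 10 ms settings mirror the defaults used
# elsewhere in this collection.
speech_idx = vad(sig, rate, winlen=0.025, winstep=0.01)
frames = sigproc.framesig(sigproc.preemphasis(sig, 0.97), 0.025 * rate, 0.01 * rate)
speech_frames = frames[speech_idx]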
def admission(x, samplerate=8000, frame_ms=25, overlap_ms=5):
    frame_len = int(samplerate / frame_ms)
    overlap = int(samplerate / overlap_ms)
    frames = sigutil.framesig(x, frame_len, overlap, signal.hanning)
    acpeaks = ac_peaks(frames)
    f0 = f0_acf(frames)
    rse = RSE_soundsense(x, samplerate, frame_ms)
    x_timebased = np.linspace(0, len(frames) * frame_len, len(frames))
    x_rse = np.linspace(0, len(frames) * frame_len, len(frames))
    return {"acpeaks": acpeaks, "f0": f0, "rse": rse}
def ssc(signal, samplerate=16000, winlen=0.025, winstep=0.01,
        nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate)
    pspec = sigproc.powspec(frames, nfft)
    pspec = numpy.where(pspec == 0, numpy.finfo(float).eps, pspec)  # if things are all zeros we get problems
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    R = numpy.tile(numpy.linspace(1, samplerate / 2, numpy.size(pspec, 1)), (numpy.size(pspec, 0), 1))
    return numpy.dot(pspec * R, fb.T) / feat
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97):
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate)
    pspec = sigproc.powspec(frames, nfft)
    energy = numpy.sum(pspec, 1)  # this stores the total energy in each frame
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)  # if energy is zero, we get problems with log
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)  # if feat is zero, we get problems with log
    return feat, energy
def fbank(signal, samplerate=16000, winlen=0.025, winstep=0.01,
          nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.95):
    """
    Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.95.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    # print type(signal[0])
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate, winfunc=hamming_window)
    powspec = sigproc.powspec(frames, nfft)
    # numpy.savetxt("result.txt", powspec, delimiter=",")
    energy = numpy.sum(powspec, 1)  # this stores the total energy in each frame
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)  # if energy is zero, we get problems with log, use numpy.finfo(float).eps to replace 0
    filterbanks = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    # print powspec.shape, filterbanks.shape
    feat = numpy.dot(powspec, filterbanks.T)  # compute the filterbank energies
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)  # if feat is zero, we get problems with logs
    # print feat.shape
    return feat, energy
def get_fft_spectrum(signal):
    # pad with zeros (or center-crop) to a fixed duration
    n_sample = signal.shape[0]
    signal_len = int(c.DURA * c.SR)
    if n_sample < signal_len:
        signal = np.hstack((signal, np.zeros(signal_len - n_sample)))
    else:
        signal = signal[(n_sample - signal_len) // 2:(n_sample + signal_len) // 2]
    signal = np.array(signal)
    signal *= 2**15

    signal = remove_dc_and_dither(signal, c.SR)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN * c.SR,
                              frame_step=c.FRAME_STEP * c.SR,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.N_FFT))
    fft_norm = normalization_frames(fft.T)
    return fft_norm
def fbankVTLP(signal, samplerate=16000, winlen=0.025, winstep=0.01,
              nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
              appendEnergy=False, alpha=1.0):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param appendEnergy: if true, append the log of the frame energy to each feature vector.
    :param alpha: warping factor passed to get_filterbanksVTLP (1.0 means no warping).
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate / 2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen * samplerate, winstep * samplerate)
    pspec = sigproc.powspec(frames, nfft)
    energy = numpy.sum(pspec, 1)  # this stores the total energy in each frame
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)  # if energy is zero, we get problems with log
    fb = get_filterbanksVTLP(nfilt, nfft, samplerate, lowfreq, highfreq, alpha)
    feat = numpy.dot(pspec, fb.T)  # compute the filterbank energies
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)  # if feat is zero, we get problems with log
    if appendEnergy:
        feat = numpy.c_[feat, numpy.log(energy)]  # append the log of frame energy as an extra feature column
    return feat, energy
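# Hedged usage sketch (an addition) for fbankVTLP above: draw a random warp
# factor for vocal-tract-length-perturbation style augmentation. The 0.9-1.1
# range is a common choice, not something fixed by the function itself; sig is
# a hypothetical loaded waveform.
alpha = numpy.random.uniform(0.9, 1.1)
feat, energy = fbankVTLP(sig, samplerate=16000, alpha=alpha, appendEnergy=True)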
def get_fft_spectrum(signal):
    signal *= 2**15

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)

    rsize = 500
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]
    return out
def fbank(signal, samplerate, conf):
    '''
    Compute fbank features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
        vector containing the signal energy
    '''
    highfreq = int(conf['highfreq'])
    if highfreq < 0:
        highfreq = samplerate / 2
    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    frames = sigproc.framesig(signal, float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate)
    pspec = sigproc.powspec(frames, int(conf['nfft']))

    # this stores the total energy in each frame
    energy = numpy.sum(pspec, 1)

    # if energy is zero, we get problems with log
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)

    filterbank = get_filterbanks(int(conf['nfilt']), int(conf['nfft']),
                                 samplerate, int(conf['lowfreq']), highfreq)

    # compute the filterbank energies
    feat = numpy.dot(pspec, filterbank.T)

    # if feat is zero, we get problems with log
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)

    return feat, energy
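# Hedged usage sketch (an addition): a minimal conf dict for the conf-driven
# fbank above. The keys match what the function reads; the values shown are the
# usual 25 ms / 10 ms settings, given as strings because the code converts them
# with float()/int(). sig is a hypothetical loaded waveform.
conf = {'preemph': '0.97', 'winlen': '0.025', 'winstep': '0.01',
        'nfft': '512', 'nfilt': '26', 'lowfreq': '0', 'highfreq': '-1'}
feat, energy = fbank(sig, 16000, conf)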
def get_fft_spectrum(filename, buckets):
    signal = load_wav(filename, c.SAMPLE_RATE)
    signal *= 2**15

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, c.SAMPLE_RATE)
    signal = sigproc.preemphasis(signal, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(signal, frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)

    # truncate to max bucket sizes
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]
    # print(out.shape)
    # exit(0)
    return out
def fbank(signal, samplerate, conf):
    """
    Compute fbank features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
        vector containing the signal energy
    """
    raise BaseException('Not yet implemented')

    highfreq = int(conf['highfreq'])
    if highfreq < 0:
        highfreq = samplerate / 2
    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    frames = sigproc.framesig(signal, float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate)
    pspec = sigproc.powspec(frames, int(conf['nfft']))

    # this stores the total energy in each frame
    energy = numpy.sum(pspec, 1)

    # if energy is zero, we get problems with log
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)

    filterbank = get_filterbanks(int(conf['nfilt']), int(conf['nfft']),
                                 samplerate, int(conf['lowfreq']), highfreq)

    # compute the filterbank energies
    feat = numpy.dot(pspec, filterbank.T)

    # if feat is zero, we get problems with log
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)

    return feat, energy
def get_fft_spectrum(filename, buckets):
    signal = load_wav(filename, 16000)
    signal *= 2**15

    # get FFT spectrum
    signal = remove_dc_and_dither(signal, 16000)
    signal = sigproc.preemphasis(signal, coeff=0.97)
    frames = sigproc.framesig(signal, frame_len=0.025 * 16000,
                              frame_step=0.01 * 16000, winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=512))
    fft_norm = normalize_frames(fft.T)

    # truncate to max bucket sizes
    rsize = max(k for k in buckets if k <= fft_norm.shape[1])
    rstart = int((fft_norm.shape[1] - rsize) / 2)
    out = fft_norm[:, rstart:rstart + rsize]

    return out
def ssc(signal, samplerate, conf):
    '''
    Compute ssc features from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by nfilt) containing features, a numpy
        vector containing the signal log-energy
    '''
    highfreq = int(conf['highfreq'])
    if highfreq < 0:
        highfreq = samplerate / 2

    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    frames = sigproc.framesig(signal, float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate)
    pspec = sigproc.powspec(frames, int(conf['nfft']))

    # this stores the total energy in each frame
    energy = numpy.sum(pspec, 1)

    # if energy is zero, we get problems with log
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)

    filterbank = get_filterbanks(int(conf['nfilt']), int(conf['nfft']),
                                 samplerate, int(conf['lowfreq']), highfreq)

    # compute the filterbank energies
    feat = numpy.dot(pspec, filterbank.T)

    tiles = numpy.tile(numpy.linspace(1, samplerate / 2, numpy.size(pspec, 1)),
                       (numpy.size(pspec, 0), 1))

    return numpy.dot(pspec * tiles, filterbank.T) / feat, numpy.log(energy)
def frames(signal, samplerate, conf):
    """
    Compute frames from an audio signal.

    Args:
        signal: the audio signal from which to compute features. Should be an
            N*1 array
        samplerate: the samplerate of the signal we are working with.
        conf: feature configuration

    Returns:
        A numpy array of size (NUMFRAMES by winlen) containing features. Each
        row holds 1 feature vector
    """
    signal = sigproc.preemphasis(signal, float(conf['preemph']))
    winfunc = _get_winfunc(conf['winfunc'])
    frames = sigproc.framesig(signal, float(conf['winlen']) * samplerate,
                              float(conf['winstep']) * samplerate, winfunc)

    return frames
glottal_flow = np.concatenate([raw_glottal_flow[segment['start']:segment['stop']]
                               for segment in voiced_segments if segment['is_speech']])
wav_samples = wav_samples / float(pow(2, 15))  # to float
assert len(glottal_flow) == len(wav_samples),\
    f"Inconsistent length: glottal flow ({len(glottal_flow):d}) / wav samples ({len(wav_samples):d})"

# Normalize
wav_samples = wav_samples / np.linalg.norm(wav_samples)
glottal_flow = glottal_flow / np.linalg.norm(glottal_flow)

# Frame
# sample_frames = framesig(wav_samples, 0.025 * fs, 0.01 * fs, winfunc=lambda x: np.ones((x,)), stride_trick=True)
# flow_frames = framesig(glottal_flow, 0.025 * fs, 0.01 * fs, winfunc=lambda x: np.ones((x,)), stride_trick=True)
sample_frames = framesig(wav_samples, 0.5 * fs, 0.5 * fs, winfunc=lambda x: np.ones((x,)), stride_trick=True)
flow_frames = framesig(glottal_flow, 0.5 * fs, 0.5 * fs, winfunc=lambda x: np.ones((x,)), stride_trick=True)

# Some constants
x0 = 0.1    # half glottal width at rest position, cm
tau = 1e-3  # time delay for surface wave to travel half glottal height T, 1 ms
eta = 1.    # nonlinear factor for energy dissipation at large amplitude
c = 5000    # air particle velocity, cm/s
d = 1.75    # length of vocal folds, cm
M = 0.5     # mass, g/cm^2
B = 100     # damping, dyne s/cm^3

# Initial conditions
alpha = 0.8  # if > 0.5 delta, stable-like oscillator
beta = 0.32
delta = 1.   # asymmetry parameter