def extract(self, src_file, tgt_file):
    '''
    Feature extraction

    Reads src_file, enhances the speech with the configured MMSE
    estimator, extracts MFCCs with uncertainty propagation and writes
    the features to tgt_file in HTK format.

    Parameters
    ----------
    src_file : str
        Path to the input audio file
    tgt_file : str
        Path where the extracted features are written

    Raises
    ------
    ValueError
        If config['mmse_method'] or config['targetformat'] is not
        supported
    '''

    # Read this audio file, resampling to the working sampling rate
    y_t = ia.read(src_file, in_fs=self.config['in_fs'],
                  out_fs=self.config['work_fs'])[0]

    #
    # BEAMFORMING
    #

    # "Beamformer" pointing at the frontal direction by default:
    # multi-channel signals are simply summed across channels
    if len(y_t.shape) > 1 and y_t.shape[1] > 1:
        y_t = y_t.sum(1)

    #
    # SPEECH ENHANCEMENT
    #

    # Pre-emphasis, STFT
    y_t = sip.preemphasis(y_t, coef=self.config['preemcoef'])
    Y = sip.stft(y_t, self.config['windowsize'], self.config['shift'],
                 self.config['nfft'])

    # Compute IMCRA; also returns the LSA estimate of the clean speech
    hat_X_LSA = self.se.update(Y)

    if self.config['mmse_method'] in ('MFCC', 'Wiener'):
        # Get a priori SNR and noise variance
        xi, Lambda_D = self.se.get_param(['xi', 'Lambda_D'])
        # Wiener gain
        G = xi / (1 + xi)
        # Use the posterior associated to the Wiener filter
        hat_X = G * Y
        if self.config['mmse_method'] == 'MFCC':
            # Residual MSE of the Wiener estimate
            MSE = G * Lambda_D
        else:
            MSE = np.zeros(G.shape)
    elif self.config['mmse_method'] == 'LSA':
        # Use LSA point estimate, no uncertainty
        hat_X = hat_X_LSA
        MSE = np.zeros(hat_X.shape)
    else:
        # Fail early: otherwise hat_X / MSE stay unbound and the code
        # below dies with a confusing NameError
        raise ValueError("MMSE_METHOD = %s Not supported"
                         % self.config['mmse_method'])

    #
    # FEATURE EXTRACTION / UNCERTAINTY PROPAGATION
    #

    # MFCC with uncertainty propagation
    mu_x, Sigma_x = self.mfcc.extract_up(hat_X, MSE)
    # Cepstral mean subtraction
    mu_x, Sigma_x = self.mfcc.cms_up(mu_x, Sigma_x)
    # Deltas, Accelerations
    mu_d, Sigma_d = fe.deltas_up(mu_x, Sigma_x)
    mu_a, Sigma_a = fe.deltas_up(mu_d, Sigma_d)
    mu_x = np.concatenate((mu_x, mu_d, mu_a))
    Sigma_x = np.concatenate((Sigma_x, Sigma_d, Sigma_a))

    # Append the uncertainty (variances) to the features if requested
    if self.config['do_up']:
        x = np.concatenate((mu_x, Sigma_x))
    else:
        x = mu_x

    # Write features in the requested target format
    if self.config['targetformat'] == 'HTK':
        htk.writehtkfeats(tgt_file, x, self.config['fp'],
                          self.config['tc'])
    else:
        raise ValueError("TARGETFORMAT = %s Not supported"
                         % self.config['targetformat'])
def extract(self, src_file, tgt_file):
    '''
    Feature extraction

    Reads src_file, applies MMSE speech enhancement, extracts MFCC
    features with uncertainty propagation and writes them to tgt_file.

    Parameters
    ----------
    src_file : str
        Path to the input audio file
    tgt_file : str
        Path where the extracted features are written

    Raises
    ------
    ValueError
        If config['mmse_method'] or config['targetformat'] is not
        supported
    '''

    # Read this audio file, resampled to the working sampling rate
    y_t = ia.read(src_file, in_fs=self.config['in_fs'],
                  out_fs=self.config['work_fs'])[0]

    #
    # BEAMFORMING
    #

    # "Beamformer" pointing at the frontal direction by default:
    # multi-channel signals are summed across channels
    if len(y_t.shape) > 1 and y_t.shape[1] > 1:
        y_t = y_t.sum(1)

    #
    # SPEECH ENHANCEMENT
    #

    # Pre-emphasis, STFT
    y_t = sip.preemphasis(y_t, coef=self.config['preemcoef'])
    Y = sip.stft(y_t, self.config['windowsize'], self.config['shift'],
                 self.config['nfft'])

    # Compute IMCRA; also returns the LSA estimate of the clean speech
    hat_X_LSA = self.se.update(Y)

    if self.config['mmse_method'] in ('MFCC', 'Wiener'):
        # Get a priori SNR and noise variance
        xi, Lambda_D = self.se.get_param(['xi', 'Lambda_D'])
        # Wiener gain
        G = xi / (1 + xi)
        # Use the posterior associated to the Wiener filter
        hat_X = G * Y
        if self.config['mmse_method'] == 'MFCC':
            # Residual MSE of the Wiener estimate
            MSE = G * Lambda_D
        else:
            MSE = np.zeros(G.shape)
    elif self.config['mmse_method'] == 'LSA':
        # Use LSA point estimate, no uncertainty
        hat_X = hat_X_LSA
        MSE = np.zeros(hat_X.shape)
    else:
        # Fail early: otherwise hat_X / MSE stay unbound and the code
        # below dies with a confusing NameError
        raise ValueError("MMSE_METHOD = %s Not supported"
                         % self.config['mmse_method'])

    #
    # FEATURE EXTRACTION / UNCERTAINTY PROPAGATION
    #

    # MFCC with uncertainty propagation
    mu_x, Sigma_x = self.mfcc.extract_up(hat_X, MSE)
    # Cepstral mean subtraction
    mu_x, Sigma_x = self.mfcc.cms_up(mu_x, Sigma_x)
    # Deltas, Accelerations
    mu_d, Sigma_d = fe.deltas_up(mu_x, Sigma_x)
    mu_a, Sigma_a = fe.deltas_up(mu_d, Sigma_d)
    mu_x = np.concatenate((mu_x, mu_d, mu_a))
    Sigma_x = np.concatenate((Sigma_x, Sigma_d, Sigma_a))

    # Append the uncertainty (variances) to the features if requested
    if self.config['do_up']:
        x = np.concatenate((mu_x, Sigma_x))
    else:
        x = mu_x

    # Write features in the requested target format
    if self.config['targetformat'] == 'HTK':
        htk.writehtkfeats(tgt_file, x, self.config['fp'],
                          self.config['tc'])
    else:
        raise ValueError("TARGETFORMAT = %s Not supported"
                         % self.config['targetformat'])
def extract(self, src_file, tgt_file):
    '''
    Feature extraction with external VAD support

    Segments src_file into speech events (from an STM transcription if
    config['stm_vad'] is given, otherwise one single event), applies
    Wiener enhancement and extracts MFCC features with uncertainty
    propagation, writing them to tgt_file in HTK format.

    Parameters
    ----------
    src_file : str
        Path to the input audio file
    tgt_file : str
        Path where the extracted features are written

    Raises
    ------
    EnvironmentError
        If an STM-based VAD is configured but src_file has no
        transcription in it
    '''

    # Get indices for the position of speech and background based on
    # external info. If an STM is provided for VAD, use it.
    if 'stm_vad' in self.config:

        # VAD SPECIFIED BY A STM
        if src_file not in self.config['stm_trans']:
            raise EnvironmentError("stm file %s has not transcription "
                                   "for %s" % (self.config['stm_vad'],
                                               src_file))

        # Collect speech events and preceding backgrounds
        events = []
        backgs = []
        for tr in self.config['stm_trans'][src_file]:
            # Preceding background (None when the event starts at t=0)
            if not tr[2]:
                backgs.append(None)
            else:
                # NOTE(review): sample indices use in_fs although the
                # audio below is resampled to work_fs — confirm the two
                # rates match in this configuration
                backgs.append((0, tr[2] * self.config['in_fs']))
            # Speech event as (file, start_sample, end_sample)
            events.append((src_file, tr[2] * self.config['in_fs'],
                           tr[3] * self.config['in_fs']))
    else:

        # ONE SINGLE EVENT IN PRESENT MICROPHONE: everything after the
        # initialization time is treated as speech
        T = int(self.config['work_fs'] * self.config['init_time'])
        events = [(src_file, T, -1)]
        backgs = [(0, T)]

    # Loop over events in the scene
    for backg, event in zip(backgs, events):

        # Read this audio file, resampled to the working sampling rate
        y_t = ia.read(src_file, in_fs=self.config['in_fs'],
                      out_fs=self.config['work_fs'])[0]

        #
        # BEAMFORMING
        #

        # "Beamformer" pointing at the frontal direction by default:
        # sum channels. Guard len(shape) so mono (1-D) signals do not
        # raise IndexError
        if len(y_t.shape) > 1 and y_t.shape[1] > 1:
            y_t = y_t.sum(1)

        #
        # SPEECH ENHANCEMENT
        #

        # Select segment of background preceding speech
        # NOTE(review): d_t is currently unused below — presumably
        # intended to initialize the noise estimator; confirm
        if backg:
            d_t = y_t[backg[0]:backg[1]]
        else:
            d_t = None

        # Select segment of speech
        y_t = y_t[event[1]:event[2]]

        # Pre-emphasis, STFT
        y_t = sip.preemphasis(y_t, coef=self.config['preemcoef'])
        Y = sip.stft(y_t, self.config['windowsize'],
                     self.config['shift'], self.config['nfft'])

        # Compute IMCRA (updates the internal noise statistics store)
        self.se.update(Y)

        # Get a priori SNR and Wiener gain for the processed frames
        xi = self.se.store['xi'][:, :self.se.l]
        G = xi / (1 + xi)

        # Get Wiener estimate and residual MSE
        hat_X_W = G * Y
        MSE = G * self.se.store['Lambda_D'][:, :self.se.l]

        # MFCC with uncertainty propagation
        mu_x, Sigma_x = self.mfcc.extract_up(hat_X_W, MSE)
        # Cepstral mean subtraction
        mu_x, Sigma_x = self.mfcc.cms_up(mu_x, Sigma_x)
        # Deltas, Accelerations
        mu_d, Sigma_d = fe.deltas_up(mu_x, Sigma_x)
        mu_a, Sigma_a = fe.deltas_up(mu_d, Sigma_d)
        mu_x = np.concatenate((mu_x, mu_d, mu_a))
        Sigma_x = np.concatenate((Sigma_x, Sigma_d, Sigma_a))

        # Append the uncertainty (variances) if requested
        if self.config['unc_prop']:
            x = np.concatenate((mu_x, Sigma_x))
        else:
            x = mu_x

        # Write features (fixed: original referenced the undefined
        # name `target_file`; also removed a leftover set_trace())
        htk.writehtkfeats(tgt_file, x, self.config['fp'],
                          self.config['tc'])
def extract(self, src_file, tgt_file):
    '''
    Feature extraction with external VAD support

    Segments src_file into speech events (STM-based VAD if configured,
    otherwise a single event), applies Wiener enhancement and extracts
    MFCC features with uncertainty propagation to tgt_file.

    Parameters
    ----------
    src_file : str
        Path to the input audio file
    tgt_file : str
        Path where the extracted features are written

    Raises
    ------
    EnvironmentError
        If an STM-based VAD is configured but src_file has no
        transcription in it
    '''

    # Get indices for the position of speech and background based on
    # external info. If an STM is provided for VAD, use it.
    if 'stm_vad' in self.config:

        # VAD SPECIFIED BY A STM
        if src_file not in self.config['stm_trans']:
            raise EnvironmentError("stm file %s has not transcription "
                                   "for %s" % (self.config['stm_vad'],
                                               src_file))

        # Collect speech events and preceding backgrounds
        events = []
        backgs = []
        for tr in self.config['stm_trans'][src_file]:
            # Preceding background (None when the event starts at t=0)
            if not tr[2]:
                backgs.append(None)
            else:
                # NOTE(review): sample indices use in_fs although the
                # audio below is resampled to work_fs — confirm the two
                # rates match in this configuration
                backgs.append((0, tr[2] * self.config['in_fs']))
            # Speech event as (file, start_sample, end_sample)
            events.append((src_file, tr[2] * self.config['in_fs'],
                           tr[3] * self.config['in_fs']))
    else:

        # ONE SINGLE EVENT IN PRESENT MICROPHONE: everything after the
        # initialization time is treated as speech
        T = int(self.config['work_fs'] * self.config['init_time'])
        events = [(src_file, T, -1)]
        backgs = [(0, T)]

    # Loop over events in the scene
    for backg, event in zip(backgs, events):

        # Read this audio file, resampled to the working sampling rate
        y_t = ia.read(src_file, in_fs=self.config['in_fs'],
                      out_fs=self.config['work_fs'])[0]

        #
        # BEAMFORMING
        #

        # "Beamformer" pointing at the frontal direction by default:
        # sum channels. Guard len(shape) so mono (1-D) signals do not
        # raise IndexError
        if len(y_t.shape) > 1 and y_t.shape[1] > 1:
            y_t = y_t.sum(1)

        #
        # SPEECH ENHANCEMENT
        #

        # Select segment of background preceding speech
        # NOTE(review): d_t is currently unused below — presumably
        # intended to initialize the noise estimator; confirm
        if backg:
            d_t = y_t[backg[0]:backg[1]]
        else:
            d_t = None

        # Select segment of speech
        y_t = y_t[event[1]:event[2]]

        # Pre-emphasis, STFT
        y_t = sip.preemphasis(y_t, coef=self.config['preemcoef'])
        Y = sip.stft(y_t, self.config['windowsize'],
                     self.config['shift'], self.config['nfft'])

        # Compute IMCRA (updates the internal noise statistics store)
        self.se.update(Y)

        # Get a priori SNR and Wiener gain for the processed frames
        xi = self.se.store['xi'][:, :self.se.l]
        G = xi / (1 + xi)

        # Get Wiener estimate and residual MSE
        hat_X_W = G * Y
        MSE = G * self.se.store['Lambda_D'][:, :self.se.l]

        # MFCC with uncertainty propagation
        mu_x, Sigma_x = self.mfcc.extract_up(hat_X_W, MSE)
        # Cepstral mean subtraction
        mu_x, Sigma_x = self.mfcc.cms_up(mu_x, Sigma_x)
        # Deltas, Accelerations
        mu_d, Sigma_d = fe.deltas_up(mu_x, Sigma_x)
        mu_a, Sigma_a = fe.deltas_up(mu_d, Sigma_d)
        mu_x = np.concatenate((mu_x, mu_d, mu_a))
        Sigma_x = np.concatenate((Sigma_x, Sigma_d, Sigma_a))

        # Append the uncertainty (variances) if requested
        if self.config['unc_prop']:
            x = np.concatenate((mu_x, Sigma_x))
        else:
            x = mu_x

        # Write features (fixed: original referenced the undefined
        # name `target_file`; also removed a leftover set_trace())
        htk.writehtkfeats(tgt_file, x, self.config['fp'],
                          self.config['tc'])