def _add_noise(signal, noise_file_name, snr, sample_rate):
    """Mix a randomly chosen excerpt of a noise recording into a signal.

    The noise file is loaded with ``read_audio``, repeated when it is shorter
    than ``signal``, cropped to the length of ``signal`` at a random offset and
    rescaled so that the level of ``signal`` minus the level of the noise (both
    as reported by ``_rms_energy``) equals ``snr``. The mixture is returned
    after mean/standard-deviation normalisation.

    :param signal: the signal to corrupt
    :param noise_file_name: name of the audio file containing the noise
    :param snr: target signal-to-noise ratio (presumably in dB, given the
        ``/ 20`` exponents below -- confirm against ``_rms_energy``)
    :param sample_rate: sampling rate forwarded to ``read_audio``
    :return: the noisy signal with its mean removed and divided by its
        standard deviation (a constant mixture therefore yields NaN)
    """
    # Open noise file; the sampling rate returned alongside it is not needed.
    noise, _ = read_audio(noise_file_name, sample_rate)
    logging.info("Noise.shape = %s", noise.shape)
    logging.info("signal.shape = %s", signal.shape)

    signal_length = len(signal)

    # Repeat the noise until it is strictly longer than the signal.
    if len(noise) < signal_length:
        dup_factor = signal_length // len(noise) + 1
        noise = numpy.tile(noise, dup_factor)

    # Pick a random section of the noise with the same length as the signal.
    if len(noise) != signal_length:
        # randint excludes its upper bound, hence the +1: without it the
        # excerpt ending on the last noise sample could never be selected.
        idx = numpy.random.randint(0, len(noise) - signal_length + 1)
        noise = noise[idx:idx + signal_length]

    # Compute energy of both signals
    N_dB = _rms_energy(noise)
    S_dB = _rms_energy(signal)

    # Rescale the noise to the level that yields the requested SNR
    N_new = S_dB - snr
    noise_scaled = 10 ** (N_new / 20) * noise / 10 ** (N_dB / 20)
    noisy = signal + noise_scaled

    return (noisy - noisy.mean()) / noisy.std()
def _add_reverb(signal, reverb_file_name, sample_rate, reverb_level=-26.0):
    """Add reverberation (convolutive noise) to a speech signal.

    The content of ``reverb_file_name`` is used as the numerator coefficients
    of an FIR filter applied to ``signal``. The filtered signal is divided by
    the gain derived from its level as measured by ``asl_meter`` and multiplied
    by the gain derived from ``reverb_level``, then mean/standard-deviation
    normalised.

    :param signal: the speech signal to reverberate
    :param reverb_file_name: name of the audio file holding the filter
        coefficients (presumably a room impulse response -- confirm)
    :param sample_rate: sampling rate forwarded to ``read_audio`` and
        ``asl_meter``
    :param reverb_level: target level applied after filtering
    :return: the reverberated signal with its mean removed and divided by its
        standard deviation
    """
    impulse_response, _ = read_audio(reverb_file_name, sample_rate)

    # FIR filtering: denominator fixed to 1, numerator taken from the file.
    reverberated = lfilter(impulse_response, 1, signal)

    # Undo the measured level, then apply the requested one.
    measured_gain = 10 ** (asl_meter(reverberated, sample_rate) / 20)
    target_gain = 10 ** (reverb_level / 20)
    levelled = reverberated / measured_gain * target_gain

    centred = levelled - levelled.mean()
    return centred / levelled.std()
def extract(self, *file):
    """Extract acoustic features for one show and write them to an HDF5 file.

    The work item is received as a single tuple in ``file[0]``. The audio file
    is read, cepstral coefficients, energy, filter banks and VAD labels are
    computed for the requested channel, filtered according to
    ``self.save_param``, passed through ``self.postProc`` and written with
    ``write_hdf5``. The feature file is always written to disk and closed
    before returning.

    :param file: positional arguments; only ``file[0]`` is used and must be
        the tuple ``(show, channel, input_audio_filename,
        output_feature_filename, extra)``. When one of the two file names is
        not None it replaces the corresponding filename structure stored on
        the instance; a truthy ``extra`` is passed as a second argument when
        formatting the feature file name.
    :raises ValueError: if ``self.feature_type`` is neither ``'mfcc'`` nor
        ``'plp'`` and the signal is long enough to be processed
    :return: None
    """
    show, channel, input_audio_filename, output_feature_filename, extra = file[0]
    backing_store = True

    # If the input audio file name does not include the ID of the show,
    # the audio_filename_structure is updated to use the input_audio_filename
    if input_audio_filename is not None:
        self.audio_filename_structure = input_audio_filename
    audio_filename = self.audio_filename_structure.format(show)

    # If the output file name does not include the ID of the show,
    # (i.e., if the feature_filename_structure does not include {})
    # the feature_filename_structure is updated to use the output_feature_filename
    if output_feature_filename is not None:
        self.feature_filename_structure = output_feature_filename
    if extra:
        feature_filename = self.feature_filename_structure.format(show, extra)
    else:
        feature_filename = self.feature_filename_structure.format(show)

    # Open audio file, get the signal (the returned sampling rate is unused)
    signal, _ = read_audio(audio_filename, self.sampling_frequency)
    if signal.ndim == 1:
        signal = signal[:, np.newaxis]

    # Unpacking keeps the requirement that the signal is two-dimensional
    length, _ = signal.shape

    PARAM_TYPE = np.float32
    if length < self.window_sample:
        # The signal is too short for a single frame: use empty features
        cep = np.empty((0, self.ceps_number), dtype=PARAM_TYPE)
        energy = np.empty((0, 1), dtype=PARAM_TYPE)
        fb = np.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
        label = np.empty((0, 1), dtype='int8')
    else:
        # Random noise is added to the input signal to avoid zero frames.
        # The global NumPy generator is reseeded so the noise is reproducible.
        np.random.seed(0)
        signal[:, channel] += 0.0001 * np.random.randn(signal.shape[0])

        dec = self.shift_sample * 250 * 25000 + self.window_sample
        dec2 = self.window_sample - self.shift_sample
        start = 0
        end = min(dec, length)

        # Process the signal by batch to avoid problems for very long signals.
        # NOTE(review): every pass overwrites the features of the previous
        # one; with this batch size a single pass is presumably expected.
        while start < (length - dec2):
            if self.feature_type == 'mfcc':
                # Extract cepstral coefficients, energy and filter banks
                cep, energy, _, fb = mfcc(signal[start:end, channel],
                                          fs=self.sampling_frequency,
                                          lowfreq=self.lower_frequency,
                                          maxfreq=self.higher_frequency,
                                          nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0,
                                          nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0,
                                          nwin=self.window_size,
                                          shift=self.shift,
                                          nceps=self.ceps_number,
                                          get_spec=False,
                                          get_mspec=True,
                                          prefac=self.pre_emphasis)
            elif self.feature_type == 'plp':
                cep, energy, _, fb = plp(signal[start:end, channel],
                                         nwin=self.window_size,
                                         fs=self.sampling_frequency,
                                         plp_order=self.ceps_number,
                                         shift=self.shift,
                                         get_spec=False,
                                         get_mspec=True,
                                         prefac=self.pre_emphasis,
                                         rasta=self.rasta_plp)
            else:
                # Fail with a clear message instead of an unbound-local error
                raise ValueError(
                    "Unsupported feature_type: {!r}".format(self.feature_type))

            # Perform feature selection (the VAD threshold is not used here)
            label, _ = self._vad(cep, energy, fb, signal[start:end, channel])
            if len(label) < len(energy):
                # Pad the labels with False so they cover every frame
                label = np.hstack(
                    (label, np.zeros(len(energy) - len(label), dtype='bool')))

            start = end - dec2
            end = min(end + dec, length)

    # Drop the parameters that were not requested
    if "cep" not in self.save_param:
        cep = None
    if "energy" not in self.save_param:
        energy = None
    if "fb" not in self.save_param:
        fb = None
    if "vad" not in self.save_param:
        label = None

    cep, fb, label = self.postProc(cep, energy, fb, label)

    # Create the directory if it doesn't exist. exist_ok avoids failing when
    # another worker creates the same directory concurrently.
    dir_name = os.path.dirname(feature_filename)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)

    # The context manager guarantees the file is closed even if writing fails
    with h5py.File(feature_filename, 'a', backing_store=backing_store, driver='core') as h5f:
        write_hdf5(show, h5f,
                   cep, None, None,
                   None, None, None,
                   fb, None, None,
                   None, None, None,
                   label)
def extract(self, show, channel, input_audio_filename=None, output_feature_filename=None, backing_store=False):
    """
    Compute the acoustic parameters (filter banks, cepstral coefficients and
    log-energy) for a single channel from a given audio file and store them,
    together with their mean and standard deviation over the frames selected
    by the VAD, in an HDF5 file.

    The type of features (``'mfcc'`` or ``'plp'``) is read from
    ``self.feature_type``; RASTA filtering for PLP is controlled by
    ``self.rasta_plp``. No bottleneck features are computed here, so the
    corresponding entries are always written as None.

    :param show: ID if the show
    :param channel: channel number (0 if mono file)
    :param input_audio_filename: name of the input audio file to consider if the name of the audio file is
            independent from the ID of the show
    :param output_feature_filename: name of the output feature file to consider if the name of the feature file is
            independent from the ID of the show
    :param backing_store: boolean, if False, nothing is writen to disk, if True, the file is writen to disk when closed

    :raises ValueError: if ``self.feature_type`` is neither ``'mfcc'`` nor
            ``'plp'`` and the signal is long enough to be processed
    :return: an hdf5 file handler, left open; the caller has to close it
    """
    # Create the filename to load

    # If the input audio file name does not include the ID of the show
    # (i.e., if the audio_filename_structure does not include {})
    # the audio_filename_structure is updated to use the input_audio_filename
    if input_audio_filename is not None:
        self.audio_filename_structure = input_audio_filename
    audio_filename = self.audio_filename_structure.format(show)

    # If the output file name does not include the ID of the show,
    # (i.e., if the feature_filename_structure does not include {})
    # the feature_filename_structure is updated to use the output_feature_filename
    if output_feature_filename is not None:
        self.feature_filename_structure = output_feature_filename
    feature_filename = self.feature_filename_structure.format(show)

    # Open audio file, get the signal (the returned sampling rate is unused)
    signal, _ = read_audio(audio_filename, self.sampling_frequency)
    if signal.ndim == 1:
        signal = signal[:, numpy.newaxis]

    # Unpacking keeps the requirement that the signal is two-dimensional
    length, _ = signal.shape

    # If the size of the signal is not enough for one frame, return zero features
    if length < self.window_sample:
        cep = numpy.empty((0, self.ceps_number), dtype=PARAM_TYPE)
        energy = numpy.empty((0, 1), dtype=PARAM_TYPE)
        fb = numpy.empty((0, self.filter_bank_size), dtype=PARAM_TYPE)
        label = numpy.empty((0, 1), dtype='int8')
    else:
        # Random noise is added to the input signal to avoid zero frames.
        # The global NumPy generator is reseeded so the noise is reproducible.
        numpy.random.seed(0)
        signal[:, channel] += 0.0001 * numpy.random.randn(signal.shape[0])

        dec = self.shift_sample * 250 * 25000 + self.window_sample
        dec2 = self.window_sample - self.shift_sample
        start = 0
        end = min(dec, length)

        # Process the signal by batch to avoid problems for very long signals.
        # NOTE(review): every pass overwrites the features of the previous
        # one; with this batch size a single pass is presumably expected.
        while start < (length - dec2):
            logging.info('process part : %f %f %f',
                         start / self.sampling_frequency,
                         end / self.sampling_frequency,
                         length / self.sampling_frequency)

            if self.feature_type == 'mfcc':
                # Extract cepstral coefficients, energy and filter banks
                cep, energy, _, fb = mfcc(signal[start:end, channel],
                                          fs=self.sampling_frequency,
                                          lowfreq=self.lower_frequency,
                                          maxfreq=self.higher_frequency,
                                          nlinfilt=self.filter_bank_size if self.filter_bank == "lin" else 0,
                                          nlogfilt=self.filter_bank_size if self.filter_bank == "log" else 0,
                                          nwin=self.window_size,
                                          shift=self.shift,
                                          nceps=self.ceps_number,
                                          get_spec=False,
                                          get_mspec=True,
                                          prefac=self.pre_emphasis)
            elif self.feature_type == 'plp':
                cep, energy, _, fb = plp(signal[start:end, channel],
                                         nwin=self.window_size,
                                         fs=self.sampling_frequency,
                                         plp_order=self.ceps_number,
                                         shift=self.shift,
                                         get_spec=False,
                                         get_mspec=True,
                                         prefac=self.pre_emphasis,
                                         rasta=self.rasta_plp)
            else:
                # Fail with a clear message instead of an unbound-local error
                raise ValueError(
                    "Unsupported feature_type: {!r}".format(self.feature_type))

            # Perform feature selection (the VAD threshold is not used here)
            label, _ = self._vad(cep, energy, fb, signal[start:end, channel])
            if len(label) < len(energy):
                # Pad the labels with False so they cover every frame
                label = numpy.hstack((label, numpy.zeros(len(energy) - len(label), dtype='bool')))

            start = end - dec2
            end = min(end + dec, length)
            if cep.shape[0] > 0:
                logging.info('!! size of signal cep: %f len %d type size %d',
                             cep[-1].nbytes/1024/1024,
                             len(cep[-1]),
                             cep[-1].nbytes/len(cep[-1]))

    # Compute the mean and std of energy, filter banks and cepstral
    # coefficients over all selected frames
    energy_mean = energy[label].mean(axis=0)
    energy_std = energy[label].std(axis=0)
    fb_mean = fb[label, :].mean(axis=0)
    fb_std = fb[label, :].std(axis=0)
    cep_mean = cep[label, :].mean(axis=0)
    cep_std = cep[label, :].std(axis=0)

    # Bottleneck features are never computed in this method. Binding the
    # placeholders unconditionally avoids an unbound-local error when "bnf"
    # is listed in save_param.
    bnf = None
    bnf_mean = None
    bnf_std = None

    # Create the HDF5 file
    # Create the directory if it doesn't exist. exist_ok avoids failing when
    # another worker creates the same directory concurrently.
    dir_name = os.path.dirname(feature_filename)  # get the path
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)

    h5f = h5py.File(feature_filename, 'a', backing_store=backing_store, driver='core')

    # Drop the parameters that were not requested
    if "cep" not in self.save_param:
        cep = None
        cep_mean = None
        cep_std = None
    if "energy" not in self.save_param:
        energy = None
        energy_mean = None
        energy_std = None
    if "fb" not in self.save_param:
        fb = None
        fb_mean = None
        fb_std = None
    if "vad" not in self.save_param:
        label = None
    logging.info(label)

    write_hdf5(show, h5f,
               cep, cep_mean, cep_std,
               energy, energy_mean, energy_std,
               fb, fb_mean, fb_std,
               bnf, bnf_mean, bnf_std,
               label)

    return h5f