class DataManager:
    """Build a training data file from clean-audio / noise combinations.

    For every combination returned by the database, the clean track is
    overlaid with the noise track, both the clean and the resulting "dirty"
    signals are converted to spectrograms, and the results are written into
    groups of the output file (presumably an ``h5py.File`` -- confirm against
    the module imports).
    """

    def __init__(self):
        """Set the processing constants and create the collaborating managers."""
        # Sampling rate (samples per second) every signal is brought to
        # before the spectrogram is computed.
        self.__INPUT_SAMPLING_RATE = int(11025)
        # Window length in samples; also used as the FFT size.
        self.__N_SAMPLES_WINDOW = int(1024)
        # Half-window overlap between consecutive segments.
        self.__N_SAMPLES_OVERLAP = int(0.5 * self.__N_SAMPLES_WINDOW)
        self.__WINDOW = 'hann'
        self.__CHROME_DRIVER_PATH = r"resources/chromedriver"
        self.__db = DBManager()
        self.__audio_manager = AudioBooksManager(self.__db,
                                                 self.__CHROME_DRIVER_PATH)
        self.__noise_manager = NoiseManager(self.__db)

    def main(self, filename='', mode='', download=0, noises=None, limit=0):
        """Create the training file for all audio/noise combinations.

        Args:
            filename: Path of the output file, passed to ``File``.
            mode: Open mode, passed to ``File``.
            download: When truthy, audio books and noises are downloaded
                before the combinations are retrieved.
            noises: Noise selection forwarded to
                ``modelTrainGetCombination``. ``None`` (the default) is
                forwarded as an empty list.
            limit: Forwarded to ``modelTrainGetCombination``.

        Raises:
            Exception: Any error other than ``ResamplingError`` is logged and
                re-raised. A ``ResamplingError`` only skips the affected
                combination.
        """
        # Use a fresh list per call instead of a shared mutable default.
        if noises is None:
            noises = []

        try:
            if download:
                logging.info('Downloading audio books for training model')
                self.__audio_manager.downloadData()
                logging.info('Downloading noise audios for training model')
                self.__noise_manager.downloadData()

            logging.info('Retrieving audio-noise combinations')
            file_combinations = self.__db.modelTrainGetCombination(
                self.__INPUT_SAMPLING_RATE, noises, limit)

            with File(filename, mode) as f:
                logging.info(
                    'Creating group for SPS:%d and FFT:%d' %
                    (self.__INPUT_SAMPLING_RATE, self.__N_SAMPLES_WINDOW))
                main_group = f.create_group(
                    np.string_(
                        'SPS%dFFT%d' %
                        (self.__INPUT_SAMPLING_RATE, self.__N_SAMPLES_WINDOW)))
                main_group.attrs.create(
                    np.string_('SAMPLE_RATE'),
                    np.string_(self.__INPUT_SAMPLING_RATE))
                main_group.attrs.create(
                    np.string_('FFT_SIZE'),
                    np.string_(self.__N_SAMPLES_WINDOW))

                # The most recently loaded noise is kept so that consecutive
                # combinations using the same noise id do not reload it.
                noise = None
                noise_info = None
                loaded_noise_id = None

                for file_combination in file_combinations:
                    try:
                        logging.info('Loading data')
                        clean_info = self.__db.audioBookGetById(
                            file_combination[1])
                        clean = self.load_audio(clean_info[0][9],
                                                normalized=False)

                        noise_id = file_combination[2]
                        if noise is None or noise_id != loaded_noise_id:
                            noise_info = self.__db.noiseGetById(noise_id)
                            noise = self.load_audio(noise_info[0][3],
                                                    normalized=False)
                            loaded_noise_id = noise_id

                        if clean.duration_seconds > noise.duration_seconds:
                            logging.info(
                                'Clipping clean audio to fit noise audio duration'
                            )
                            # AudioSegment slices are indexed in milliseconds
                            # and len() returns milliseconds; slicing with
                            # duration_seconds would keep only a few ms.
                            clean = clean[:len(noise)]

                        logging.info('Overlaying noise and clean audios')
                        dirty = clean.overlay(noise)

                        clean_samples = np.array(
                            clean.get_array_of_samples(), dtype=np.float32)
                        clean_sampling_rate = clean.frame_rate
                        dirty_samples = np.array(
                            dirty.get_array_of_samples(), dtype=np.float32)
                        dirty_sampling_rate = dirty.frame_rate

                        logging.info('Processing data')
                        (dirty_freq, dirty_time,
                         dirty_db, dirty_phase) = self.__prepateInput(
                             dirty_samples, dirty_sampling_rate)
                        (clean_freq, clean_time,
                         clean_db, clean_phase) = self.__prepateInput(
                             clean_samples, clean_sampling_rate)

                        logging.info('Storing data')
                        self.__store_h5_data(
                            main_group, file_combination,
                            clean_info[0], noise_info[0],
                            clean_freq, clean_time, clean_db, clean_phase,
                            dirty_freq, dirty_time, dirty_db, dirty_phase)
                    except ResamplingError as e:
                        # A track that cannot be decimated by an integer
                        # factor is skipped; the remaining combinations are
                        # still processed.
                        logging.warning(str(e), exc_info=True)
        except Exception as e:
            logging.error(str(e), exc_info=True)
            raise

    def __resample(self, input_signal, input_sampling_rate):
        """Decimate ``input_signal`` down to the model sampling rate.

        Args:
            input_signal: Samples to decimate.
            input_sampling_rate: Sampling rate of ``input_signal``.

        Returns:
            The result of ``decimate(input_signal, factor)``.

        Raises:
            ResamplingError: If ``input_sampling_rate`` is not an integer
                multiple of the model sampling rate.
        """
        if input_sampling_rate % self.__INPUT_SAMPLING_RATE:
            raise ResamplingError(
                'Downsampling factor is not integer number\n'
                '\tInput sampling rate: %d\n' % input_sampling_rate +
                '\tTarget sampling rate: %d\n' % self.__INPUT_SAMPLING_RATE)

        factor = input_sampling_rate / self.__INPUT_SAMPLING_RATE
        # Uses the logging module directly, like every other call in this
        # class.
        logging.info(
            'Input sampling rate is different from the expected by the model.\n'
            + '\rInput sampling rate: ' + str(input_sampling_rate) + '\n' +
            '\rModel sampling rate: ' + str(self.__INPUT_SAMPLING_RATE) +
            '\n' + 'Resampling input signal by factor: ' + str(factor))

        # The modulo check above guarantees the ratio is a whole number.
        in_signal = decimate(input_signal, int(factor))
        return in_signal

    def __prepateInput(self, input_signal, sampling_rate):
        """Compute the spectrogram representation of a signal.

        The signal is first resampled when ``sampling_rate`` differs from the
        model sampling rate.

        Args:
            input_signal: Samples of the signal.
            sampling_rate: Sampling rate of ``input_signal``.

        Returns:
            A list ``[freq, time, db_values, phase]`` where ``freq`` and
            ``time`` come from ``spectrogram``, ``db_values`` is the
            transposed ``amplitude_to_db`` of the magnitude with a new axis
            inserted at position 1, and ``phase`` is ``np.angle`` of the
            complex spectrogram.

        Raises:
            ResamplingError: Propagated from the resampling step.
        """
        if sampling_rate != self.__INPUT_SAMPLING_RATE:
            input_signal = self.__resample(input_signal, sampling_rate)

        # nperseg and detrend are intentionally not passed, so the library
        # defaults apply.
        freq, time, stft = spectrogram(
            input_signal,
            fs=self.__INPUT_SAMPLING_RATE,
            window=get_window(self.__WINDOW, self.__N_SAMPLES_WINDOW),
            noverlap=self.__N_SAMPLES_OVERLAP,
            nfft=self.__N_SAMPLES_WINDOW,
            return_onesided=True,
            scaling='spectrum',
            axis=-1,
            mode='complex')

        db_values = amplitude_to_db(np.abs(stft))
        db_values = np.transpose(db_values)[:, np.newaxis, :]
        phase = np.angle(stft)
        return [freq, time, db_values, phase]

    def __store_spectrogram_group(self, parent_group, group_name, freq, time,
                                  db, phase):
        """Write one spectrogram (datasets plus FFT attributes) as a sub-group."""
        group = parent_group.create_group(group_name)
        group.create_dataset('FREQ', data=freq)
        group.create_dataset('TIME', data=time)
        group.create_dataset('DB', data=db)
        group.create_dataset('PHASE', data=phase)
        group.attrs.create(np.string_('FFT@SIZE'),
                           np.int32(self.__N_SAMPLES_WINDOW))
        group.attrs.create(np.string_('FFT@N_SAMPLES_OVERLAP'),
                           np.int32(self.__N_SAMPLES_OVERLAP))
        group.attrs.create(np.string_('FFT@WINDOW'),
                           np.string_(self.__WINDOW))

    def __store_h5_data(self, main_group, file_combination, clean_info,
                        noise_info, clean_freq, clean_time, clean_db,
                        clean_phase, dirty_freq, dirty_time, dirty_db,
                        dirty_phase):
        """Store one combination (metadata and both spectrograms).

        A ``COMBINATION@ID_<id>`` group is created under ``main_group``. Its
        attributes describe the combination, the clean track and the noise;
        its ``CLEAN`` and ``DIRTY`` sub-groups hold the spectrogram datasets.

        Args:
            main_group: Parent group the combination group is created in.
            file_combination: Combination row; index 0 is used as its id.
            clean_info: Row describing the clean track (indexed by position).
            noise_info: Row describing the noise (indexed by position).
            clean_freq, clean_time, clean_db, clean_phase: Spectrogram data
                of the clean signal.
            dirty_freq, dirty_time, dirty_db, dirty_phase: Spectrogram data
                of the clean signal overlaid with noise.
        """
        combination_group = main_group.create_group(
            np.string_('COMBINATION@ID_%d' % file_combination[0]))
        attrs = combination_group.attrs

        attrs.create(np.string_('COMBINATION@ID'),
                     np.int32(file_combination[0]))
        attrs.create(np.string_('COMBINATION@SAMPLE_RATE'),
                     np.float64(self.__INPUT_SAMPLING_RATE))

        # Clean-track metadata. Name, author and language are stored as-is
        # (not coerced to bytes), unlike the other text fields.
        attrs.create(np.string_('CLEAN@ID'), np.int32(clean_info[0]))
        attrs.create(np.string_('CLEAN@BOOK_DUMMY_NAME'),
                     np.string_(clean_info[1]))
        attrs.create(np.string_('CLEAN@BOOK_NAME'), clean_info[2])
        attrs.create(np.string_('CLEAN@BOOK_AUTHOR'), clean_info[3])
        attrs.create(np.string_('CLEAN@BOOK_URL'), np.string_(clean_info[4]))
        attrs.create(np.string_('CLEAN@BOOK_LANGUAGE'), clean_info[5])
        attrs.create(np.string_('CLEAN@BOOK_N_TRACK'),
                     np.int32(clean_info[7]))
        attrs.create(np.string_('CLEAN@TRACK_NAME'),
                     np.string_(clean_info[8]))
        attrs.create(np.string_('CLEAN@TRACK_SAMPLE_RATE'),
                     np.float64(clean_info[11]))

        # Noise metadata.
        attrs.create(np.string_('NOISE@ID'), np.int32(noise_info[0]))
        attrs.create(np.string_('NOISE@NAME'), noise_info[1])
        attrs.create(np.string_('NOISE@URL'), np.string_(noise_info[2]))
        attrs.create(np.string_('NOISE@ORIGINAL_N_CHANNEL'),
                     np.int8(noise_info[4]))
        attrs.create(np.string_('NOISE@ORIGINAL_SAMPLE_RATE'),
                     np.float64(noise_info[5]))

        self.__store_spectrogram_group(combination_group, r'CLEAN',
                                       clean_freq, clean_time, clean_db,
                                       clean_phase)
        self.__store_spectrogram_group(combination_group, r'DIRTY',
                                       dirty_freq, dirty_time, dirty_db,
                                       dirty_phase)

    @staticmethod
    def load_audio(path, normalized=True):
        """Load an audio file as a single-channel ``AudioSegment``.

        Args:
            path: Path of the audio file; its extension (without the dot) is
                passed to ``AudioSegment.from_file`` as the format.
            normalized: When true, the result of ``effects.normalize`` is
                returned instead of the raw segment.

        Returns:
            The loaded segment, reduced to one channel when the file has a
            different channel count, and normalized if requested.
        """
        ext = os.path.splitext(path)[1][1:]
        logging.info('Loading audio ' + path + ' with file type ' + ext)
        rawSound = AudioSegment.from_file(path, ext)

        if rawSound.channels != 1:
            logging.info(
                'Audio contains more than one channel. Setting to single channel'
            )
            rawSound = rawSound.set_channels(1)

        if normalized:
            logging.info('Normalize audio')
            return effects.normalize(rawSound)
        return rawSound