def run(self):
    # First make output directories
    self.setupProcessedDataDirs()
    # Check if the processor needs to be run at all
    if os.path.exists(os.path.join(self.train_dir, "master_text_file.txt"))\
            and os.path.exists(os.path.join(self.test_dir, "master_text_file.txt")):
        print("No need to run data processor...")
        return
    # Next check which data folders are present
    self.data_dirs = self.checkWhichDataFoldersArePresent()
    if len(self.data_dirs) == 0:
        print("Something went wrong, no data detected, check data directory...")
        return
    # Get pairs of (audio_file_name, transcribed_text)
    print("Figuring out which files need to be processed...")
    audio_file_text_pairs, will_convert = self.getFileNameTextPairs()
    print("Using {0} files in total dataset...".format(len(audio_file_text_pairs)))
    # Shuffle pairs
    shuffle(audio_file_text_pairs)
    # Convert FLAC files to WAV if needed
    if will_convert:
        audio_processor = audioprocessor.AudioProcessor(1)
        for audio_file_name in audio_file_text_pairs:
            audio_processor.convertAndDeleteFLAC(
                audio_file_name[0].replace(".wav", ".flac"))
    return audio_file_text_pairs
def _read_audio_and_transcode_label(filename_label):
    # Need to convert back to string because tf.py_func changed it to a numpy array
    filename = str(filename_label[0], encoding='UTF-8')
    label = str(filename_label[1], encoding='UTF-8')
    audio_processor = audioprocessor.AudioProcessor(max_input_seq_length, signal_processing)
    audio_decoded, audio_length = audio_processor.process_audio_file(filename)
    label_transcoded = dataprocessor.DataProcessor.get_str_labels(char_map, label)
    return np.array(audio_decoded, dtype=np.float32), np.array(audio_length, dtype=np.int32),\
        np.array(label_transcoded, dtype=np.int32)
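The comment above refers to tf.py_func wrapping, which is why the reader receives its string arguments as numpy byte arrays. Below is a minimal sketch of how such a reader is typically wired into a tf.data pipeline; the helper name build_dataset and the batching are assumptions for illustration, not taken from the source.

import tensorflow as tf  # TF 1.x API assumed, since tf.py_func is used

def build_dataset(filename_label_pairs, batch_size):
    # Each element is a [filename, label] string pair; tf.py_func hands it to
    # the Python reader as a numpy array of bytes, hence the str() decoding above.
    dataset = tf.data.Dataset.from_tensor_slices(filename_label_pairs)
    dataset = dataset.map(
        lambda pair: tuple(tf.py_func(_read_audio_and_transcode_label,
                                      [pair],
                                      [tf.float32, tf.int32, tf.int32])))
    return dataset.batch(batch_size)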
def evaluate_full(self, sess, eval_dataset, input_seq_length, signal_processing, char_map,
                  run_options=None, run_metadata=None):
    # Create an audio_processor
    audio_processor = audioprocessor.AudioProcessor(input_seq_length, signal_processing)
    wer_list = []
    cer_list = []
    file_number = 0
    input_feat_vecs = []
    input_feat_vec_lengths = []
    labels = []

    for file, label, _ in eval_dataset:
        feat_vec, feat_vec_length = audio_processor.process_audio_file(file)
        file_number += 1
        label_data_length = len(label)
        if (label_data_length > self.max_target_seq_length) or\
                (feat_vec_length > self.max_input_seq_length):
            logging.warning("Warning - sample too long : %s (input : %d / text : %s)",
                            file, feat_vec_length, label_data_length)
        else:
            logging.debug("Processed file %d / %d", file_number, len(eval_dataset))
            input_feat_vecs.append(feat_vec)
            input_feat_vec_lengths.append(feat_vec_length)
            labels.append(label)

        # If we reached the last file then pad the lists to obtain a full batch
        if file_number == len(eval_dataset):
            for i in range(self.batch_size - len(input_feat_vecs)):
                input_feat_vecs.append(np.zeros([self.max_input_seq_length,
                                                 audio_processor.feature_size]))
                input_feat_vec_lengths.append(0)
                labels.append("")

        if len(input_feat_vecs) == self.batch_size:
            # Run the batch
            logging.debug("Running a batch")
            input_feat_vecs = np.swapaxes(input_feat_vecs, 0, 1)
            predictions = self.process_input(sess, input_feat_vecs, input_feat_vec_lengths,
                                             run_options=run_options, run_metadata=run_metadata)
            for index, prediction in enumerate(predictions):
                transcribed_text = dataprocessor.DataProcessor.get_labels_str(char_map, prediction)
                true_label = labels[index]
                if len(true_label) > 0:
                    nb_words = len(true_label.split())
                    nb_chars = len(true_label.replace(" ", ""))
                    wer_list.append(self.calculate_wer(transcribed_text, true_label) / float(nb_words))
                    cer_list.append(self.calculate_cer(transcribed_text, true_label) / float(nb_chars))
            # Reset the lists
            input_feat_vecs = []
            input_feat_vec_lengths = []
            labels = []

    wer = (sum(wer_list) * 100) / float(len(wer_list))
    cer = (sum(cer_list) * 100) / float(len(cer_list))
    return wer, cer
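calculate_wer and calculate_cer are not defined in this excerpt; a standard choice, sketched below as an assumption, is word-level and character-level Levenshtein distance, which evaluate_full then normalizes by the reference word and character counts.

def _levenshtein(seq_a, seq_b):
    # Classic dynamic-programming edit distance between two sequences
    previous_row = list(range(len(seq_b) + 1))
    for i, item_a in enumerate(seq_a, 1):
        current_row = [i]
        for j, item_b in enumerate(seq_b, 1):
            current_row.append(min(previous_row[j] + 1,       # deletion
                                   current_row[j - 1] + 1,    # insertion
                                   previous_row[j - 1] + (item_a != item_b)))  # substitution
        previous_row = current_row
    return previous_row[-1]

def calculate_wer(self, transcribed_text, true_label):
    # Word-level edit distance; evaluate_full divides it by the word count
    return _levenshtein(transcribed_text.split(), true_label.split())

def calculate_cer(self, transcribed_text, true_label):
    # Character-level edit distance on the space-stripped strings
    return _levenshtein(transcribed_text.replace(" ", ""), true_label.replace(" ", ""))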
def main(): prog_params = parse_args() serializer = hyperparams.HyperParameterHandler(prog_params['config_file']) hyper_params = serializer.getHyperParams() audio_processor = audioprocessor.AudioProcessor( hyper_params["max_input_seq_length"], hyper_params["load_save_input_vec"]) if prog_params['train'] is True: train_rnn(hyper_params, prog_params) else: process_file(audio_processor, hyper_params, prog_params['file'])
def main(): prog_params = parse_args() serializer = hyperparams.HyperParameterHandler(prog_params['config_file']) hyper_params = serializer.getHyperParams() audio_processor = audioprocessor.AudioProcessor( hyper_params["max_input_seq_length"], hyper_params["signal_processing"]) # Get the input dimension for the RNN, depend on the chosen signal processing mode hyper_params["input_dim"] = audio_processor.feature_size if prog_params['train'] is True: train_rnn(audio_processor, hyper_params, prog_params) elif prog_params['file'] is not None: process_file(audio_processor, hyper_params, prog_params['file']) elif prog_params['record'] is True: record_and_write(audio_processor, hyper_params)
def main():
    prog_params = parse_args()
    serializer = hyperparams.HyperParameterHandler(prog_params['config_file'])
    hyper_params = serializer.get_hyper_params()
    audio_processor = audioprocessor.AudioProcessor(
        hyper_params["max_input_seq_length"],
        hyper_params["signal_processing"])
    # Get the input dimension for the RNN; it depends on the chosen signal processing mode
    hyper_params["input_dim"] = audio_processor.feature_size
    speech_reco = SpeechRecognizer(hyper_params["language"])
    hyper_params["char_map"] = speech_reco.get_char_map()
    hyper_params["char_map_length"] = speech_reco.get_char_map_length()

    if prog_params['start_ps'] is True:
        start_ps_server(prog_params)

    if (prog_params['train_acoustic'] is True) or (prog_params['dtrain_acoustic'] is True):
        ordered = hyper_params["dataset_size_ordering"] in ['True', 'First_run_only']
        train_set, test_set = speech_reco.load_acoustic_dataset(
            hyper_params["training_dataset_dirs"],
            hyper_params["test_dataset_dirs"],
            hyper_params["training_filelist_cache"],
            ordered,
            hyper_params["train_frac"])
        if prog_params['train_acoustic'] is True:
            train_acoustic_rnn(train_set, test_set, hyper_params, prog_params)
        else:
            distributed_train_acoustic_rnn(train_set, test_set, hyper_params, prog_params)
    elif prog_params['train_language'] is True:
        train_set, test_set = load_language_dataset(hyper_params)
        train_language_rnn(train_set, test_set, hyper_params, prog_params)
    elif prog_params['file'] is not None:
        process_file(audio_processor, hyper_params, prog_params['file'])
    elif prog_params['record'] is True:
        record_and_write(audio_processor, hyper_params)
    elif prog_params['evaluate'] is True:
        evaluate(hyper_params)
    elif prog_params['generate_text'] is True:
        generate_text(hyper_params)
def run(self):
    if self.data_type == "Shtooka":
        audio_file_text_pairs, will_convert = self.getFileNameTextPairs_Shtooka(
            self.raw_data_path)
    elif self.data_type == "LibriSpeech":
        # Check which data folders are present
        data_dirs = self.checkWhichDataFoldersArePresent()
        if len(data_dirs) == 0:
            raise Exception("ERROR : something went wrong, no data detected, "
                            "check data directory.")
        # Get pairs of (audio_file_name, transcribed_text)
        audio_file_text_pairs, will_convert = self.getFileNameTextPairs_LibriSpeech(data_dirs)
    else:
        raise Exception("ERROR : unknown training_dataset_type")

    # Check that there is data
    if len(audio_file_text_pairs) == 0:
        raise Exception("ERROR : no data found in directory {0}".format(self.raw_data_path))

    # Shuffle pairs
    shuffle(audio_file_text_pairs)

    if will_convert:
        audio_file_text_pairs_final = []
        audio_processor = audioprocessor.AudioProcessor(1)
        for audio_file_name in audio_file_text_pairs:
            if audio_file_name[0].endswith(".flac"):
                audio_processor.convertAndDeleteFLAC(audio_file_name[0])
                audio_file_text_pairs_final.append(
                    (audio_file_name[0].replace(".flac", ".wav"), audio_file_name[1]))
            else:
                audio_file_text_pairs_final.append((audio_file_name[0], audio_file_name[1]))
    else:
        audio_file_text_pairs_final = audio_file_text_pairs
    return audio_file_text_pairs_final
def initializeAudioProcessor(self, max_input_seq_length, load_save_input_vec):
    self.audio_processor = audioprocessor.AudioProcessor(max_input_seq_length,
                                                         load_save_input_vec)
def initializeAudioProcessor(self, max_input_seq_length):
    self.audio_processor = audioprocessor.AudioProcessor(max_input_seq_length)
def setUp(self):
    self.audio_processor = audioprocessor.AudioProcessor(1000)
    # Create a temp dir for testing purposes
    cwd = os.getcwd()
    self.directory = cwd + "/test_directory/"
    if not os.path.exists(self.directory):
        os.makedirs(self.directory)
    else:
        # The test directory already exists, throw an error
        raise Exception('test_directory already exists')

    # Setup LibriSpeech files
    os.makedirs(self.directory + "Libri/")
    os.makedirs(self.directory + "Libri/train-clean-100/")
    os.makedirs(self.directory + "Libri/train-clean-100/" + "19/")
    os.makedirs(self.directory + "Libri/train-clean-100/" + "19/" + "198/")
    text_file = self.directory + "Libri/train-clean-100/19/198/19-198.trans.txt"
    with open(text_file, "w") as f:
        f.write("19-198-0000 NORTHANGER ABBEY\n")
        f.write("19-198-0001 THIS LITTLE WORK...\n")
        f.write("19-198-0002 NEITHER THE...\n")
    # Create empty audio files
    open(self.directory + "Libri/train-clean-100/19/198/19-198-0000.flac", 'a').close()
    open(self.directory + "Libri/train-clean-100/19/198/19-198-0001.flac", 'a').close()

    # Setup Shtooka files
    os.makedirs(self.directory + "Shtooka/")
    os.makedirs(self.directory + "Shtooka/flac/")
    text_file = self.directory + "Shtooka/flac/index.tags.txt"
    with open(text_file, "w") as f:
        f.write("\\Swac_Index_Tags\n\n")
        f.write("[GLOBAL]\n")
        f.write("SWAC_LANG = eng\n")
        f.write("SWAC_SPEAK_LANG = eng\n\n")
        f.write("[eng - I_arose.flac]\n")
        f.write("SWAC_TEXT = I arose\n")
        f.write("SWAC_ALPHAIDX = arise\n")
        f.write("SWAC_BASEFORM = arise\n")
        f.write("SWAC_FORM_NAME = Simple Past\n\n")
        f.write("[eng - I_ate.flac]\n")
        f.write("SWAC_TEXT = I ate\n")
        f.write("SWAC_ALPHAIDX = eat\n")
        f.write("SWAC_BASEFORM = eat\n")
        f.write("SWAC_FORM_NAME = Simple Past\n\n")
        f.write("[eng - I_awoke.flac]\n")
        f.write("SWAC_TEXT=I awoke\n")
        f.write("SWAC_ALPHAIDX=awake\n")
        f.write("SWAC_BASEFORM=awake\n")
        f.write("SWAC_FORM_NAME=Simple Past\n")
    # Create empty audio files
    open(self.directory + "Shtooka/flac/eng - I_arose.flac", 'a').close()
    open(self.directory + "Shtooka/flac/eng - I_ate.flac", 'a').close()

    # Setup Vystadial files
    os.makedirs(self.directory + "Vystadial_2013/")
    os.makedirs(self.directory + "Vystadial_2013/data_voip_en/")
    os.makedirs(self.directory + "Vystadial_2013/data_voip_en/dev/")
    text_file = self.directory + \
        "Vystadial_2013/data_voip_en/dev/jurcic-028-121024_234433_0013625_0013836.wav.trn"
    with open(text_file, "w") as f:
        f.write("ALRIGHT THANK YOU AND GOODBYE\n")
    text_file = self.directory + \
        "Vystadial_2013/data_voip_en/dev/jurcic-028-121015_000550_0026689_0027040.wav.trn"
    with open(text_file, "w") as f:
        f.write("FILE WITH NO AUDIO...\n")
    # Create empty audio file
    open(self.directory +
         "Vystadial_2013/data_voip_en/dev/jurcic-028-121024_234433_0013625_0013836.wav",
         'a').close()

    # Setup TEDLIUM files
    os.makedirs(self.directory + "TEDLIUM/")
    os.makedirs(self.directory + "TEDLIUM/test/")
    os.makedirs(self.directory + "TEDLIUM/test/stm/")
    text_file = self.directory + "TEDLIUM/test/stm/AimeeMullins_2009P.stm"
    with open(text_file, "w") as f:
        f.write("AimeeMullins_2009P 1 inter_segment_gap 0 17.82 <o,,unknown> "
                "ignore_time_segment_in_scoring\n")
        f.write("AimeeMullins_2009P 1 AimeeMullins 17.82 28.81 <o,f0,female> "
                "i 'd like to share ...\n")
    # Create empty audio file
    os.makedirs(self.directory + "TEDLIUM/test/sph/")
    open(self.directory + "TEDLIUM/test/sph/AimeeMullins_2009P.sph", 'a').close()
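A setUp like this one usually pairs with a tearDown that removes the temporary tree; otherwise the 'test_directory already exists' guard above fires on the next run. The sketch below is an assumed companion, not taken from the source.

import shutil

def tearDown(self):
    # Remove the whole test tree created by setUp so the next run starts clean
    shutil.rmtree(self.directory, ignore_errors=True)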
def main():
    all_params, prog_params = parse_args()
    serializer = hyperparams.HyperParameterHandler(prog_params['config_file'],
                                                   checkpoint_dir=prog_params['train_dir'],
                                                   program_params=all_params)
    hyper_params = serializer.get_hyper_params()
    audio_processor = audioprocessor.AudioProcessor(hyper_params["max_input_seq_length"],
                                                    hyper_params["signal_processing"])
    # Get the input dimension for the RNN; it depends on the chosen signal processing mode
    hyper_params["input_dim"] = audio_processor.feature_size
    speech_reco = SpeechRecognizer(hyper_params["language"])
    hyper_params["char_map"] = speech_reco.get_char_map()
    hyper_params["char_map_length"] = speech_reco.get_char_map_length()

    if prog_params['start_ps'] is True:
        start_ps_server(prog_params)

    if prog_params['save_acoustic'] is True:
        ordered = hyper_params["dataset_size_ordering"] in ['True', 'First_run_only']
        train_set, test_set = speech_reco.load_acoustic_dataset(
            hyper_params["training_dataset_dirs"],
            hyper_params["test_dataset_dirs"],
            hyper_params["training_filelist_cache"],
            ordered,
            hyper_params["train_frac"])
        logging.info("Save datasets...")
        save_acoustic_rnn(train_set, "train", hyper_params, prog_params)
        save_acoustic_rnn(test_set, "test", hyper_params, prog_params)
        kl = client.Client()
        kl.datasets.push(os.environ.get('WORKSPACE_NAME'),
                         'librispeech-dev',
                         '1.0.' + os.environ.get('BUILD_ID') + '-tfrecords',
                         prog_params["train_dir"],
                         create=True)
    elif (prog_params['train_acoustic'] is True) or (prog_params['dtrain_acoustic'] is True):
        ordered = hyper_params["dataset_size_ordering"] in ['True', 'First_run_only']
        if prog_params['train_set'] is not None:
            train_set = prog_params['train_set']
            test_set = prog_params['test_set']
        else:
            train_set, test_set = speech_reco.load_acoustic_dataset(
                hyper_params["training_dataset_dirs"],
                hyper_params["test_dataset_dirs"],
                hyper_params["training_filelist_cache"],
                ordered,
                hyper_params["train_frac"])
        if prog_params['train_acoustic'] is True:
            train_acoustic_rnn(train_set, test_set, hyper_params, prog_params)
        else:
            distributed_train_acoustic_rnn(train_set, test_set, hyper_params, prog_params)
    elif prog_params['train_language'] is True:
        train_set, test_set = load_language_dataset(hyper_params)
        train_language_rnn(train_set, test_set, hyper_params, prog_params)
    elif prog_params['file'] is not None:
        process_file(audio_processor, hyper_params, prog_params['file'])
    elif prog_params['record'] is True:
        record_and_write(audio_processor, hyper_params)
    elif prog_params['evaluate'] is True:
        evaluate(hyper_params)
    elif prog_params['generate_text'] is True:
        generate_text(hyper_params)