def recognize(model: tf.keras.Model, audio_feature_type: str, start_sign: str,
              unk_sign: str, end_sign: str, record_path: str, max_length: int,
              dict_path: str):
    """Run an interactive record-and-recognize loop with greedy CTC decoding.

    Every round asks for a recording duration in seconds:

    * a negative value ends the loop,
    * ``0`` prompts for the path of an audio file to recognize,
    * a positive value records a new clip through ``record`` into a
      timestamped ``.wav`` file under ``record_path``.

    The audio is converted with ``wav_to_feature``, cut and padded to
    ``max_length`` frames, passed through ``model`` and decoded with
    ``tf.keras.backend.ctc_decode(greedy=True)``. The decoded text, with the
    start/end markers and spaces removed, is printed.

    :param model: model called on the padded feature batch
    :param audio_feature_type: feature type forwarded to ``wav_to_feature``
    :param start_sign: start marker removed from the decoded text
    :param unk_sign: unknown-word marker (accepted but not used here)
    :param end_sign: end marker removed from the decoded text
    :param record_path: directory in which new recordings are stored
    :param max_length: number of frames the audio features are cut/padded to
    :param dict_path: path of the saved tokenizer dictionary
    :return: None
    """
    # Loaded on first use and reused afterwards instead of once per round.
    tokenizer = None

    while True:
        try:
            record_duration = int(input("请设定录音时长(秒, 负数结束,0则继续输入音频路径):"))
        except ValueError:
            # Only a non-integer answer is retried; Ctrl-C and EOF propagate
            # instead of trapping the user in an endless prompt loop.
            print("录音时长只能为int数值")
            continue

        if record_duration < 0:
            break

        if not os.path.exists(record_path):
            os.makedirs(record_path)

        # Keep the per-round file path in its own variable: overwriting
        # ``record_path`` would make later rounds build paths on top of the
        # previous file name.
        if record_duration == 0:
            audio_path = input("请输入音频路径:")
        else:
            audio_path = os.path.join(
                record_path, time.strftime("%Y_%m_%d_%H_%M_%S_") + ".wav")
            record(audio_path, record_duration)

        # Load the audio, then cut and pad it to a fixed number of frames.
        audio_feature = wav_to_feature(audio_path, audio_feature_type)
        audio_feature = audio_feature[:max_length, :]
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            [audio_feature], padding='post', maxlen=max_length, dtype='float32')

        predictions = model(input_tensor)
        ctc_input_length = compute_ctc_input_length(
            input_tensor.shape[1], predictions.shape[1],
            tf.convert_to_tensor([[len(audio_feature)]]))

        output = tf.keras.backend.ctc_decode(
            y_pred=predictions,
            input_length=tf.reshape(ctc_input_length, [ctc_input_length.shape[0]]),
            greedy=True)

        if tokenizer is None:
            tokenizer = load_tokenizer(dict_path=dict_path)

        sentence = tokenizer.sequences_to_texts(output[0][0].numpy())
        sentence = sentence[0].replace(start_sign, '').replace(end_sign, '').replace(' ', '')
        print("Output:", sentence)
def recognize(encoder: tf.keras.Model, decoder: tf.keras.Model, beam_size: int,
              audio_feature_type: str, max_length: int, max_sentence_length: int,
              dict_path: str):
    """Interactively recognize audio files with an encoder/decoder and beam search.

    The function prompts for audio file paths until the user types ``ESC``,
    at which point it calls ``exit(0)``; it therefore never returns normally.
    For every existing file the features are padded to ``max_length``, the
    decoder is run for at most ``max_sentence_length`` steps while a
    ``BeamSearch`` container tracks the candidates, and the top three
    candidates are printed, each wrapped in ``<...>``.

    :param encoder: encoder part of the model
    :param decoder: decoder part of the model
    :param beam_size: beam width handed to ``BeamSearch``
    :param audio_feature_type: feature type forwarded to ``wav_to_feature``
    :param max_length: length the audio features are padded to
    :param max_sentence_length: maximum number of decoding steps
    :param dict_path: path of the tokenizer JSON file
    """
    beam_search_container = BeamSearch(
        beam_size=beam_size, max_length=max_sentence_length, worst_score=0)

    # The dictionary does not change between files, so read and parse it once
    # instead of on every loop iteration.
    with open(dict_path, 'r', encoding='utf-8') as dict_file:
        json_string = dict_file.read().strip().strip("\n")
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json_string)

    word_index = tokenizer.word_index
    # Fall back to the *id* of "<unk>"; the literal string "<unk>" is not a
    # valid token id for the decoder input.
    start_id = word_index.get("<start>", word_index.get("<unk>"))
    end_id = word_index.get("<end>")

    print("Agent: 你好!结束识别请输入ESC。")
    while True:
        path = input("Path: ")
        if path == "ESC":
            print("Agent: 再见!")
            exit(0)

        if not os.path.exists(path):
            print("音频文件不存在,请重新输入")
            continue

        audio_feature = wav_to_feature(path, audio_feature_type)
        audio_feature = tf.expand_dims(audio_feature, axis=0)
        audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
            audio_feature, maxlen=max_length, dtype="float32", padding="post")

        dec_input = tf.expand_dims([start_id], 0)
        beam_search_container.reset(inputs=audio_feature, dec_input=dec_input)

        for _ in range(max_sentence_length):
            enc_outputs, padding_mask = encoder(audio_feature)
            sentence_predictions = decoder(
                inputs=[dec_input, enc_outputs, padding_mask])
            sentence_predictions = tf.nn.softmax(sentence_predictions)
            # Only the distribution of the last position feeds the beam search.
            sentence_predictions = sentence_predictions[:, -1, :]
            beam_search_container.expand(
                predictions=sentence_predictions, end_sign=end_id)
            # The container reports a beam size of 0 when nothing is left to expand.
            if beam_search_container.beam_size == 0:
                break
            audio_feature, dec_input = beam_search_container.get_search_inputs()

        beam_search_result = beam_search_container.get_result(top_k=3)
        result = ''
        # Pull the sequences out of the container and build the final output;
        # each candidate is prepended to the string.
        for candidate in beam_search_result:
            text = tokenizer.sequences_to_texts(candidate.numpy())[0]
            text = text.replace("<start>", '').replace("<end>", '').replace(' ', '')
            result = '<' + text + '>' + result

        print("识别句子为:{}".format(result))
def preprocess_thchs30_speech_raw_data(data_path: str, dataset_infos_file: str, max_time_step: int,
                                       spectrum_data_dir: str, max_sentence_length: int, vocab_size: int,
                                       audio_feature_type: str = "mfcc", save_length_path: str = "",
                                       is_train: bool = True, transcript_row: int = 0,
                                       start_sign: str = "<start>", dict_path: str = "",
                                       end_sign: str = "<end>", unk_sign: str = "<unk>",
                                       max_treat_data_size: int = 0):
    """Preprocess the thchs30 dataset into <feature file, token file> pairs.

    For every ``.wav`` file in ``data_path`` the padded audio features are
    saved as ``.npy`` under ``spectrum_data_dir`` and one line
    ``<feature file>\\t<text token file>`` is written to
    ``dataset_infos_file``. The collected sentences, token file paths and
    length pairs are then handed to ``_treat_sentence_and_length``.

    :param data_path: directory holding the ``.wav`` / ``.wav.trn`` files
    :param dataset_infos_file: output file listing the processed pairs
    :param max_time_step: length the audio features are padded to
    :param spectrum_data_dir: directory receiving the saved feature files
    :param max_sentence_length: maximum text sequence length
    :param vocab_size: vocabulary size
    :param audio_feature_type: feature type forwarded to ``wav_to_feature``
    :param save_length_path: path of the file storing sample lengths
    :param is_train: whether training data is being processed
    :param transcript_row: transcript line to use (first line characters,
        second line pinyin, third line syllables)
    :param start_sign: sentence start marker
    :param dict_path: dictionary path, not needed when using phonemes
    :param end_sign: sentence end marker
    :param unk_sign: unknown-word marker
    :param max_treat_data_size: maximum number of samples to process,
        ``0`` means all of them
    :return: None
    """
    _check_and_create_file(data_path, spectrum_data_dir)

    count = 0
    len_list = []
    text_list = []
    text_file_path_list = []
    data_list = os.listdir(data_path)
    # The real transcripts live in "<...30>/data"; build the path with
    # os.path.join so it also works where "\\" is not the separator.
    transcript_dir = os.path.join(data_path[:data_path.find("30")] + "30", "data")

    with open(dataset_infos_file, 'w', encoding='utf-8') as ds_infos_file:
        for data_name in data_list:
            if os.path.splitext(data_name)[1] != ".wav":
                continue

            # Audio file and the text file that belongs to it.
            audio_path = os.path.join(data_path, data_name)
            text_file_name = os.path.join(data_path, data_name + ".trn")
            if not os.path.exists(text_file_name):
                print("{}文本数据不完整,请检查后重试".format(text_file_name))
                exit(0)

            # The local .trn only points at the real transcript: its first
            # line minus the first 8 characters is the file name
            # (presumably a "../data/" prefix — TODO confirm).
            with open(text_file_name, 'r', encoding='utf-8') as text_file:
                transcript_name = text_file.readlines()[0].strip().strip("\n")[8:]
            with open(os.path.join(transcript_dir, transcript_name), 'r',
                      encoding='utf-8') as text_file:
                text = text_file.readlines()[transcript_row].strip()

            text = start_sign + " " + text + " " + end_sign
            text_list.append(text)

            audio_feature_file = os.path.join(spectrum_data_dir, data_name + ".npy")
            text_token_file = os.path.join(spectrum_data_dir, data_name + "text.npy")
            text_file_path_list.append(text_token_file)

            audio_feature = wav_to_feature(audio_path, audio_feature_type)
            # Length pair: (number of text tokens, number of audio frames).
            # The frame count is capped at max_time_step, the length the
            # features are padded/truncated to (not at vocab_size).
            len_pair = [len(text.split(" ")),
                        min(audio_feature.shape[0], max_time_step)]

            audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
                [audio_feature], maxlen=max_time_step, dtype="float32", padding="post")
            audio_feature = tf.squeeze(audio_feature, axis=0)

            np.save(file=audio_feature_file, arr=audio_feature)
            ds_infos_file.write(audio_feature_file + '\t' + text_token_file + "\n")

            len_list.append(len_pair)
            count += 1
            print('\r已处理并写入音频条数:{}'.format(count), flush=True, end='')
            if max_treat_data_size == count:
                break

    _treat_sentence_and_length(text_list, text_file_path_list, len_list, max_sentence_length,
                               vocab_size, save_length_path, is_train, dict_path, unk_sign)

    print("\n数据处理完毕,共计{}对语音句子数据".format(count))
def preprocess_librispeech_speech_raw_data(data_path: str, dataset_infos_file: str, max_time_step: int,
                                           spectrum_data_dir: str, max_sentence_length: int,
                                           vocab_size: int, save_length_path: str = "",
                                           start_sign: str = "<start>", end_sign: str = "<end>",
                                           unk_sign: str = "<unk>", dict_path: str = "",
                                           is_train: bool = True, audio_feature_type: str = "mfcc",
                                           max_treat_data_size: int = 0):
    """Preprocess the librispeech dataset into <feature file, token file> pairs.

    The dataset is walked as ``data_path/<first>/<second>/`` where each leaf
    directory holds ``<first>-<second>.trans.txt`` and the ``.flac`` files it
    lists. For every transcript line the padded audio features are saved as
    ``.npy`` under ``spectrum_data_dir`` and one line
    ``<feature file>\\t<text token file>`` is written to
    ``dataset_infos_file``. The collected sentences, token file paths and
    length pairs are then handed to ``_treat_sentence_and_length``.

    :param data_path: root directory of the dataset
    :param dataset_infos_file: output file listing the processed pairs
    :param max_time_step: length the audio features are padded to
    :param spectrum_data_dir: directory receiving the saved feature files
    :param max_sentence_length: maximum text sequence length
    :param vocab_size: vocabulary size
    :param save_length_path: path of the file storing sample lengths
    :param start_sign: sentence start marker
    :param end_sign: sentence end marker
    :param unk_sign: unknown-word marker
    :param dict_path: dictionary path, not needed when using phonemes
    :param is_train: whether training data is being processed
    :param audio_feature_type: feature type forwarded to ``wav_to_feature``
    :param max_treat_data_size: maximum number of samples to process,
        ``0`` means all of them
    :return: None
    """
    _check_and_create_file(data_path, spectrum_data_dir)

    count = 0
    len_list = []
    text_list = []
    text_file_path_list = []
    # A bare ``break`` only leaves the innermost loop, so the sample limit is
    # propagated outwards through this flag.
    limit_reached = False

    with open(dataset_infos_file, 'w', encoding='utf-8') as ds_infos_file:
        for first_folder in os.listdir(data_path):
            first_dir = os.path.join(data_path, first_folder)
            if not os.path.isdir(first_dir):
                # Skip stray files that sit next to the dataset folders.
                continue

            for second_folder in os.listdir(first_dir):
                second_dir = os.path.join(first_dir, second_folder)
                if not os.path.isdir(second_dir):
                    continue

                trans_path = os.path.join(
                    second_dir, first_folder + "-" + second_folder + ".trans.txt")
                with open(trans_path, "r", encoding="utf-8") as trans_file:
                    for line in trans_file:
                        line = line.strip("\n").strip()
                        if line == "":
                            continue

                        # "<utterance id> <transcript>"
                        parts = line.split(" ", 1)
                        utterance_id = parts[0]
                        audio_path = os.path.join(second_dir, utterance_id + ".flac")
                        audio_feature_file = os.path.join(
                            spectrum_data_dir, utterance_id + ".npy")
                        text_token_file = os.path.join(
                            spectrum_data_dir, utterance_id + "text.npy")
                        text_file_path_list.append(text_token_file)

                        text = start_sign + " " + parts[1].lower() + " " + end_sign
                        text_list.append(text)

                        audio_feature = wav_to_feature(audio_path, audio_feature_type)
                        # Length pair: (number of text tokens, number of
                        # audio frames). The frame count is capped at
                        # max_time_step, the padded length (not vocab_size).
                        len_pair = [len(text.split(" ")),
                                    min(audio_feature.shape[0], max_time_step)]

                        audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
                            [audio_feature], maxlen=max_time_step,
                            dtype="float32", padding="post")
                        audio_feature = tf.squeeze(audio_feature, axis=0)

                        np.save(file=audio_feature_file, arr=audio_feature)
                        ds_infos_file.write(audio_feature_file + "\t" + text_token_file + "\n")

                        count += 1
                        len_list.append(len_pair)
                        print('\r已处理并写入音频条数:{}'.format(count), flush=True, end='')
                        if max_treat_data_size == count:
                            limit_reached = True
                            break

                if limit_reached:
                    break
            if limit_reached:
                break

    _treat_sentence_and_length(text_list, text_file_path_list, len_list, max_sentence_length,
                               vocab_size, save_length_path, is_train, dict_path, unk_sign)

    print("\n数据处理完毕,共计{}对语音句子数据".format(count))
def recognize(model: tf.keras.Model, audio_feature_type: str, start_sign: str,
              unk_sign: str, end_sign: str, w: int, beam_size: int, record_path: str,
              max_length: int, max_sentence_length: int, dict_path: str):
    """Run an interactive record-and-recognize loop with beam-search decoding.

    Every round asks for a recording duration in seconds:

    * a negative value ends the loop,
    * ``0`` prompts for the path of an audio file to recognize,
    * a positive value records a new clip through ``record`` into a
      timestamped ``.wav`` file under ``record_path``.

    The audio features are cut and padded to ``max_length`` frames, the model
    is stepped for at most ``max_sentence_length - 1`` iterations while a
    ``BeamSearch`` container tracks the candidates, and the top three
    candidates are printed, each wrapped in ``<...>``.

    :param model: model called as ``model(inputs, enc_hidden, decoder_input)``
    :param audio_feature_type: feature type forwarded to ``wav_to_feature``
    :param start_sign: start marker (first decoder token, removed from output)
    :param unk_sign: unknown-word marker (accepted but not used here)
    :param end_sign: end marker (stops a beam, removed from output)
    :param w: number of BiLSTM units, used as width of the initial hidden state
    :param beam_size: beam width handed to ``BeamSearch``
    :param record_path: directory in which new recordings are stored
    :param max_length: number of frames the audio features are cut/padded to
    :param max_sentence_length: maximum sentence length
    :param dict_path: path of the saved tokenizer dictionary
    :return: None
    """
    tokenizer = load_tokenizer(dict_path=dict_path)
    enc_hidden = tf.zeros((1, w))
    # Use the configured markers for both ends; the start token was previously
    # looked up with a hard-coded '<start>' while the end token used end_sign.
    dec_input = tf.expand_dims([tokenizer.word_index.get(start_sign)], 1)
    end_id = tokenizer.word_index.get(end_sign)
    beam_search = BeamSearch(beam_size=beam_size, max_length=max_sentence_length, worst_score=0)

    while True:
        try:
            record_duration = int(input("请设定录音时长(秒, 负数结束,0则继续输入音频路径):"))
        except ValueError:
            # Only a non-integer answer is retried; Ctrl-C and EOF propagate
            # instead of trapping the user in an endless prompt loop.
            print("录音时长只能为int数值")
            continue

        if record_duration < 0:
            break

        if not os.path.exists(record_path):
            os.makedirs(record_path)

        # Keep the per-round file path in its own variable: overwriting
        # ``record_path`` would make later rounds build paths on top of the
        # previous file name.
        if record_duration == 0:
            audio_path = input("请输入音频路径:")
        else:
            audio_path = os.path.join(
                record_path, time.strftime("%Y_%m_%d_%H_%M_%S_") + ".wav")
            record(audio_path, record_duration)

        # Load the audio, then cut and pad it to a fixed number of frames.
        audio_feature = wav_to_feature(audio_path, audio_feature_type)
        audio_feature = audio_feature[:max_length, :]
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            [audio_feature], padding='post', maxlen=max_length, dtype='float32')

        beam_search.reset(inputs=input_tensor, dec_input=dec_input)
        decoder_input = dec_input

        for _ in range(1, max_sentence_length):
            # Only the most recent token of each candidate is fed to the model.
            decoder_input = decoder_input[:, -1:]
            predictions, _state = model(input_tensor, enc_hidden, decoder_input)
            predictions = tf.nn.softmax(predictions)

            beam_search.expand(predictions=predictions, end_sign=end_id)
            # The container reports a beam size of 0 when nothing is left to expand.
            if beam_search.beam_size == 0:
                break

            input_tensor, decoder_input = beam_search.get_search_inputs()

        beam_search_result = beam_search.get_result(top_k=3)
        result = ''
        # Pull the sequences out of the container and build the final output;
        # each candidate is prepended to the string.
        for candidate in beam_search_result:
            text = tokenizer.sequences_to_texts(candidate.numpy())[0]
            text = text.replace(start_sign, '').replace(end_sign, '').replace(' ', '')
            result = '<' + text + '>' + result

        print("识别句子为:{}".format(result))
def preprocess_thchs30_speech_raw_data(data_path: str, dataset_infos_file: str, max_length: int,
                                       spectrum_data_dir: str, audio_feature_type: str = "mfcc",
                                       transcript_row: int = 0, start_sign: str = "<start>",
                                       end_sign: str = "<end>"):
    """Preprocess the thchs30 dataset into <feature file, sentence> pairs.

    For every ``.wav`` file in ``data_path`` the transcript line selected by
    ``transcript_row`` is read from the matching ``.wav.trn`` file, the audio
    features are padded to ``max_length`` and saved as ``.npy`` under
    ``spectrum_data_dir``, and one line ``<feature file>\\t<sentence>`` is
    written to ``dataset_infos_file``.

    The function calls ``exit(0)`` when ``data_path`` does not exist or when
    a ``.wav`` file has no transcript file.

    :param data_path: directory holding the ``.wav`` / ``.wav.trn`` files
    :param dataset_infos_file: output file listing the processed pairs
    :param max_length: length the audio features are padded to
    :param spectrum_data_dir: directory receiving the saved feature files
    :param audio_feature_type: feature type forwarded to ``wav_to_feature``
    :param transcript_row: transcript line to use (first line characters,
        second line pinyin, third line syllables)
    :param start_sign: sentence start marker
    :param end_sign: sentence end marker
    :return: None
    """
    if not os.path.exists(data_path):
        print("thchs30数据集不存在,请检查重试")
        exit(0)

    data_list = os.listdir(data_path)

    if not os.path.exists(spectrum_data_dir):
        os.makedirs(spectrum_data_dir)

    count = 0
    with open(dataset_infos_file, 'w', encoding='utf-8') as ds_infos_file:
        for data_name in data_list:
            if os.path.splitext(data_name)[1] != ".wav":
                continue

            # os.path.join keeps the paths correct whether or not the
            # directory arguments end with a separator.
            audio_path = os.path.join(data_path, data_name)
            text_file_name = os.path.join(data_path, data_name + ".trn")
            if not os.path.exists(text_file_name):
                print("{}文本数据不完整,请检查后重试".format(text_file_name))
                exit(0)

            with open(text_file_name, 'r', encoding='utf-8') as text_file:
                texts = text_file.readlines()
            text = texts[transcript_row].strip()
            text = start_sign + " " + text + " " + end_sign

            audio_feature_file = os.path.join(spectrum_data_dir, data_name + ".npy")

            # pad_sequences works on a batch of sequences, so add a leading
            # batch axis for padding and drop it again afterwards.
            audio_feature = wav_to_feature(audio_path, audio_feature_type)
            audio_feature = tf.expand_dims(audio_feature, axis=0)
            audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
                audio_feature, maxlen=max_length, dtype="float32", padding="post")
            audio_feature = tf.squeeze(audio_feature, axis=0)

            np.save(file=audio_feature_file, arr=audio_feature)
            ds_infos_file.write(audio_feature_file + '\t' + text + "\n")

            count += 1
            print('\r已处理音频句子对数:{}'.format(count), flush=True, end='')

    print("\n数据处理完毕,共计{}条语音数据".format(count))
def preprocess_librispeech_speech_raw_data(data_path: str, dataset_infos_file: str, max_length: int,
                                           spectrum_data_dir: str, audio_feature_type: str = "mfcc"):
    """Preprocess the librispeech dataset into <feature file, sentence> pairs.

    The dataset is walked as ``data_path/<first>/<second>/`` where each leaf
    directory holds ``<first>-<second>.trans.txt`` and the ``.flac`` files it
    lists. For every transcript line the audio features are padded to
    ``max_length`` and saved as ``.npy`` under ``spectrum_data_dir``, and one
    line ``<feature file>\\t<sentence>`` is written to ``dataset_infos_file``.

    The function calls ``exit(0)`` when ``data_path`` does not exist.

    :param data_path: root directory of the dataset
    :param dataset_infos_file: output file listing the processed pairs
    :param max_length: length the audio features are padded to
    :param spectrum_data_dir: directory receiving the saved feature files
    :param audio_feature_type: feature type forwarded to ``wav_to_feature``
    :return: None
    """
    if not os.path.exists(data_path):
        # The message used to name the thchs30 dataset (copy-paste slip).
        print("librispeech数据集不存在,请检查重试")
        exit(0)

    if not os.path.exists(spectrum_data_dir):
        os.makedirs(spectrum_data_dir)

    count = 0
    with open(dataset_infos_file, 'w', encoding='utf-8') as ds_infos_file:
        for first_folder in os.listdir(data_path):
            first_dir = os.path.join(data_path, first_folder)
            if not os.path.isdir(first_dir):
                # Skip stray files that sit next to the dataset folders.
                continue

            for second_folder in os.listdir(first_dir):
                second_dir = os.path.join(first_dir, second_folder)
                if not os.path.isdir(second_dir):
                    continue

                trans_path = os.path.join(
                    second_dir, first_folder + "-" + second_folder + ".trans.txt")
                with open(trans_path, "r", encoding="utf-8") as trans_file:
                    for line in trans_file:
                        line = line.strip("\n").strip()
                        if line == "":
                            continue

                        # "<utterance id> <transcript>"
                        parts = line.split(" ", 1)
                        audio_path = os.path.join(second_dir, parts[0] + ".flac")
                        audio_feature_file = os.path.join(spectrum_data_dir, parts[0] + ".npy")

                        # pad_sequences pads a *batch* of sequences. Wrapping
                        # the feature matrix in a list pads the time axis;
                        # passing the bare matrix would treat every frame as
                        # its own sequence and pad the feature axis instead.
                        audio_feature = wav_to_feature(audio_path, audio_feature_type)
                        audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
                            [audio_feature], maxlen=max_length, dtype="float32", padding="post")
                        audio_feature = tf.squeeze(audio_feature, axis=0)

                        np.save(file=audio_feature_file, arr=audio_feature)
                        ds_infos_file.write(audio_feature_file + "\t" + parts[1] + "\n")

                        count += 1
                        print('\r已处理音频句子对数:{}'.format(count), flush=True, end='')

    print("\n数据处理完毕,共计{}条语音数据".format(count))