Python wav_to_feature 예제들, hlp.stt.utils.audio_process.wav_to_feature Python 예제들

예제 #1

0

파일 보기

파일: module.py 프로젝트: DengBoCong/hlp

def recognize(model: tf.keras.Model, audio_feature_type: str, start_sign: str,
              unk_sign: str, end_sign: str, record_path: str, max_length: int,
              dict_path: str):
    """
    Interactive speech recognition loop that decodes with greedy CTC.

    The user is asked repeatedly for a recording duration: a positive value
    records a new clip into ``record_path``, ``0`` asks for the path of an
    existing audio file, and a negative value ends the loop. The recognised
    sentence of every clip is printed.

    :param model: model called on the padded feature batch
    :param audio_feature_type: feature type handed to ``wav_to_feature``
    :param start_sign: start token, removed from the printed sentence
    :param unk_sign: unknown-word token (not used by this function)
    :param end_sign: end token, removed from the printed sentence
    :param record_path: directory in which new recordings are saved
    :param max_length: number of frames the features are cut and padded to
    :param dict_path: path of the saved tokenizer dictionary
    :return: None
    """
    # The dictionary does not change between utterances, so load it once
    # instead of on every pass through the loop.
    tokenizer = load_tokenizer(dict_path=dict_path)

    while True:
        try:
            record_duration = int(input("请设定录音时长(秒, 负数结束，0则继续输入音频路径):"))
        except ValueError:
            # Only a non-integer answer is a user error; KeyboardInterrupt and
            # EOFError must propagate so the loop can actually be stopped.
            print("录音时长只能为int数值")
            continue

        if record_duration < 0:
            break

        # Keep the audio file path in its own variable: ``record_path`` has to
        # stay the recording directory for the following iterations.
        if record_duration == 0:
            audio_path = input("请输入音频路径：")
            if not os.path.exists(audio_path):
                print("音频文件不存在，请重新输入")
                continue
        else:
            os.makedirs(record_path, exist_ok=True)
            audio_path = os.path.join(
                record_path,
                time.strftime("%Y_%m_%d_%H_%M_%S_",
                              time.localtime(time.time())) + ".wav")
            record(audio_path, record_duration)

        # Load the audio, keep at most the first max_length frames and pad
        # the remainder at the end.
        audio_feature = wav_to_feature(audio_path, audio_feature_type)
        audio_feature = audio_feature[:max_length, :]
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            [audio_feature],
            padding='post',
            maxlen=max_length,
            dtype='float32')
        predictions = model(input_tensor)
        ctc_input_length = compute_ctc_input_length(
            input_tensor.shape[1], predictions.shape[1],
            tf.convert_to_tensor([[len(audio_feature)]]))

        # ctc_decode expects a 1-D length tensor, hence the reshape.
        output = tf.keras.backend.ctc_decode(
            y_pred=predictions,
            input_length=tf.reshape(ctc_input_length,
                                    [ctc_input_length.shape[0]]),
            greedy=True)

        sentence = tokenizer.sequences_to_texts(output[0][0].numpy())
        sentence = sentence[0].replace(start_sign, '').replace(
            end_sign, '').replace(' ', '')
        print("Output:", sentence)

예제 #2

0

파일 보기

def recognize(encoder: tf.keras.Model, decoder: tf.keras.Model, beam_size: int,
              audio_feature_type: str, max_length: int,
              max_sentence_length: int, dict_path: str):
    """
    Interactive speech recognition loop using an encoder/decoder pair and
    beam search.

    Audio file paths are read from standard input until ``ESC`` is entered,
    which terminates the process through ``exit(0)``. For every existing
    file the top three beam search results are printed.

    :param encoder: encoder part of the model
    :param decoder: decoder part of the model
    :param beam_size: beam size used by the beam search container
    :param audio_feature_type: feature type handed to ``wav_to_feature``
    :param max_length: number of frames the audio features are padded to
    :param max_sentence_length: maximum number of decoding steps per sentence
    :param dict_path: path of the tokenizer dictionary saved as JSON
    """
    beam_search_container = BeamSearch(beam_size=beam_size,
                                       max_length=max_sentence_length,
                                       worst_score=0)

    # The dictionary is identical for every utterance: read and parse it once
    # rather than on each pass through the loop.
    with open(dict_path, 'r', encoding='utf-8') as dict_file:
        json_string = dict_file.read().strip().strip("\n")
    tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json_string)

    word_index = tokenizer.word_index
    # Fall back to the *id* of "<unk>"; using the literal string as the
    # default would put a string into the integer decoder input.
    start_id = word_index.get("<start>", word_index.get("<unk>"))
    end_id = word_index.get("<end>")
    start_dec_input = tf.expand_dims([start_id], 0)

    print("Agent: 你好！结束识别请输入ESC。")
    while True:
        path = input("Path: ")
        if path == "ESC":
            print("Agent: 再见！")
            exit(0)

        if not os.path.exists(path):
            print("音频文件不存在，请重新输入")
            continue

        # Add a batch axis, then pad the time axis at the end.
        audio_feature = wav_to_feature(path, audio_feature_type)
        audio_feature = tf.expand_dims(audio_feature, axis=0)
        audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
            audio_feature, maxlen=max_length, dtype="float32", padding="post")

        # Every utterance starts decoding from the start token again.
        dec_input = start_dec_input
        beam_search_container.reset(inputs=audio_feature, dec_input=dec_input)
        for _ in range(max_sentence_length):
            enc_outputs, padding_mask = encoder(audio_feature)
            sentence_predictions = decoder(
                inputs=[dec_input, enc_outputs, padding_mask])
            sentence_predictions = tf.nn.softmax(sentence_predictions)
            # Only the distribution of the last position is expanded.
            sentence_predictions = sentence_predictions[:, -1, :]

            beam_search_container.expand(predictions=sentence_predictions,
                                         end_sign=end_id)
            if beam_search_container.beam_size == 0:
                break

            audio_feature, dec_input = \
                beam_search_container.get_search_inputs()

        # Turn each returned sequence into text; later results are prepended.
        result = ''
        for sequence in beam_search_container.get_result(top_k=3):
            text = tokenizer.sequences_to_texts(sequence.numpy())[0]
            text = text.replace("<start>", '').replace("<end>",
                                                       '').replace(' ', '')
            result = '<' + text + '>' + result

        print("识别句子为：{}".format(result))

예제 #3

0

파일 보기

def preprocess_thchs30_speech_raw_data(data_path: str,
                                       dataset_infos_file: str,
                                       max_time_step: int,
                                       spectrum_data_dir: str,
                                       max_sentence_length: int,
                                       vocab_size: int,
                                       audio_feature_type: str = "mfcc",
                                       save_length_path: str = "",
                                       is_train: bool = True,
                                       transcript_row: int = 0,
                                       start_sign: str = "<start>",
                                       dict_path: str = "",
                                       end_sign: str = "<end>",
                                       unk_sign: str = "<unk>",
                                       max_treat_data_size: int = 0):
    """
    Preprocess the thchs30 dataset into <audio feature file, text token file>
    pairs so that later stages can read the data in batches.

    For every ``.wav`` file in ``data_path`` the padded audio features are
    saved as ``.npy`` in ``spectrum_data_dir`` and one line is written to
    ``dataset_infos_file``. The collected sentences and lengths are then
    handed to ``_treat_sentence_and_length``.

    :param data_path: directory that holds the ``.wav`` / ``.wav.trn`` files
    :param dataset_infos_file: output file listing the processed pairs
    :param max_time_step: number of frames the audio features are padded to
    :param spectrum_data_dir: directory for the saved audio feature files
    :param max_sentence_length: maximum length of a text sequence
    :param vocab_size: vocabulary size
    :param audio_feature_type: feature type handed to ``wav_to_feature``
    :param save_length_path: path of the file that stores the sample lengths
    :param is_train: whether training data is being processed
    :param transcript_row: index of the transcript line to use (per the
        dataset layout: 0 characters, 1 pinyin, 2 syllables)
    :param start_sign: sentence start token
    :param dict_path: dictionary path, not needed when phonemes are used
    :param end_sign: sentence end token
    :param unk_sign: unknown-word token
    :param max_treat_data_size: maximum number of samples to process,
        0 means all of them
    :return: None
    """
    _check_and_create_file(data_path, spectrum_data_dir)

    count = 0
    len_list = []
    text_list = []
    text_file_path_list = []
    data_list = os.listdir(data_path)
    # Directory with the real transcripts: "<everything up to the first
    # '30' in data_path>30/data". Built with os.path.join so it resolves on
    # every platform, not only with Windows separators.
    transcript_dir = os.path.join(data_path[:data_path.find("30")] + "30",
                                  "data")
    with open(dataset_infos_file, 'w', encoding='utf-8') as ds_infos_file:
        for data_name in data_list:
            if os.path.splitext(data_name)[1] != ".wav":
                continue

            audio_path = os.path.join(data_path, data_name)
            # The matching ".trn" file only points at the real transcript.
            text_file_name = audio_path + ".trn"
            if not os.path.exists(text_file_name):
                print("{}文本数据不完整，请检查后重试".format(text_file_name))
                exit(0)
            with open(text_file_name, 'r', encoding='utf-8') as text_file:
                pointer_lines = text_file.readlines()
            # The first 8 characters of the pointer are dropped (presumably
            # a "../data/" prefix — confirm against the dataset).
            transcript_name = pointer_lines[0].strip().strip("\n")[8:]

            with open(os.path.join(transcript_dir, transcript_name),
                      'r',
                      encoding='utf-8') as text_file:
                transcript_lines = text_file.readlines()
            text = transcript_lines[transcript_row].strip()
            text = start_sign + " " + text + " " + end_sign

            audio_feature_file = os.path.join(spectrum_data_dir,
                                              data_name + ".npy")
            text_token_file = os.path.join(spectrum_data_dir,
                                           data_name + "text.npy")
            audio_feature = wav_to_feature(audio_path, audio_feature_type)

            # Length pair: (number of text tokens, number of audio frames).
            # The saved features have exactly max_time_step rows, so the
            # frame count is capped by that value, not by the vocabulary
            # size.
            len_pair = [
                len(text.split(" ")),
                min(audio_feature.shape[0], max_time_step),
            ]
            text_list.append(text)
            text_file_path_list.append(text_token_file)

            audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
                [audio_feature],
                maxlen=max_time_step,
                dtype="float32",
                padding="post")
            audio_feature = tf.squeeze(audio_feature, axis=0)

            np.save(file=audio_feature_file, arr=audio_feature)
            ds_infos_file.write(audio_feature_file + '\t' + text_token_file +
                                "\n")

            len_list.append(len_pair)
            count += 1
            print('\r已处理并写入音频条数：{}'.format(count), flush=True, end='')
            if max_treat_data_size == count:
                break

    _treat_sentence_and_length(text_list, text_file_path_list, len_list,
                               max_sentence_length, vocab_size,
                               save_length_path, is_train, dict_path, unk_sign)

    print("\n数据处理完毕，共计{}对语音句子数据".format(count))

예제 #4

0

파일 보기

def preprocess_librispeech_speech_raw_data(data_path: str,
                                           dataset_infos_file: str,
                                           max_time_step: int,
                                           spectrum_data_dir: str,
                                           max_sentence_length: int,
                                           vocab_size: int,
                                           save_length_path: str = "",
                                           start_sign: str = "<start>",
                                           end_sign: str = "<end>",
                                           unk_sign: str = "<unk>",
                                           dict_path: str = "",
                                           is_train: bool = True,
                                           audio_feature_type: str = "mfcc",
                                           max_treat_data_size: int = 0):
    """
    Preprocess the librispeech dataset into <audio feature file, text token
    file> pairs so that later stages can read the data in batches.

    For every utterance listed in the ``*.trans.txt`` files below
    ``data_path`` the padded audio features are saved as ``.npy`` in
    ``spectrum_data_dir`` and one line is written to ``dataset_infos_file``.
    The collected sentences and lengths are then handed to
    ``_treat_sentence_and_length``.

    :param data_path: dataset directory (two folder levels above the audio)
    :param dataset_infos_file: output file listing the processed pairs
    :param max_time_step: number of frames the audio features are padded to
    :param spectrum_data_dir: directory for the saved audio feature files
    :param max_sentence_length: maximum length of a text sequence
    :param vocab_size: vocabulary size
    :param save_length_path: path of the file that stores the sample lengths
    :param start_sign: sentence start token
    :param end_sign: sentence end token
    :param unk_sign: unknown-word token
    :param dict_path: dictionary path, not needed when phonemes are used
    :param is_train: whether training data is being processed
    :param audio_feature_type: feature type handed to ``wav_to_feature``
    :param max_treat_data_size: maximum number of samples to process,
        0 means all of them
    :return: None
    """
    _check_and_create_file(data_path, spectrum_data_dir)

    count = 0
    len_list = []
    text_list = []
    text_file_path_list = []
    with open(dataset_infos_file, 'w', encoding='utf-8') as ds_infos_file:
        # A single flat loop over all utterances, so that the ``break`` on
        # max_treat_data_size really ends the whole traversal instead of
        # only the transcript file currently being read.
        for utterance_id, transcript, audio_path in \
                _iter_librispeech_utterances(data_path):
            audio_feature_file = os.path.join(spectrum_data_dir,
                                              utterance_id + ".npy")
            text_token_file = os.path.join(spectrum_data_dir,
                                           utterance_id + "text.npy")
            text_file_path_list.append(text_token_file)

            text = start_sign + " " + transcript.lower() + " " + end_sign
            text_list.append(text)

            audio_feature = wav_to_feature(audio_path, audio_feature_type)
            # Length pair: (number of text tokens, number of audio frames).
            # The saved features have exactly max_time_step rows, so the
            # frame count is capped by that value, not by the vocabulary
            # size.
            len_pair = [
                len(text.split(" ")),
                min(audio_feature.shape[0], max_time_step),
            ]

            audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
                [audio_feature],
                maxlen=max_time_step,
                dtype="float32",
                padding="post")
            audio_feature = tf.squeeze(audio_feature, axis=0)

            np.save(file=audio_feature_file, arr=audio_feature)
            ds_infos_file.write(audio_feature_file + "\t" + text_token_file +
                                "\n")

            count += 1
            len_list.append(len_pair)
            print('\r已处理并写入音频条数：{}'.format(count), flush=True, end='')
            if max_treat_data_size == count:
                break

    _treat_sentence_and_length(text_list, text_file_path_list, len_list,
                               max_sentence_length, vocab_size,
                               save_length_path, is_train, dict_path, unk_sign)

    print("\n数据处理完毕，共计{}对语音句子数据".format(count))


def _iter_librispeech_utterances(data_path: str):
    """Yield (utterance id, transcript, audio path) for each transcript line.

    Walks the two folder levels below ``data_path`` and reads the
    ``<first>-<second>.trans.txt`` file of every second-level folder.
    """
    for first_folder in os.listdir(data_path):
        for second_folder in os.listdir(os.path.join(data_path,
                                                     first_folder)):
            second_dir = os.path.join(data_path, first_folder, second_folder)
            trans_path = os.path.join(
                second_dir, first_folder + "-" + second_folder + ".trans.txt")
            with open(trans_path, "r", encoding="utf-8") as trans_file:
                for line in trans_file:
                    line = line.strip("\n").strip()
                    if line == "":
                        continue
                    # "<utterance id> <transcript>"
                    parts = line.split(" ", 1)
                    yield (parts[0], parts[1],
                           os.path.join(second_dir, parts[0] + ".flac"))

예제 #5

0

파일 보기

파일: module.py 프로젝트: DengBoCong/hlp

def recognize(model: tf.keras.Model, audio_feature_type: str, start_sign: str,
              unk_sign: str, end_sign: str, w: int, beam_size: int,
              record_path: str, max_length: int, max_sentence_length: int,
              dict_path: str):
    """
    Interactive speech recognition loop that decodes with beam search.

    The user is asked repeatedly for a recording duration: a positive value
    records a new clip into ``record_path``, ``0`` asks for the path of an
    existing audio file, and a negative value ends the loop. The top three
    beam search results of every clip are printed.

    :param model: model called with (features, encoder hidden state,
        decoder input)
    :param audio_feature_type: feature type handed to ``wav_to_feature``
    :param start_sign: start token
    :param unk_sign: unknown-word token (not used by this function)
    :param end_sign: end token
    :param w: size of the zero-initialised encoder hidden state (number of
        BiLSTM units)
    :param beam_size: beam size
    :param record_path: directory in which new recordings are saved
    :param max_length: number of frames the features are cut and padded to
    :param max_sentence_length: maximum sentence length
    :param dict_path: path of the saved tokenizer dictionary
    :return: None
    """
    tokenizer = load_tokenizer(dict_path=dict_path)
    enc_hidden = tf.zeros((1, w))
    # Look the start token up through start_sign so that it follows the
    # configured token, exactly like the end_sign lookup below.
    dec_input = tf.expand_dims([tokenizer.word_index.get(start_sign)], 1)
    end_id = tokenizer.word_index.get(end_sign)
    beam_search = BeamSearch(beam_size=beam_size,
                             max_length=max_sentence_length,
                             worst_score=0)

    while True:
        try:
            record_duration = int(input("请设定录音时长(秒, 负数结束，0则继续输入音频路径):"))
        except ValueError:
            # Only a non-integer answer is a user error; KeyboardInterrupt and
            # EOFError must propagate so the loop can actually be stopped.
            print("录音时长只能为int数值")
            continue

        if record_duration < 0:
            break

        # Keep the audio file path in its own variable: ``record_path`` has to
        # stay the recording directory for the following iterations.
        if record_duration == 0:
            audio_path = input("请输入音频路径：")
            if not os.path.exists(audio_path):
                print("音频文件不存在，请重新输入")
                continue
        else:
            os.makedirs(record_path, exist_ok=True)
            audio_path = os.path.join(
                record_path,
                time.strftime("%Y_%m_%d_%H_%M_%S_",
                              time.localtime(time.time())) + ".wav")
            record(audio_path, record_duration)

        # Load the audio, keep at most the first max_length frames and pad
        # the remainder at the end.
        audio_feature = wav_to_feature(audio_path, audio_feature_type)
        audio_feature = audio_feature[:max_length, :]
        input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
            [audio_feature],
            padding='post',
            maxlen=max_length,
            dtype='float32')

        beam_search.reset(inputs=input_tensor, dec_input=dec_input)
        decoder_input = dec_input
        for _ in range(1, max_sentence_length):
            # The model is fed only the most recent token of each beam.
            decoder_input = decoder_input[:, -1:]
            predictions, _ = model(input_tensor, enc_hidden, decoder_input)
            predictions = tf.nn.softmax(predictions)

            beam_search.expand(predictions=predictions, end_sign=end_id)
            if beam_search.beam_size == 0:
                break

            input_tensor, decoder_input = beam_search.get_search_inputs()

        # Turn each returned sequence into text; later results are prepended.
        result = ''
        for sequence in beam_search.get_result(top_k=3):
            text = tokenizer.sequences_to_texts(sequence.numpy())[0]
            text = text.replace(start_sign, '').replace(end_sign,
                                                        '').replace(' ', '')
            result = '<' + text + '>' + result

        print("识别句子为：{}".format(result))

예제 #6

0

파일 보기

파일: pre_treat.py 프로젝트: mxx657845129/hlp

def preprocess_thchs30_speech_raw_data(data_path: str,
                                       dataset_infos_file: str,
                                       max_length: int,
                                       spectrum_data_dir: str,
                                       audio_feature_type: str = "mfcc",
                                       transcript_row: int = 0,
                                       start_sign: str = "<start>",
                                       end_sign: str = "<end>"):
    """
    Preprocess the thchs30 dataset into <audio feature file, sentence> pairs
    so that later stages can read the data in batches.

    Every ``.wav`` file in ``data_path`` is converted to padded features
    that are saved as ``.npy`` in ``spectrum_data_dir``; the feature file
    path and the wrapped transcript are written, tab separated, to
    ``dataset_infos_file``.

    :param data_path: directory that holds the ``.wav`` / ``.wav.trn`` files
    :param dataset_infos_file: output file listing the processed pairs
    :param max_length: number of frames the audio features are padded to
    :param spectrum_data_dir: directory for the saved audio feature files
    :param audio_feature_type: feature type handed to ``wav_to_feature``
    :param transcript_row: index of the transcript line to use (per the
        dataset layout: 0 characters, 1 pinyin, 2 syllables)
    :param start_sign: sentence start token
    :param end_sign: sentence end token
    :return: None
    """
    if not os.path.exists(data_path):
        print("thchs30数据集不存在，请检查重试")
        exit(0)
    file_names = os.listdir(data_path)

    if not os.path.exists(spectrum_data_dir):
        os.makedirs(spectrum_data_dir)

    processed = 0
    with open(dataset_infos_file, 'w', encoding='utf-8') as infos_out:
        for file_name in file_names:
            extension = os.path.splitext(file_name)[1]
            if extension != ".wav":
                continue

            # Audio file and the transcript file that sits next to it.
            wav_path = data_path + file_name
            trn_path = wav_path + ".trn"
            if not os.path.exists(trn_path):
                print("{}文本数据不完整，请检查后重试".format(trn_path))
                exit(0)

            with open(trn_path, 'r', encoding='utf-8') as trn_file:
                transcript_lines = trn_file.readlines()
            sentence = " ".join(
                [start_sign, transcript_lines[transcript_row].strip(),
                 end_sign])

            # Add a batch axis for padding, then drop it again before saving.
            feature_path = spectrum_data_dir + file_name + ".npy"
            batched = tf.expand_dims(
                wav_to_feature(wav_path, audio_feature_type), axis=0)
            padded = tf.keras.preprocessing.sequence.pad_sequences(
                batched, maxlen=max_length, dtype="float32", padding="post")
            feature = tf.squeeze(padded, axis=0)

            np.save(file=feature_path, arr=feature)
            infos_out.write(feature_path + '\t' + sentence + "\n")

            processed += 1
            print('\r已处理音频句子对数：{}'.format(processed), flush=True, end='')

    print("\n数据处理完毕，共计{}条语音数据".format(processed))

예제 #7

0

파일 보기

파일: pre_treat.py 프로젝트: mxx657845129/hlp

def preprocess_librispeech_speech_raw_data(data_path: str,
                                           dataset_infos_file: str,
                                           max_length: int,
                                           spectrum_data_dir: str,
                                           audio_feature_type: str = "mfcc"):
    """
    Preprocess the librispeech dataset into <audio feature file, sentence>
    pairs so that later stages can read the data in batches.

    For every utterance listed in the ``*.trans.txt`` files two folder
    levels below ``data_path`` the padded audio features are saved as
    ``.npy`` in ``spectrum_data_dir``; the feature file path and the
    transcript are written, tab separated, to ``dataset_infos_file``.

    :param data_path: dataset directory (two folder levels above the audio)
    :param dataset_infos_file: output file listing the processed pairs
    :param max_length: number of frames the audio features are padded to
    :param spectrum_data_dir: directory for the saved audio feature files
    :param audio_feature_type: feature type handed to ``wav_to_feature``
    :return: None
    """
    if not os.path.exists(data_path):
        print("librispeech数据集不存在，请检查重试")
        exit(0)

    if not os.path.exists(spectrum_data_dir):
        os.makedirs(spectrum_data_dir)

    count = 0
    with open(dataset_infos_file, 'w', encoding='utf-8') as ds_infos_file:
        for first_folder in os.listdir(data_path):
            for second_folder in os.listdir(
                    os.path.join(data_path, first_folder)):
                second_dir = os.path.join(data_path, first_folder,
                                          second_folder)
                trans_path = os.path.join(
                    second_dir,
                    first_folder + "-" + second_folder + ".trans.txt")

                with open(trans_path, "r", encoding="utf-8") as trans_file:
                    for line in trans_file:
                        line = line.strip("\n").strip()
                        if line == "":
                            continue
                        # "<utterance id> <transcript>"
                        line = line.split(" ", 1)

                        audio_path = os.path.join(second_dir,
                                                  line[0] + ".flac")
                        audio_feature_file = spectrum_data_dir + line[
                            0] + ".npy"

                        audio_feature = wav_to_feature(audio_path,
                                                       audio_feature_type)
                        # pad_sequences pads each element of a *batch*, so
                        # the single utterance is wrapped in a list and
                        # unwrapped afterwards. Passing the 2-D feature
                        # directly would pad every frame instead of the
                        # time axis.
                        audio_feature = tf.keras.preprocessing.sequence.pad_sequences(
                            [audio_feature],
                            maxlen=max_length,
                            dtype="float32",
                            padding="post")[0]

                        np.save(file=audio_feature_file, arr=audio_feature)
                        ds_infos_file.write(audio_feature_file + "\t" +
                                            line[1] + "\n")

                        count += 1
                        print('\r已处理音频句子对数：{}'.format(count),
                              flush=True,
                              end='')

    print("\n数据处理完毕，共计{}条语音数据".format(count))