def joint_feature():
    joint_root = 'myjoint/'  # TODO: Change to joint root
    feature_save_root = '/media/dapeng/Downloads/DataSet/Audio/Chinese/joint_feature'
    subdirs = ['train', 'dev', 'test']
    left_context_width = 3
    right_context_width = 0
    subsample = 3
    feature_dim = 128
    for subdir in subdirs:
        part = 0
        num = 0
        limit = 1000
        feats_file = os.path.join(joint_root, subdir, 'feats.scp')
        feats_feature = os.path.join(joint_root, subdir, 'feats_feature.scp')
        save_path = ''
        with open(feats_file, 'r', encoding='utf-8') as rf:
            lines = rf.readlines()
        with open(feats_feature, 'w', encoding='utf-8') as wf:
            for line in tqdm(lines):
                if num % limit == 0:
                    save_path = os.path.join(feature_save_root, subdir, str(part))
                    os.makedirs(save_path, exist_ok=True)  # tolerate re-runs
                    part += 1
                parts = line.strip().split(' ')
                name = parts[0]
                path = parts[1]
                wave_data, frame_rate = read_wave_from_file(path)
                features = get_feature(wave_data, frame_rate, feature_dim)
                features = concat_frame(features, left_context_width, right_context_width)
                features = subsampling(features, subsample)
                save_file = os.path.join(save_path, name + '.npy')
                np.save(save_file, features)
                wf.write(name + ' ' + save_file + '\n')
                num += 1
        delete_final_line(feats_feature)
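
The pipeline above depends on concat_frame and subsampling, which are defined elsewhere in the project. Below is a minimal sketch of what they plausibly do, inferred only from how they are called here (context stacking along the feature axis, then keeping every n-th frame); the _sketch names are hypothetical and the project's real implementations may differ.

import numpy as np

def concat_frame_sketch(features, left_context_width, right_context_width):
    # Stack each frame with its left/right neighbours along the feature axis,
    # clamping at the sequence edges.
    time_steps, feature_dim = features.shape
    width = left_context_width + 1 + right_context_width
    result = np.zeros((time_steps, width * feature_dim), dtype=features.dtype)
    for t in range(time_steps):
        for offset in range(-left_context_width, right_context_width + 1):
            src = min(max(t + offset, 0), time_steps - 1)
            col = offset + left_context_width
            result[t, col * feature_dim:(col + 1) * feature_dim] = features[src]
    return result

def subsampling_sketch(features, rate):
    # Keep every `rate`-th frame to shorten the encoder input.
    return features[::rate, :]

With left_context_width=3, right_context_width=0 and subsample=3 as above, every kept frame still carries its three dropped predecessors inside its stacked feature vector.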
Example #2
def pred():
    model, vocab = init_model()

    audio, fr = read_wave_from_file(WAVE_OUTPUT_FILENAME)
    feature = get_feature(audio, fr)
    feature = concat_frame(feature, 3, 0)
    feature = subsampling(feature, 3)
    feature = torch.from_numpy(feature)
    feature = torch.unsqueeze(feature, 0)
    feature = feature.cuda()
    feature_len = feature.shape[1]  # renamed: don't shadow the built-in len()
    feature_len = torch.tensor([feature_len])
    feature_len = feature_len.cuda()
    audio_mask = context_mask(feature)[:, :, None]  # streaming recognition mask
    # preds = model.recognize(feature, feature_len, audio_mask)
    preds = model.recognize_beam_search(feature, feature_len, audio_mask)
    preds = dict_map(preds, vocab)
    groundtruth = ["疑点之一美方对境内疫情发展时时间线一直讳莫如深唯恐避之不及这不由令人质疑其疫情爆发的时间起点疑点之二"]
    # groundtruth = ["那叫名人呢干嘛要划类啊一分类就有就有帮派了嘛人不要那么化类就是会有对立面不好所以我说通常有命题的话题都不要提"]
    res = ''.join(preds[0])
    dist, num = computer_cer([res], groundtruth)
    print(dist / num, res)
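
context_mask is not shown in these examples. Below is a minimal sketch of a limited-context (band) attention mask consistent with how it is called above; the defaults are hypothetical and the project's actual signature and return shape may differ.

import torch

def context_mask_sketch(feature, left_context=None, right_context=0):
    # Band mask over time: step t may attend to [t - left_context, t + right_context].
    # Returns a bool mask of shape (batch, time, time); left_context=None
    # means unlimited history.
    batch, time = feature.shape[0], feature.shape[1]
    idx = torch.arange(time)
    rel = idx[None, :] - idx[:, None]  # rel[i, j] = j - i
    mask = rel <= right_context
    if left_context is not None:
        mask &= rel >= -left_context
    return mask.unsqueeze(0).expand(batch, -1, -1)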
    def __getitem__(self, index):

        audio_path = self.df.iloc[index, 0]
        label = self.df.iloc[index, 1]

        targets = np.array(self.encode(label))
        wave_data, frame_rate = read_wave_from_file(audio_path)
        # Data augmentation
        # wave_data = audio_augment(wave_data)
        # Feature extraction
        # features = get_feature(wave_data, frame_rate, self.feature_dim)
        features = get_feature2(wave_data, frame_rate, self.feature_dim)
        # features = np.load(feats_scp)
        features = concat_frame(features, self.left_context_width,
                                self.right_context_width)
        features = subsampling(features, self.subsample)

        inputs_length = np.array(features.shape[0]).astype(np.int64)
        targets_length = np.array(targets.shape[0]).astype(np.int64)

        features = self.pad(features).astype(np.float32)
        targets = self.pad(targets).astype(np.int64).reshape(-1)

        return features, inputs_length, targets, targets_length
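
self.pad is not shown either. A minimal sketch, assuming it right-pads axis 0 with zeros to a fixed maximum length (max_length here is a hypothetical parameter; the real method likely reads a configured attribute):

import numpy as np

def pad_sketch(inputs, max_length):
    # Right-pad axis 0 with zeros so every sample in a batch has equal length.
    pad_len = max(max_length - inputs.shape[0], 0)
    if inputs.ndim == 1:  # label sequence
        return np.pad(inputs, (0, pad_len), mode='constant')
    return np.pad(inputs, ((0, pad_len), (0, 0)), mode='constant')  # features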
Example #4
    def start_rec(self):
        self.stream = self.pa.open(format=self.pa.get_format_from_width(
            self.sample_width),
                                   channels=self.channels,
                                   rate=self.rate,
                                   frames_per_buffer=self.chunk,
                                   input=True,
                                   output=False,
                                   stream_callback=self.__callback)

        self.stream.start_stream()
        print("Start recording and recognizing")

        zero_token = torch.tensor([[0]], dtype=torch.long)
        zero_token = zero_token.cuda()
        dec_state = self.model.decoder(zero_token)
        last_clip = False
        blank_frame = 0
        # TODO: detailed streaming recognition loop
        while True:  # outer (audio) window
            # print('win_audio_position, ', self.win_audio_position)
            # print('max_frame_num, ', self.max_frame_num)
            # Enough audio to slide the window, OR recording has finished and
            # the remainder is too short to slide it
            if self.win_audio_position + self.win_audio <= self.frame_num \
                    or self.win_audio_position + self.win_audio >= self.max_frame_num != 0:

                # Feature-extraction smoothing
                if self.win_audio_position + self.win_audio >= self.max_frame_num != 0:
                    print('Last audio clip:', self.win_audio_position, " : ",
                          self.frame_num)
                    last_clip = True
                    win_audio = self.audio_data[self.win_audio_position:self.
                                                frame_num]
                else:
                    win_audio = self.audio_data[self.win_audio_position:self.
                                                win_audio_position +
                                                self.win_audio]
                win_audio_feature = get_feature(win_audio, self.rate, 128)
                # Drop the last 3 frames (leaving 97): their audio data is
                # incomplete. They are recovered because the window step is
                # smaller than the window (smooth audio transition).
                win_audio_feature = win_audio_feature[:-3, :]
                win_audio_feature_len = win_audio_feature.shape[0]
                self.feature_log_mel = np.concatenate(
                    (self.feature_log_mel, win_audio_feature), axis=0)
                # print('1', self.feature_log_mel.shape)
                # print('2', win_audio_feature.shape)

                # Frame-stacking smoothing
                # Take 3 extra frames from the past so stacking has the
                # history it needs (smooth stacking transition)
                win_audio_feature = self.feature_log_mel[
                    -3 - win_audio_feature_len:, :]
                # print('3', win_audio_feature.shape)
                win_audio_feature = concat_frame(win_audio_feature, 3, 0)
                # Drop the 3 extra history frames again: the stacked frames
                # centred on them are not needed (smooth stacking transition)
                win_audio_feature = win_audio_feature[3:, :]
                len_feature_concat_before = self.feature_concat.shape[0]
                # Append all stacked frames of the current window to self.feature_concat
                self.feature_concat = np.concatenate(
                    (self.feature_concat, win_audio_feature), axis=0)
                # Subsampling smoothing: pick the offset that keeps the
                # stride-3 pattern aligned across window boundaries
                if len_feature_concat_before % 3 == 0:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before:, :], 3)
                elif len_feature_concat_before % 3 == 1:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before + 2:, :],
                        3)
                else:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before + 1:, :],
                        3)
                self.feature_subsample = np.concatenate(
                    (self.feature_subsample, win_audio_feature), axis=0)

                # Feature sliding window
                len_feature_subsample = self.feature_subsample.shape[0]
                # print('4', len_feature_subsample)
                # Enough future context to process, or this is the final clip
                if len_feature_subsample - self.win_feature_position > self.right_context_len or last_clip:
                    # print('len_feature_subsample', len_feature_subsample)
                    # print('win_feature_position', self.win_feature_position)
                    left_frame = self.left_context_len  # normal left window
                    right_frame = self.right_context_len  # normal right window
                    start = self.win_feature_position - left_frame  # extend back for history frames
                    if start < 0:
                        # print('insufficient history frames')
                        left_frame = self.win_feature_position
                        start = 0
                    end = len_feature_subsample  # current maximum frame count
                    if last_clip:  # recording finished, final segment
                        right_frame = 0
                    win_audio_feature = self.feature_subsample[start:end, :]
                    # Add a batch dimension
                    win_audio_feature = np.expand_dims(win_audio_feature,
                                                       axis=0)
                    win_audio_feature = torch.from_numpy(
                        win_audio_feature).cuda()
                    # Streaming attention mask
                    audio_mask = context_mask(
                        win_audio_feature,
                        left_context=self.left_context,
                        right_context=self.right_context)[:, :, None].cuda()
                    win_enc_states = self.model.encoder(
                        win_audio_feature, audio_mask)

                    effect_start = left_frame
                    # -right_frame would give an empty slice when right_frame
                    # is 0 (last clip), so compute the end index explicitly
                    effect_end = win_enc_states.shape[1] - right_frame
                    effect_win_enc_states = win_enc_states[:, effect_start:effect_end, :]
                    effect_len = effect_win_enc_states.shape[1]

                    enc_states_len = effect_win_enc_states.shape[1]
                    for t in range(enc_states_len):
                        logits = self.model.joint(
                            effect_win_enc_states[:, t, :].view(-1),
                            dec_state.view(-1))
                        out = torch.nn.functional.softmax(logits,
                                                          dim=0).detach()
                        pred = torch.argmax(out, dim=0)
                        pred = int(pred.item())
                        if pred != 0:
                            if blank_frame >= 15:  # sentence break after a long run of blanks
                                self.text.insert('end', '\n')
                                self.text.update()
                                # self.result.clear()
                            self.result.append(pred)
                            word = self.dictionary[pred]
                            self.text.insert('end', word)
                            self.text.update()
                            if len(self.result) > 40:  # cap decoder history at 40 tokens
                                effect_token = self.result[-40:]
                            else:
                                effect_token = self.result
                            token = torch.tensor([effect_token],
                                                 dtype=torch.long)
                            token = token.cuda()
                            dec_state = self.model.decoder(
                                token)[:, -1, :]  # feed the full history, keep only the last output
                            print(blank_frame, word)
                            blank_frame = 0
                        elif pred == 0 and len(self.result) > 0:
                            blank_frame += 1
                    # print('effect_start:', effect_start)
                    # print('effect_end:', effect_end)
                    # print('effect_len:', effect_len)
                    self.win_feature_position += effect_len
                # Slide the audio window
                self.win_audio_position += self.audio_step

            if not self.recording:
                self.max_frame_num = self.frame_num

            if last_clip:
                break

        print('Recognition finished:', ''.join([self.dictionary[x] for x in self.result]))
        # Reset parameters
        self.reset_parameter()
        print("Reset parameters")
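
The inner loop above is a greedy RNN-T (Transducer) decoding pass over one window of encoder output. The same step, distilled into a standalone sketch, assuming model.joint and model.decoder behave exactly as they are used above and that label 0 is the blank:

import torch

def greedy_decode_window(model, enc_states, dec_state, history, dictionary,
                         max_history=40):
    # enc_states: (1, T, H) encoder output for one window.
    words = []
    for t in range(enc_states.shape[1]):
        logits = model.joint(enc_states[:, t, :].view(-1), dec_state.view(-1))
        pred = int(torch.argmax(logits, dim=0).item())  # argmax(softmax(x)) == argmax(x)
        if pred != 0:  # 0 = blank: emit nothing, keep the decoder state
            history.append(pred)
            words.append(dictionary[pred])
            # .cuda() mirrors the examples above; drop it for CPU-only runs
            token = torch.tensor([history[-max_history:]], dtype=torch.long).cuda()
            dec_state = model.decoder(token)[:, -1, :]  # only the last state is kept
    return words, dec_state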
Example #5
config_file = open("config/joint_streaming.yaml")
config = AttrDict(yaml.load(config_file, Loader=yaml.FullLoader))
model = Transducer(config.model)

checkpoint = torch.load(config.training.load_model)
model.encoder.load_state_dict(checkpoint['encoder'])
model.decoder.load_state_dict(checkpoint['decoder'])
model.joint.load_state_dict(checkpoint['joint'])
print('Model loaded')
model.eval()

# Extract audio features
audio, fr = read_wave_from_file(WAVE_OUTPUT_FILENAME)
feature = get_feature(audio, fr)
feature = concat_frame(feature, 3, 0)
feature = subsampling(feature, 3)
feature = torch.from_numpy(feature)
feature = torch.unsqueeze(feature, 0)
# feature = feature[:, :5, :]
print(feature.shape)

mask = context_mask(feature)[:, :, None]
print(mask.shape)

# torch.jit.script
# script_module = torch.jit.script(model)
# script_module.save('model.pt')

# TODO: around line 158 of transformer.py, if/else branches must be removed to
# export a TorchScript module, so the else branch is taken by default, i.e.
# audio length <= 410 and label length <= 42
# encoder_script = torch.jit.trace(model.encoder, (feature, mask))
# encoder_script.save('encoder.pt')
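
If the commented-out trace succeeds, the saved module can be reloaded and run without the Python model class. A short sketch using the standard TorchScript API, reusing feature and mask from above:

import torch

encoder = torch.jit.load('encoder.pt')  # no class definition required
encoder.eval()
with torch.no_grad():
    enc_states = encoder(feature, mask)  # same shapes as the trace inputs
print(enc_states.shape)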
    def start_rec(self):
        self.stream = self.pa.open(format=self.pa.get_format_from_width(
            self.sample_width),
                                   channels=self.channels,
                                   rate=self.rate,
                                   frames_per_buffer=self.chunk,
                                   input=True,
                                   output=False,
                                   stream_callback=self.__callback)

        self.stream.start_stream()
        print("Start recording and recognizing")

        zero_token = torch.tensor([[0]], dtype=torch.long)
        zero_token = zero_token.cuda()
        dec_state = self.model.decoder(zero_token)
        # TODO: detailed streaming recognition loop
        while True:  # outer (audio) window
            # Enough audio to slide the window, OR recording has finished and
            # the remainder is too short to slide it
            if self.win_audio_position + self.win_audio <= self.frame_num \
                    or self.win_audio_position + self.win_audio > self.max_frame_num:

                # Feature-extraction smoothing
                if self.win_audio_position + self.win_audio > self.max_frame_num:
                    print('Last audio clip:', self.win_audio_position, " : ",
                          self.frame_num)
                    win_audio = self.audio_data[self.win_audio_position:self.
                                                frame_num]
                else:
                    win_audio = self.audio_data[self.win_audio_position:self.
                                                win_audio_position +
                                                self.win_audio]
                win_audio_feature = get_feature(win_audio, self.rate, 128)
                # Drop the last 3 frames (leaving 97): their audio data is
                # incomplete. They are recovered because the window step is
                # smaller than the window (smooth audio transition).
                win_audio_feature = win_audio_feature[:-3, :]
                win_audio_feature_len = win_audio_feature.shape[0]
                self.feature_log_mel = np.concatenate(
                    (self.feature_log_mel, win_audio_feature), axis=0)
                # print('1', self.feature_log_mel.shape)
                # print('2', win_audio_feature.shape)

                # Frame-stacking smoothing
                # Take 3 extra frames from the past so stacking has the
                # history it needs (smooth stacking transition)
                win_audio_feature = self.feature_log_mel[
                    -3 - win_audio_feature_len:, :]
                # print('3', win_audio_feature.shape)
                win_audio_feature = concat_frame(win_audio_feature, 3, 0)
                # Drop the 3 extra history frames again: the stacked frames
                # centred on them are not needed (smooth stacking transition)
                win_audio_feature = win_audio_feature[3:, :]
                len_feature_concat_before = self.feature_concat.shape[0]
                # Append all stacked frames of the current window to self.feature_concat
                self.feature_concat = np.concatenate(
                    (self.feature_concat, win_audio_feature), axis=0)
                # Subsampling smoothing: pick the offset that keeps the
                # stride-3 pattern aligned across window boundaries
                if len_feature_concat_before % 3 == 0:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before:, :], 3)
                elif len_feature_concat_before % 3 == 1:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before + 2:, :],
                        3)
                else:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before + 1:, :],
                        3)
                self.feature_subsample = np.concatenate(
                    (self.feature_subsample, win_audio_feature), axis=0)

                # Feature sliding window
                len_feature_subsample = self.feature_subsample.shape[0]
                while True:
                    if self.win_feature_position + self.win_len <= len_feature_subsample:  # enough frames for a recognition step
                        start = self.win_feature_position
                        end = self.win_feature_position + self.win_len
                        win_audio_feature = self.feature_subsample[
                            start:end, :]
                        # Add a batch dimension
                        win_audio_feature = np.expand_dims(win_audio_feature,
                                                           axis=0)
                        win_audio_feature = torch.from_numpy(
                            win_audio_feature).cuda()
                        # Streaming attention mask
                        audio_mask = context_mask(
                            win_audio_feature,
                            left_context=self.left_context,
                            right_context=self.right_context)[:, :,
                                                              None].cuda()
                        win_enc_states = self.model.encoder(
                            win_audio_feature, audio_mask)
                        effect_start = -self.min_win
                        effect_end = -self.min_win + self.pred_frame
                        effect_win_enc_states = win_enc_states[:, effect_start:
                                                               effect_end, :]

                        enc_states_len = effect_win_enc_states.shape[1]
                        for t in range(enc_states_len):
                            logits = self.model.joint(
                                effect_win_enc_states[:, t, :].view(-1),
                                dec_state.view(-1))
                            out = torch.nn.functional.softmax(logits,
                                                              dim=0).detach()
                            pred = torch.argmax(out, dim=0)
                            pred = int(pred.item())
                            if pred != 0:
                                self.result.append(pred)
                                word = self.dictionary[pred]
                                self.text.insert('end', word)
                                self.text.update()
                                result_len = len(self.result)
                                if result_len > 40:
                                    effect_token = self.result[-40:]
                                else:
                                    effect_token = self.result
                                token = torch.tensor([effect_token],
                                                     dtype=torch.long)
                                token = token.cuda()
                                dec_state = self.model.decoder(
                                    token)[:, -1, :]  # feed the full history, keep only the last output
                        # Move the feature window: grow it up to max_win, then slide it
                        if self.win_len < self.max_win:
                            self.win_len += self.pred_frame
                        else:
                            self.win_feature_position += self.pred_frame
                    else:
                        break
                # Slide the audio window
                self.win_audio_position += self.audio_step
            elif self.win_audio_position + self.win_audio > self.max_frame_num:  # recording finished, remaining audio too short to slide the window
                pass

            if self.win_audio_position >= self.max_frame_num:
                print("over")
                break

        # Poll the audio stream until recording finishes (superseded by the loop above)
        # while self.stream.is_active():
        #     time.sleep(1)
        #     self.train.insert('end', '你')
        #     self.train.update()

        self.stream.stop_stream()
        self.stream.close()
        print("Recording stopped, recognition finished")
        print('Recognition result:', ''.join([self.dictionary[x] for x in self.result]))
        file_name = self.save_audio()
        print('Saved audio to:', file_name)
        self.start_button.config(state=tk.ACTIVE)
        self.start_button.update()
        # Reset parameters
        self.reset_parameter()
        print("Reset parameters")