def joint_feature():
    joint_root = 'myjoint/'  # TODO: Change to joint root
    feature_save_root = '/media/dapeng/Downloads/DataSet/Audio/Chinese/joint_feature'
    subdirs = ['train', 'dev', 'test']
    left_context_width = 3
    right_context_width = 0
    subsample = 3
    feature_dim = 128
    for subdir in subdirs:
        part = 0
        num = 0
        limit = 1000
        feats_file = os.path.join(joint_root, subdir, 'feats.scp')
        feats_feature = os.path.join(joint_root, subdir, 'feats_feature.scp')
        save_path = ''
        with open(feats_file, 'r', encoding='utf-8') as rf:
            lines = rf.readlines()
        with open(feats_feature, 'w', encoding='utf-8') as wf:
            for line in tqdm(lines):
                if num % limit == 0:
                    save_path = os.path.join(feature_save_root, subdir, str(part))
                    os.makedirs(save_path, exist_ok=True)  # tolerate re-runs
                    part += 1
                parts = line.strip().split(' ')
                name = parts[0]
                path = parts[1]
                wave_data, frame_rate = read_wave_from_file(path)
                features = get_feature(wave_data, frame_rate, feature_dim)
                features = concat_frame(features, left_context_width, right_context_width)
                features = subsampling(features, subsample)
                save_file = os.path.join(save_path, name + '.npy')
                np.save(save_file, features)
                wf.write(name + ' ' + save_file + '\n')
                num += 1
        delete_final_line(feats_feature)
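
The pipeline above depends on concat_frame and subsampling, which are defined elsewhere in the project. Below is a minimal sketch of what they plausibly do, inferred only from how they are called here (context stacking along the feature axis, then keeping every n-th frame); the _sketch names are hypothetical and the project's real implementations may differ.

import numpy as np

def concat_frame_sketch(features, left_context_width, right_context_width):
    # Stack each frame with its left/right neighbours along the feature axis,
    # clamping at the sequence edges.
    time_steps, feature_dim = features.shape
    width = left_context_width + 1 + right_context_width
    result = np.zeros((time_steps, width * feature_dim), dtype=features.dtype)
    for t in range(time_steps):
        for offset in range(-left_context_width, right_context_width + 1):
            src = min(max(t + offset, 0), time_steps - 1)
            col = offset + left_context_width
            result[t, col * feature_dim:(col + 1) * feature_dim] = features[src]
    return result

def subsampling_sketch(features, rate):
    # Keep every `rate`-th frame to shorten the encoder input.
    return features[::rate, :]

With left_context_width=3, right_context_width=0 and subsample=3 as above, every kept frame still carries its three dropped predecessors inside its stacked feature vector.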
Example #2
def pred():
    model, vocab = init_model()

    audio, fr = read_wave_from_file(WAVE_OUTPUT_FILENAME)
    feature = get_feature(audio, fr)
    feature = concat_frame(feature, 3, 0)
    feature = subsampling(feature, 3)
    feature = torch.from_numpy(feature)
    feature = torch.unsqueeze(feature, 0)
    feature = feature.cuda()
    feature_len = feature.shape[1]  # renamed: don't shadow the built-in len()
    feature_len = torch.tensor([feature_len])
    feature_len = feature_len.cuda()
    audio_mask = context_mask(feature)[:, :, None]  # streaming recognition mask
    # preds = model.recognize(feature, feature_len, audio_mask)
    preds = model.recognize_beam_search(feature, feature_len, audio_mask)
    preds = dict_map(preds, vocab)
    groundtruth = ["疑点之一美方对境内疫情发展时时间线一直讳莫如深唯恐避之不及这不由令人质疑其疫情爆发的时间起点疑点之二"]
    # groundtruth = ["那叫名人呢干嘛要划类啊一分类就有就有帮派了嘛人不要那么化类就是会有对立面不好所以我说通常有命题的话题都不要提"]
    res = ''.join(preds[0])
    dist, num = computer_cer([res], groundtruth)
    print(dist / num, res)
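
context_mask is not shown in these examples. Below is a minimal sketch of a limited-context (band) attention mask consistent with how it is called above; the defaults are hypothetical and the project's actual signature and return shape may differ.

import torch

def context_mask_sketch(feature, left_context=None, right_context=0):
    # Band mask over time: step t may attend to [t - left_context, t + right_context].
    # Returns a bool mask of shape (batch, time, time); left_context=None
    # means unlimited history.
    batch, time = feature.shape[0], feature.shape[1]
    idx = torch.arange(time)
    rel = idx[None, :] - idx[:, None]  # rel[i, j] = j - i
    mask = rel <= right_context
    if left_context is not None:
        mask &= rel >= -left_context
    return mask.unsqueeze(0).expand(batch, -1, -1)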
    def __getitem__(self, index):

        audio_path = self.df.iloc[index, 0]
        label = self.df.iloc[index, 1]

        targets = np.array(self.encode(label))
        wave_data, frame_rate = read_wave_from_file(audio_path)
        # Data augmentation
        # wave_data = audio_augment(wave_data)
        # Feature extraction
        # features = get_feature(wave_data, frame_rate, self.feature_dim)
        features = get_feature2(wave_data, frame_rate, self.feature_dim)
        # features = np.load(feats_scp)
        features = concat_frame(features, self.left_context_width,
                                self.right_context_width)
        features = subsampling(features, self.subsample)

        inputs_length = np.array(features.shape[0]).astype(np.int64)
        targets_length = np.array(targets.shape[0]).astype(np.int64)

        features = self.pad(features).astype(np.float32)
        targets = self.pad(targets).astype(np.int64).reshape(-1)

        return features, inputs_length, targets, targets_length
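
self.pad is not shown either. A minimal sketch, assuming it right-pads axis 0 with zeros to a fixed maximum length (max_length here is a hypothetical parameter; the real method likely reads a configured attribute):

import numpy as np

def pad_sketch(inputs, max_length):
    # Right-pad axis 0 with zeros so every sample in a batch has equal length.
    pad_len = max(max_length - inputs.shape[0], 0)
    if inputs.ndim == 1:  # label sequence
        return np.pad(inputs, (0, pad_len), mode='constant')
    return np.pad(inputs, ((0, pad_len), (0, 0)), mode='constant')  # features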
Example #4
    def start_rec(self):
        self.stream = self.pa.open(format=self.pa.get_format_from_width(
            self.sample_width),
                                   channels=self.channels,
                                   rate=self.rate,
                                   frames_per_buffer=self.chunk,
                                   input=True,
                                   output=False,
                                   stream_callback=self.__callback)

        self.stream.start_stream()
        print("Start recording and recognizing")

        zero_token = torch.tensor([[0]], dtype=torch.long)
        zero_token = zero_token.cuda()
        dec_state = self.model.decoder(zero_token)
        last_clip = False
        blank_frame = 0
        # TODO: detailed streaming recognition loop
        while True:  # outer (audio) window
            # print('win_audio_position, ', self.win_audio_position)
            # print('max_frame_num, ', self.max_frame_num)
            # Enough audio to slide the window, OR recording has finished and
            # the remainder is too short to slide it
            if self.win_audio_position + self.win_audio <= self.frame_num \
                    or self.win_audio_position + self.win_audio >= self.max_frame_num != 0:

                # Feature-extraction smoothing
                if self.win_audio_position + self.win_audio >= self.max_frame_num != 0:
                    print('Last audio clip:', self.win_audio_position, " : ",
                          self.frame_num)
                    last_clip = True
                    win_audio = self.audio_data[self.win_audio_position:self.
                                                frame_num]
                else:
                    win_audio = self.audio_data[self.win_audio_position:self.
                                                win_audio_position +
                                                self.win_audio]
                win_audio_feature = get_feature(win_audio, self.rate, 128)
                # Drop the last 3 frames (leaving 97): their audio data is
                # incomplete. They are recovered because the window step is
                # smaller than the window (smooth audio transition).
                win_audio_feature = win_audio_feature[:-3, :]
                win_audio_feature_len = win_audio_feature.shape[0]
                self.feature_log_mel = np.concatenate(
                    (self.feature_log_mel, win_audio_feature), axis=0)
                # print('1', self.feature_log_mel.shape)
                # print('2', win_audio_feature.shape)

                # Frame-stacking smoothing
                # Take 3 extra frames from the past so stacking has the
                # history it needs (smooth stacking transition)
                win_audio_feature = self.feature_log_mel[
                    -3 - win_audio_feature_len:, :]
                # print('3', win_audio_feature.shape)
                win_audio_feature = concat_frame(win_audio_feature, 3, 0)
                # Drop the 3 extra history frames again: the stacked frames
                # centred on them are not needed (smooth stacking transition)
                win_audio_feature = win_audio_feature[3:, :]
                len_feature_concat_before = self.feature_concat.shape[0]
                # Append all stacked frames of the current window to self.feature_concat
                self.feature_concat = np.concatenate(
                    (self.feature_concat, win_audio_feature), axis=0)
                # Subsampling smoothing: pick the offset that keeps the
                # stride-3 pattern aligned across window boundaries
                if len_feature_concat_before % 3 == 0:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before:, :], 3)
                elif len_feature_concat_before % 3 == 1:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before + 2:, :],
                        3)
                else:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before + 1:, :],
                        3)
                self.feature_subsample = np.concatenate(
                    (self.feature_subsample, win_audio_feature), axis=0)

                # Feature sliding window
                len_feature_subsample = self.feature_subsample.shape[0]
                # print('4', len_feature_subsample)
                # Enough future context to process, or this is the final clip
                if len_feature_subsample - self.win_feature_position > self.right_context_len or last_clip:
                    # print('len_feature_subsample', len_feature_subsample)
                    # print('win_feature_position', self.win_feature_position)
                    left_frame = self.left_context_len  # normal left window
                    right_frame = self.right_context_len  # normal right window
                    start = self.win_feature_position - left_frame  # extend back for history frames
                    if start < 0:
                        # print('insufficient history frames')
                        left_frame = self.win_feature_position
                        start = 0
                    end = len_feature_subsample  # current maximum frame count
                    if last_clip:  # recording finished, final segment
                        right_frame = 0
                    win_audio_feature = self.feature_subsample[start:end, :]
                    # Add a batch dimension
                    win_audio_feature = np.expand_dims(win_audio_feature,
                                                       axis=0)
                    win_audio_feature = torch.from_numpy(
                        win_audio_feature).cuda()
                    # Streaming attention mask
                    audio_mask = context_mask(
                        win_audio_feature,
                        left_context=self.left_context,
                        right_context=self.right_context)[:, :, None].cuda()
                    win_enc_states = self.model.encoder(
                        win_audio_feature, audio_mask)

                    effect_start = left_frame
                    # -right_frame would give an empty slice when right_frame
                    # is 0 (last clip), so compute the end index explicitly
                    effect_end = win_enc_states.shape[1] - right_frame
                    effect_win_enc_states = win_enc_states[:, effect_start:effect_end, :]
                    effect_len = effect_win_enc_states.shape[1]

                    enc_states_len = effect_win_enc_states.shape[1]
                    for t in range(enc_states_len):
                        logits = self.model.joint(
                            effect_win_enc_states[:, t, :].view(-1),
                            dec_state.view(-1))
                        out = torch.nn.functional.softmax(logits,
                                                          dim=0).detach()
                        pred = torch.argmax(out, dim=0)
                        pred = int(pred.item())
                        if pred != 0:
                            if blank_frame >= 15:  # sentence break after a long run of blanks
                                self.text.insert('end', '\n')
                                self.text.update()
                                # self.result.clear()
                            self.result.append(pred)
                            word = self.dictionary[pred]
                            self.text.insert('end', word)
                            self.text.update()
                            if len(self.result) > 40:  # cap decoder history at 40 tokens
                                effect_token = self.result[-40:]
                            else:
                                effect_token = self.result
                            token = torch.tensor([effect_token],
                                                 dtype=torch.long)
                            token = token.cuda()
                            dec_state = self.model.decoder(
                                token)[:, -1, :]  # feed the full history, keep only the last output
                            print(blank_frame, word)
                            blank_frame = 0
                        elif pred == 0 and len(self.result) > 0:
                            blank_frame += 1
                    # print('effect_start:', effect_start)
                    # print('effect_end:', effect_end)
                    # print('effect_len:', effect_len)
                    self.win_feature_position += effect_len
                # Slide the audio window
                self.win_audio_position += self.audio_step

            if not self.recording:
                self.max_frame_num = self.frame_num

            if last_clip:
                break

        print('Recognition finished:', ''.join([self.dictionary[x] for x in self.result]))
        # Reset parameters
        self.reset_parameter()
        print("Reset parameters")
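
The inner loop above is a greedy RNN-T (Transducer) decoding pass over one window of encoder output. The same step, distilled into a standalone sketch, assuming model.joint and model.decoder behave exactly as they are used above and that label 0 is the blank:

import torch

def greedy_decode_window(model, enc_states, dec_state, history, dictionary,
                         max_history=40):
    # enc_states: (1, T, H) encoder output for one window.
    words = []
    for t in range(enc_states.shape[1]):
        logits = model.joint(enc_states[:, t, :].view(-1), dec_state.view(-1))
        pred = int(torch.argmax(logits, dim=0).item())  # argmax(softmax(x)) == argmax(x)
        if pred != 0:  # 0 = blank: emit nothing, keep the decoder state
            history.append(pred)
            words.append(dictionary[pred])
            # .cuda() mirrors the examples above; drop it for CPU-only runs
            token = torch.tensor([history[-max_history:]], dtype=torch.long).cuda()
            dec_state = model.decoder(token)[:, -1, :]  # only the last state is kept
    return words, dec_state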
Example #5
config_file = open("config/joint_streaming.yaml")
config = AttrDict(yaml.load(config_file, Loader=yaml.FullLoader))
model = Transducer(config.model)

checkpoint = torch.load(config.training.load_model)
model.encoder.load_state_dict(checkpoint['encoder'])
model.decoder.load_state_dict(checkpoint['decoder'])
model.joint.load_state_dict(checkpoint['joint'])
print('Model loaded')
model.eval()

# Extract audio features
audio, fr = read_wave_from_file(WAVE_OUTPUT_FILENAME)
feature = get_feature(audio, fr)
feature = concat_frame(feature, 3, 0)
feature = subsampling(feature, 3)
feature = torch.from_numpy(feature)
feature = torch.unsqueeze(feature, 0)
# feature = feature[:, :5, :]
print(feature.shape)

mask = context_mask(feature)[:, :, None]
print(mask.shape)

# torch.jit.script
# script_module = torch.jit.script(model)
# script_module.save('model.pt')

# TODO: around line 158 of transformer.py, if/else branches must be removed to
# export a TorchScript module, so the else branch is taken by default, i.e.
# audio length <= 410 and label length <= 42
# encoder_script = torch.jit.trace(model.encoder, (feature, mask))
# encoder_script.save('encoder.pt')
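
If the commented-out trace succeeds, the saved module can be reloaded and run without the Python model class. A short sketch using the standard TorchScript API, reusing feature and mask from above:

import torch

encoder = torch.jit.load('encoder.pt')  # no class definition required
encoder.eval()
with torch.no_grad():
    enc_states = encoder(feature, mask)  # same shapes as the trace inputs
print(enc_states.shape)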
    def start_rec(self):
        self.stream = self.pa.open(format=self.pa.get_format_from_width(
            self.sample_width),
                                   channels=self.channels,
                                   rate=self.rate,
                                   frames_per_buffer=self.chunk,
                                   input=True,
                                   output=False,
                                   stream_callback=self.__callback)

        self.stream.start_stream()
        print("Start recording and recognizing")

        zero_token = torch.tensor([[0]], dtype=torch.long)
        zero_token = zero_token.cuda()
        dec_state = self.model.decoder(zero_token)
        # TODO: detailed streaming recognition loop
        while True:  # outer (audio) window
            # Enough audio to slide the window, OR recording has finished and
            # the remainder is too short to slide it
            if self.win_audio_position + self.win_audio <= self.frame_num \
                    or self.win_audio_position + self.win_audio > self.max_frame_num:

                # Feature-extraction smoothing
                if self.win_audio_position + self.win_audio > self.max_frame_num:
                    print('Last audio clip:', self.win_audio_position, " : ",
                          self.frame_num)
                    win_audio = self.audio_data[self.win_audio_position:self.
                                                frame_num]
                else:
                    win_audio = self.audio_data[self.win_audio_position:self.
                                                win_audio_position +
                                                self.win_audio]
                win_audio_feature = get_feature(win_audio, self.rate, 128)
                # Drop the last 3 frames (leaving 97): their audio data is
                # incomplete. They are recovered because the window step is
                # smaller than the window (smooth audio transition).
                win_audio_feature = win_audio_feature[:-3, :]
                win_audio_feature_len = win_audio_feature.shape[0]
                self.feature_log_mel = np.concatenate(
                    (self.feature_log_mel, win_audio_feature), axis=0)
                # print('1', self.feature_log_mel.shape)
                # print('2', win_audio_feature.shape)

                # Frame-stacking smoothing
                # Take 3 extra frames from the past so stacking has the
                # history it needs (smooth stacking transition)
                win_audio_feature = self.feature_log_mel[
                    -3 - win_audio_feature_len:, :]
                # print('3', win_audio_feature.shape)
                win_audio_feature = concat_frame(win_audio_feature, 3, 0)
                # Drop the 3 extra history frames again: the stacked frames
                # centred on them are not needed (smooth stacking transition)
                win_audio_feature = win_audio_feature[3:, :]
                len_feature_concat_before = self.feature_concat.shape[0]
                # Append all stacked frames of the current window to self.feature_concat
                self.feature_concat = np.concatenate(
                    (self.feature_concat, win_audio_feature), axis=0)
                # Subsampling smoothing: pick the offset that keeps the
                # stride-3 pattern aligned across window boundaries
                if len_feature_concat_before % 3 == 0:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before:, :], 3)
                elif len_feature_concat_before % 3 == 1:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before + 2:, :],
                        3)
                else:
                    win_audio_feature = subsampling(
                        self.feature_concat[len_feature_concat_before + 1:, :],
                        3)
                self.feature_subsample = np.concatenate(
                    (self.feature_subsample, win_audio_feature), axis=0)

                # Feature sliding window
                len_feature_subsample = self.feature_subsample.shape[0]
                while True:
                    if self.win_feature_position + self.win_len <= len_feature_subsample:  # enough frames for a recognition step
                        start = self.win_feature_position
                        end = self.win_feature_position + self.win_len
                        win_audio_feature = self.feature_subsample[
                            start:end, :]
                        # Add a batch dimension
                        win_audio_feature = np.expand_dims(win_audio_feature,
                                                           axis=0)
                        win_audio_feature = torch.from_numpy(
                            win_audio_feature).cuda()
                        # Streaming attention mask
                        audio_mask = context_mask(
                            win_audio_feature,
                            left_context=self.left_context,
                            right_context=self.right_context)[:, :,
                                                              None].cuda()
                        win_enc_states = self.model.encoder(
                            win_audio_feature, audio_mask)
                        effect_start = -self.min_win
                        effect_end = -self.min_win + self.pred_frame
                        effect_win_enc_states = win_enc_states[:, effect_start:
                                                               effect_end, :]

                        enc_states_len = effect_win_enc_states.shape[1]
                        for t in range(enc_states_len):
                            logits = self.model.joint(
                                effect_win_enc_states[:, t, :].view(-1),
                                dec_state.view(-1))
                            out = torch.nn.functional.softmax(logits,
                                                              dim=0).detach()
                            pred = torch.argmax(out, dim=0)
                            pred = int(pred.item())
                            if pred != 0:
                                self.result.append(pred)
                                word = self.dictionary[pred]
                                self.text.insert('end', word)
                                self.text.update()
                                result_len = len(self.result)
                                if result_len > 40:
                                    effect_token = self.result[-40:]
                                else:
                                    effect_token = self.result
                                token = torch.tensor([effect_token],
                                                     dtype=torch.long)
                                token = token.cuda()
                                dec_state = self.model.decoder(
                                    token)[:, -1, :]  # feed the full history, keep only the last output
                        # Move the feature window: grow it up to max_win, then slide it
                        if self.win_len < self.max_win:
                            self.win_len += self.pred_frame
                        else:
                            self.win_feature_position += self.pred_frame
                    else:
                        break
                # Slide the audio window
                self.win_audio_position += self.audio_step
            elif self.win_audio_position + self.win_audio > self.max_frame_num:  # recording finished, remaining audio too short to slide the window
                pass

            if self.win_audio_position >= self.max_frame_num:
                print("over")
                break

        # Poll the audio stream until recording finishes (superseded by the loop above)
        # while self.stream.is_active():
        #     time.sleep(1)
        #     self.train.insert('end', '你')
        #     self.train.update()

        self.stream.stop_stream()
        self.stream.close()
        print("Recording stopped, recognition finished")
        print('Recognition result:', ''.join([self.dictionary[x] for x in self.result]))
        file_name = self.save_audio()
        print('Saved audio to:', file_name)
        self.start_button.config(state=tk.ACTIVE)
        self.start_button.update()
        # Reset parameters
        self.reset_parameter()
        print("Reset parameters")