Example #1
    def __getitem__(self, index):
        # Look up the wav file name and transcript for this index,
        # load the audio, and turn it into a spectrogram.
        wav, transcript = self.idx[index]
        wav = feature.load_audio(os.path.join(dataPath, wav))
        spect = feature.spectrogram(wav)
        # Map each transcript character to its label index, dropping
        # characters that are not in the label table.
        transcript = list(
            filter(None, [self.labels.get(x) for x in transcript]))
        return spect, transcript
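Example #1 is a typical PyTorch Dataset __getitem__ that returns a spectrogram of variable length together with a list of label indices, so batching usually needs a custom collate function. The sketch below is an illustration only, assuming spect is a 2-D torch tensor of shape (freq, time) and transcript is a list of ints; the dataset variable and the zero-padding scheme are assumptions, not part of the original code.

import torch
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Sort by spectrogram length (longest first) so the batch can be packed later.
    batch = sorted(batch, key=lambda item: item[0].size(-1), reverse=True)
    freq = batch[0][0].size(0)
    max_len = batch[0][0].size(-1)
    specs = torch.zeros(len(batch), freq, max_len)
    spec_lens, targets, target_lens = [], [], []
    for i, (spect, transcript) in enumerate(batch):
        specs[i, :, :spect.size(-1)] = spect   # zero-pad along the time axis
        spec_lens.append(spect.size(-1))
        targets.extend(transcript)             # flatten all targets, e.g. for CTC loss
        target_lens.append(len(transcript))
    return (specs, torch.tensor(spec_lens),
            torch.tensor(targets), torch.tensor(target_lens))

# dataset is assumed to be an instance of the class that defines __getitem__ above.
loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)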
Example #2
    def _draw_frame(self, framedata):
        x = np.linspace(0, self.chunk - 1, self.chunk)
        y = np.frombuffer(self.read(self.chunk), dtype=np.int16)
        # Special-case flag: set to True when the last chunk of audio is
        # shorter than expected (happens when reading fixed-length audio).
        special_flag = False
        if len(y) == 0:
            return
        if len(y) < self.chunk:
            # Pad so the data length matches the x-axis length.
            y = np.pad(y, (0, self.chunk - len(y)), 'constant')
            special_flag = True
        self.data.append(y)
        if special_flag or self._valid(np.array(self.data[-24::1]).flatten()):
            # Speech-recognition call strategy: once valid sound has been
            # recorded, call the recognizer one last time with the full
            # buffer just before the buffer is trimmed.
            if len(self.data) > 4:
                wav = np.array(self.data).flatten()
                if False:  # local inference
                    han = self.yysb.predict(wav)
                else:  # send to the recognition server
                    try:
                        # numpy arrays are not JSON-serializable, so convert to a list first.
                        han = requests.post(self.httpService,
                                            json={'token': 'SR', 'data': wav.tolist(), 'pre_type': 'H'})
                        han.encoding = 'utf-8'
                        han = han.text
                    except BaseException as e:
                        han = str(e)
                print('Recognized text: {}'.format(han))  # TODO: or pass it to wherever it is needed

            self.data = self.data[-3:]
        # Waveform plot (top).
        self.line1.set_data(x, y)
        # Spectrogram plot (bottom).
        freqs = np.linspace(0, self.chunk, self.chunk // 2)
        y = y.astype('float')
        y = (y - y.mean()) / y.std()
        xfp = feature.spectrogram(y)
        self.line2.set_data(freqs, xfp)
        self._drawn_artists = [self.line1, self.line2]
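Example #2 relies on self._valid(...) to decide whether the recent audio window contains speech, but that helper is not shown. Below is a minimal energy-threshold sketch of what such a check could look like; the method and the threshold value are assumptions, not the project's actual implementation.

import numpy as np

def _valid(self, samples, threshold=500):
    # Hypothetical heuristic: treat the window as speech when the mean
    # absolute int16 amplitude exceeds a fixed threshold.
    return np.abs(samples.astype('float')).mean() > threshold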
Example #3
    def __getitem__(self, index):
        wav, transcript = self.idx[index]
        wav = feature.load_audio(wav)
        spect = feature.spectrogram(wav)
        transcript = list(filter(None, [self.vocabulary.get(x) for x in transcript]))

        return spect, transcript
Example #4
    def predict(self, path):
        # Switch to eval mode, run a single utterance through the CNN,
        # decode it, then restore training mode.
        self.eval()
        wav = feature.load_audio(path)
        spec = feature.spectrogram(wav)
        spec.unsqueeze_(0)  # add a batch dimension
        x_lens = spec.size(-1)
        out = self.cnn(spec)
        out_len = torch.tensor([out.size(-1)])
        text = self.decode(out, out_len)
        self.train()
        return text[0]
Example #5
def predict(f):
    wav = feature.load_audio(f)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)
    with torch.no_grad():
        y = model.cnn(spec)
        y = F.softmax(y, 1)
    y_len = torch.tensor([y.size(-1)])
    y = y.permute(0, 2, 1)  # B * T * V
    print("decoding")
    out, score, offset, out_len = decoder.decode(y, y_len)
    return translate(model.vocabulary, out[0][0], out_len[0][0])
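The decoder.decode(y, y_len) call in Example #5 unpacks into beam results, scores, timestep offsets, and output lengths, which matches the interface of ctcdecode's CTCBeamDecoder. The construction and the translate helper below are sketches based on that assumption rather than code taken from the project.

from ctcdecode import CTCBeamDecoder

# Assumed setup: vocabulary is an index-to-character sequence with the CTC blank at index 0.
decoder = CTCBeamDecoder(model.vocabulary, beam_width=100, blank_id=0, log_probs_input=False)

def translate(vocab, out, out_len):
    # Map the best beam's label indices back to characters.
    return ''.join(vocab[n] for n in out[:out_len].tolist())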
Example #6
    def predict(self, path):
        self.eval()
        # `path` may be either a .wav file path or an already-loaded waveform;
        # str.find() returns -1 (truthy) on a miss, so test the suffix instead.
        if isinstance(path, str) and path.endswith('.wav'):
            wav = feature.load_audio(wav_path=path, wav=None)
        else:
            wav = feature.load_audio(wav=path)
        spec = feature.spectrogram(wav)
        spec.unsqueeze_(0)  # add a batch dimension
        x_lens = spec.size(-1)
        out = self.cnn(spec)
        out_len = torch.tensor([out.size(-1)])
        text = self.decode(out, out_len)
        self.train()
        return text[0]
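Hypothetical usage of Example #6, which accepts either a .wav path or an already-loaded waveform; the file name, the soundfile call, and the model variable (an instance of the class that defines this predict) are assumptions.

import soundfile as sf

print(model.predict('test.wav'))                      # path branch: load inside predict
samples, sample_rate = sf.read('test.wav', dtype='float32')
print(model.predict(samples))                         # raw-waveform branch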