def __getitem__(self, index):
    wav, transcript = self.idx[index]
    # Load the waveform relative to the data directory and turn it into a spectrogram.
    wav = feature.load_audio(os.path.join(dataPath, wav))
    spect = feature.spectrogram(wav)
    # Map each character of the transcript to its label index, dropping
    # characters that are missing from the label table.
    transcript = list(filter(None, [self.labels.get(x) for x in transcript]))
    return spect, transcript
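Because __getitem__ returns variable-length (spectrogram, transcript) pairs, a DataLoader still needs a collate function before the samples can be batched for CTC training. The sketch below is a minimal, assumption-laden example: the padding layout, the dataset name SpeechDataset, and the loader parameters are illustrative guesses, not part of the original project.

import torch
from torch.utils.data import DataLoader

def collate_fn(batch):
    # batch: list of (spect, transcript) pairs as returned by __getitem__ above.
    # Pad spectrograms along the time axis and concatenate label indices the
    # way nn.CTCLoss expects; all names here are illustrative.
    batch = sorted(batch, key=lambda item: item[0].shape[-1], reverse=True)
    freq_bins = batch[0][0].shape[0]
    max_len = batch[0][0].shape[-1]
    spects = torch.zeros(len(batch), freq_bins, max_len)
    spect_lens, labels, label_lens = [], [], []
    for i, (spect, transcript) in enumerate(batch):
        spect = torch.as_tensor(spect, dtype=torch.float32)
        spects[i, :, :spect.shape[-1]] = spect
        spect_lens.append(spect.shape[-1])
        labels.extend(transcript)
        label_lens.append(len(transcript))
    return (spects, torch.tensor(spect_lens),
            torch.tensor(labels), torch.tensor(label_lens))

# Hypothetical usage:
# loader = DataLoader(SpeechDataset(...), batch_size=8, collate_fn=collate_fn)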
def _draw_frame(self, framedata):
    x = np.linspace(0, self.chunk - 1, self.chunk)
    y = np.frombuffer(self.read(self.chunk), dtype=np.int16)
    # Special-case flag: set to True when the last piece of audio is shorter than
    # a full chunk, which mainly happens when reading fixed-length audio.
    special_flag = False
    if len(y) == 0:
        return
    if len(y) < self.chunk:
        # Pad so the data length matches the x-axis length.
        y = np.pad(y, (0, self.chunk - len(y)), 'constant')
        special_flag = True
    self.data.append(y)
    if special_flag or self._valid(np.array(self.data[-24::1]).flatten()):
        # Changed how recognition is invoked: once valid sound has started being
        # recorded, call recognition a single time with the complete buffered data,
        # just before the buffer is about to be trimmed.
        if len(self.data) > 4:
            wav = np.array(self.data).flatten()
            if False:  # local mode
                han = self.yysb.predict(wav)
            else:  # send to the recognition server
                try:
                    han = requests.post(self.httpService,
                                        json={'token': 'SR',
                                              'data': wav.tolist(),  # ndarray is not JSON serializable
                                              'pre_type': 'H'})
                    han.encoding = 'utf-8'
                    han = han.text
                except BaseException as e:
                    han = str(e)
            print('Recognized text: {}'.format(han))  # TODO: pass the result to wherever it is needed
        self.data = self.data[-3:]
    # Waveform plot (top)
    self.line1.set_data(x, y)
    # Windowed spectrum plot (bottom)
    freqs = np.linspace(0, self.chunk, self.chunk // 2)  # linspace needs an integer sample count
    y = y.astype('float')
    y = (y - y.mean()) / y.std()
    xfp = feature.spectrogram(y)
    self.line2.set_data(freqs, xfp)
    self._drawn_artists = [self.line1, self.line2]
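The requests.post branch above assumes a recognition service listening at self.httpService that accepts a JSON body with token, data and pre_type fields and returns the recognized text as the plain response body. A minimal sketch of such an endpoint follows; Flask, the route name, and the recognizer object are assumptions, not taken from the original code.

from flask import Flask, request
import numpy as np

app = Flask(__name__)
recognizer = None  # placeholder: assign a loaded model exposing predict(wav) here

@app.route('/recognize', methods=['POST'])
def recognize():
    payload = request.get_json()
    if payload.get('token') != 'SR':
        return 'invalid token', 403
    # The client sends the waveform as a plain list of int16 samples.
    wav = np.array(payload['data'], dtype=np.float32)
    text = recognizer.predict(wav)
    # The client reads the recognized string directly from response.text.
    return text

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)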
def __getitem__(self, index):
    wav, transcript = self.idx[index]
    wav = feature.load_audio(wav)
    spect = feature.spectrogram(wav)
    # Map each character to its vocabulary index; filter(None, ...) drops
    # characters missing from the vocabulary (and any that map to index 0).
    transcript = list(filter(None, [self.vocabulary.get(x) for x in transcript]))
    return spect, transcript
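feature.load_audio and feature.spectrogram are used throughout but not shown. A plausible minimal implementation, assuming 16 kHz mono input and a per-utterance-normalized log-magnitude STFT, is sketched below; the audio backend, window length, stride, and normalization are guesses, not the project's actual values.

import numpy as np
import torch
import soundfile as sf  # assumed audio backend; the project may use wave or librosa instead

def load_audio(path):
    # Read a mono waveform as float samples.
    wav, _sr = sf.read(path, dtype='float32')
    return wav

def spectrogram(wav, n_fft=400, hop=160):
    # Log-magnitude STFT normalized per utterance: shape (n_fft // 2 + 1, time).
    wav = torch.as_tensor(wav, dtype=torch.float32)
    stft = torch.stft(wav, n_fft=n_fft, hop_length=hop,
                      window=torch.hamming_window(n_fft), return_complex=True)
    spec = torch.log1p(stft.abs())
    return (spec - spec.mean()) / (spec.std() + 1e-8)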
def predict(self, path):
    self.eval()  # switch to evaluation mode for inference
    wav = feature.load_audio(path)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension: (1, freq, time)
    out = self.cnn(spec)
    out_len = torch.tensor([out.size(-1)])
    text = self.decode(out, out_len)
    self.train()  # restore training mode
    return text[0]
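self.decode is not shown here. For the greedy (best-path) variant it would typically pick the most likely character per frame, collapse consecutive repeats, and drop the CTC blank. A hedged sketch under the assumptions that the blank index is 0 and that int2char maps an index to its character (neither is confirmed by the original code):

import torch

def greedy_decode(out, out_lens, int2char, blank=0):
    # out: (B, V, T) network output; take the argmax character per frame,
    # collapse consecutive repeats, then remove the blank symbol.
    best = out.argmax(dim=1)  # (B, T)
    texts = []
    for indices, length in zip(best, out_lens):
        prev = blank
        chars = []
        for idx in indices[:int(length)].tolist():
            if idx != blank and idx != prev:
                chars.append(int2char[idx])
            prev = idx
        texts.append(''.join(chars))
    return texts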
def predict(f):
    wav = feature.load_audio(f)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)  # add a batch dimension
    with torch.no_grad():
        y = model.cnn(spec)
        y = F.softmax(y, 1)  # per-frame character probabilities
    y_len = torch.tensor([y.size(-1)])
    y = y.permute(0, 2, 1)  # B * T * V, the layout the beam-search decoder expects
    print("decoding")
    out, score, offset, out_len = decoder.decode(y, y_len)
    return translate(model.vocabulary, out[0][0], out_len[0][0])
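translate is referenced above but not defined; it presumably maps the top beam's label indices back to characters using the model vocabulary. A minimal sketch under that assumption, where vocab is indexable by position (a list or string):

def translate(vocab, out, out_len):
    # out: 1-D tensor of label indices from the beam decoder;
    # out_len: how many of those indices are valid for the top hypothesis.
    return ''.join(vocab[x] for x in out[:int(out_len)].tolist())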
def predict(self, path):
    self.eval()
    # Accept either a path to a .wav file or an already-loaded waveform array.
    # (The original check `path.find('.wav')` was truthy even when '.wav' was absent.)
    if isinstance(path, str) and path.endswith('.wav'):
        wav = feature.load_audio(wav_path=path, wav=None)
    else:
        wav = feature.load_audio(wav=path)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)
    out = self.cnn(spec)
    out_len = torch.tensor([out.size(-1)])
    text = self.decode(out, out_len)
    self.train()
    return text[0]
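Usage under the assumption that a full model object was saved and can be restored with torch.load; the checkpoint path below is a placeholder.

import numpy as np
import torch

model = torch.load('pretrained/model.pth')      # assumed: a saved full model object
print(model.predict('recording.wav'))           # a path ending in .wav is loaded from disk
wav = np.random.randn(16000).astype('float32')  # an already-loaded waveform also works
print(model.predict(wav))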