def voice_recongnition(pcm_file):
    """Recognize the speech in a local PCM file with Baidu ASR.

    Args:
        pcm_file: Path of the audio file; it is read with ``get_file_content``
            and submitted as 16000 Hz ``pcm`` with ``dev_pid`` 1536.

    Returns:
        The ``result`` entry of the service response on success, otherwise
        ``[None]`` (request failed, or the response carries no ``result``).
    """
    # NOTE(review): credentials are hard-coded in source; consider moving
    # them to configuration or environment variables.
    APP_ID = '16184075'
    API_KEY = 'OY4u5LVYfQUPB3oLEcfm1DNP'
    SECRET_KEY = 'm3MQrGkLTXGO3mgnljXtOBSXB8dG8HGp'

    client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
    client.setConnectionTimeoutInMillis(5000)
    client.setSocketTimeoutInMillis(10000)

    # Recognize the local file.
    try:
        results = client.asr(get_file_content(pcm_file), 'pcm', 16000, {
            'dev_pid': 1536,
        })
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        print("无法连接服务器")
        return [None]

    # An error payload has no 'result' key; report it and return the same
    # failure value as above instead of raising KeyError.
    if not isinstance(results, dict) or 'result' not in results:
        print(results)
        return [None]
    return results['result']
def stringToMp3(strings):
    """Synthesize a fixed greeting plus two caller strings into one MP3.

    The greeting, ``strings[0]`` and ``strings[1]`` are synthesized in that
    order and written back to back into ``test_tmp.mp3`` in the current
    working directory. A segment whose synthesis fails (the SDK returns a
    dict instead of audio bytes) is reported on stdout and skipped.

    Args:
        strings: Indexable holding at least two text strings.

    Raises:
        IndexError: If ``strings`` has fewer than two items.
    """
    strings_txt = '又是新的一天。主人起床呀,懒虫,起床咯,死肥宅,起床啦。要上课啦!'
    print(strings[0], strings[1])

    APPID = '******'
    APIKey = '*******'
    SecretKey = '***********************'

    aipSpeech = AipSpeech(APPID, APIKey, SecretKey)
    aipSpeech.setConnectionTimeoutInMillis(5000)
    aipSpeech.setSocketTimeoutInMillis(100000)

    # (text, speaking speed) for each segment, in output order.
    segments = (
        (strings_txt, 5),
        (strings[0], 6),
        (strings[1], 6),
    )

    # Open the output once in 'wb' so the file is always truncated first;
    # otherwise a failed first segment would leave later segments appended
    # to stale audio from a previous run.
    with open('test_tmp.mp3', 'wb') as f:
        for text, speed in segments:
            result = aipSpeech.synthesis(text, 'zh', '1',
                                         {'vol': 10, 'per': '0', 'spd': speed})
            if isinstance(result, dict):
                # A dict is an error payload: print it (the original printed
                # the builtin ``dict`` type by mistake) and skip the segment.
                print(result)
                continue
            f.write(result)
def init_aip():
    """Create the module-level Baidu speech and OCR clients.

    Binds the globals ``aip_client`` (an ``AipSpeech`` with a 1000 ms
    connection timeout and a 3000 ms socket timeout) and ``aip_client_ocr``
    (an ``AipOcr`` left at its default timeouts).

    NOTE(review): this file defines ``init_aip`` twice with identical bodies;
    the later definition is the one that takes effect. Credentials are
    hard-coded in source.
    """
    global aip_client
    global aip_client_ocr

    speech_credentials = (
        '11758766',
        'I5yAyCHCEw5eQhjG0nuXkgHr',
        '8OxzRjt1dMrRgtsmTHTLpH1SxWBywoju',
    )
    ocr_credentials = (
        '14462175',
        'KG8eaIU0ouFCaMLnQZ2u3IX1',
        'waX99HpVyaukCX6aruuazGNo6zaVOcVu',
    )

    aip_client = AipSpeech(*speech_credentials)
    # Connection timeout.
    aip_client.setConnectionTimeoutInMillis(1000)
    # Data-transfer timeout.
    aip_client.setSocketTimeoutInMillis(3000)

    aip_client_ocr = AipOcr(*ocr_credentials)
def init_aip():
    """Create the module-level Baidu speech and OCR clients.

    Binds the globals ``aip_client`` (an ``AipSpeech`` with a 1000 ms
    connection timeout and a 3000 ms socket timeout) and ``aip_client_ocr``
    (an ``AipOcr`` left at its default timeouts).

    NOTE(review): this file defines ``init_aip`` twice with identical bodies;
    the later definition is the one that takes effect. Credentials are
    hard-coded in source.
    """
    global aip_client
    global aip_client_ocr

    speech_credentials = (
        '11758766',
        'I5yAyCHCEw5eQhjG0nuXkgHr',
        '8OxzRjt1dMrRgtsmTHTLpH1SxWBywoju',
    )
    ocr_credentials = (
        '14462175',
        'KG8eaIU0ouFCaMLnQZ2u3IX1',
        'waX99HpVyaukCX6aruuazGNo6zaVOcVu',
    )

    aip_client = AipSpeech(*speech_credentials)
    # Connection timeout.
    aip_client.setConnectionTimeoutInMillis(1000)
    # Data-transfer timeout.
    aip_client.setSocketTimeoutInMillis(3000)

    aip_client_ocr = AipOcr(*ocr_credentials)
class BaiduTTS:
    """Baidu speech client configured from ROS private parameters.

    Offers text-to-speech (``tts``) and speech recognition (``asr``) through
    a single ``AipSpeech`` client.
    """

    def __init__(self):
        """Read ``~app_id``, ``~api_key``, ``~secret_key`` and ``~timeout``
        and build the client; ``~timeout`` (converted with ``int``) is used
        for both the connection and the socket timeout.
        """
        credentials = [
            rospy.get_param(name)
            for name in ("~app_id", "~api_key", "~secret_key")
        ]
        timeout_ms = int(rospy.get_param("~timeout"))

        self.client = AipSpeech(*credentials)
        self.client.setConnectionTimeoutInMillis(timeout_ms)
        self.client.setSocketTimeoutInMillis(timeout_ms)

    def tts(self, words):
        """Synthesize ``words`` and wrap the audio in an ``AudioData``.

        Returns:
            An ``AudioData`` whose ``data`` is the synthesis result, or
            ``None`` (after logging the payload) when the SDK returns a dict,
            which this method treats as an error.
        """
        synthesized = self.client.synthesis(words, 'zh', 1, {'vol': 5})
        if not isinstance(synthesized, dict):
            message = AudioData()
            message.data = synthesized
            return message

        rospy.logerr(synthesized)
        return None

    def asr(self, audio_data):
        """Recognize the audio carried by ``audio_data.data``.

        The data is joined with ``"".join`` and submitted as 16000 Hz pcm
        with ``dev_pid`` 1936.

        Returns:
            The first recognition result encoded as UTF-8 when the response
            has ``err_no == 0``; otherwise ``""`` after logging whichever of
            ``err_msg`` / ``error_msg`` is present.
        """
        # NOTE(review): "".join assumes audio_data.data iterates as text
        # chunks (e.g. a Python 2 str) — confirm against the message type.
        payload = "".join(audio_data.data)
        response = self.client.asr(payload, 'pcm', 16000, {'dev_pid': 1936})
        rospy.loginfo(response)

        succeeded = "err_no" in response and response["err_no"] == 0
        if succeeded:
            rospy.loginfo(response["result"])
            return response["result"][0].encode("utf-8")

        for key in ("err_msg", "error_msg"):
            if key in response:
                rospy.logerr(response[key])
        return ""
class RcgCore(object):
    """Run Baidu speech recognition on one wav file, with retries.

    No longer runs in a thread: call ``run()`` and then ``get_result()``.
    """

    def __init__(self, wav_file, timeout=600, bd_appid=None, bd_api_key=None,
                 bd_secret_key=None, use_pro_api=True):
        """Prepare the recognizer.

        Args:
            wav_file: Audio source handed to ``utils`` for reading.
            timeout: Connection and socket timeout, in seconds.
            bd_appid, bd_api_key, bd_secret_key: Optional Baidu credentials.
                When all three are given a dedicated client is created from
                them; otherwise the module-level ``aip_speech`` is used.
            use_pro_api: Use ``asr_pro`` (16 kHz, dev_pid 80001) instead of
                ``asr`` (8 kHz, dev_pid 1537).
        """
        self.wav_file = wav_file
        self.result = None
        if bd_appid and bd_api_key and bd_secret_key:
            # Use the credentials the caller supplied; the original ignored
            # them and always built the client from config.BD_RCG_*.
            self.aip_speech = AipSpeech(bd_appid, bd_api_key, bd_secret_key)
        else:
            self.aip_speech = aip_speech
        # NOTE: in the fallback case this changes the timeouts of the shared
        # module-level client.
        self.aip_speech.setConnectionTimeoutInMillis(timeout * 1000)
        self.aip_speech.setSocketTimeoutInMillis(timeout * 1000)
        self.use_pro_api = use_pro_api

    def run(self):
        """Recognize the audio, storing the outcome in ``self.result``.

        Makes up to ``config.RCG_MAX_RETRY + 1`` attempts. ``self.result``
        becomes the first recognized text on success, ``''`` when the service
        reports error 3301 (poor audio quality, not retried), and stays
        unchanged if every attempt fails. Exceptions raised during an attempt
        are logged and lead to the next attempt; none propagate from the loop.
        """
        if self.use_pro_api:
            # presumably resamples the 8 kHz wav to 16 kHz into the buffer —
            # verify against utils.wav_8kto16k
            tmp_wav_path = io.BytesIO()
            utils.wav_8kto16k(self.wav_file, tmp_wav_path)
            file_content = utils.read(tmp_wav_path, 'rb')
        else:
            file_content = utils.read(self.wav_file, 'rb')

        # dev_pid reference:
        #   dev_pid  language                           model         punctuation  custom vocabulary
        #   1536     Mandarin (simple English allowed)  search        no           supported
        #   1537     Mandarin (Chinese only)            input method  yes          not supported
        #   1737     English                            -             no           not supported
        #   1637     Cantonese                          -             yes          not supported
        #   1837     Sichuanese                         -             yes          not supported
        #   1936     Mandarin, far field                far field     yes          not supported
        max_retry = config.RCG_MAX_RETRY
        for retry in range(max_retry + 1):
            try:
                # Declare the format as pcm rather than wav so Baidu does not
                # convert it again (this can avoid err 3301, the audio-quality
                # error, in some cases).
                # Tested with 1537 / 8 kHz at 30 QPS.
                if self.use_pro_api:
                    rst = self.aip_speech.asr_pro(
                        file_content, 'pcm', 16000, {'dev_pid': 80001})
                else:
                    rst = self.aip_speech.asr(
                        file_content, 'pcm', 8000, {'dev_pid': 1537})

                err_no = rst['err_no']
                if err_no == 0:
                    self.result = rst['result'][0]  # recognized text
                    logging.debug("Recognition: %s", self.result)
                    break
                elif err_no in (3304, '3304'):
                    # QPS limit exceeded: wait one second, then retry.
                    logging.warning('qps超限(等待1秒):%s', rst.get('err_msg'))
                    time.sleep(1)
                elif err_no == 3301:
                    # Poor audio quality: return an empty result, no retry.
                    self.result = ''
                    logging.warning('音频质量差:%s', rst.get('err_msg'))
                    break
                else:
                    logging.error('识别错误:%s', rst.get('err_msg'))
                    logging.error(rst)
                    # Caught just below, which logs it and moves on to the
                    # next attempt.
                    raise Exception('Recognition failed!')
            except Exception as e:
                logging.warning('RcgCore: on retry %d:', retry)
                logging.warning(e)

    def get_result(self):
        """Return the value stored by ``run()`` (``None`` if nothing was stored)."""
        return self.result
print('read: ' + filePath) mono = audio.set_frame_rate(16000).set_channels(1) print('set: ' + filePath) mono.export('test.pcm', format="s16le") print('export: ' + filePath) if __name__ == '__main__': APP_ID = '' API_KEY = '' SECRET_KEY = '' client = AipSpeech(APP_ID, API_KEY, SECRET_KEY) client.setConnectionTimeoutInMillis(2000) client.setSocketTimeoutInMillis(60000) path = 'C:\\school\\test.pcm' song = AudioSegment.from_file(path) mss = len(song) start = 0 while start < mss: print(str(start / 1000)[:-2] + ': ', end='') end = start + 30000 if end > mss: end = mss buff = song[start:end].raw_data start = end dic = client.asr(buff, 'pcm', 16000, { 'dev_pid': 1536, })
class Client(object):
    """Text-to-speech helper around ``AipSpeech`` that can split long text."""

    def __init__(self, app_info_file):
        """Load credentials from a JSON file and create the speech client.

        Args:
            app_info_file: Path of a JSON file with the keys ``app_id``,
                ``api_key`` and ``secret_key``.
        """
        # Read app info.
        with open(app_info_file, "rb") as f:
            self.app_info = json.load(f)
        APP_ID = self.app_info["app_id"]
        API_KEY = self.app_info["api_key"]
        SECRET_KEY = self.app_info["secret_key"]
        self.client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
        self.attributes = ClientAttributes()

    def setConnectionTimeoutInMillis(self, ms):
        """
        ms: timeout for establishing the connection (milliseconds)
        """
        self.client.setConnectionTimeoutInMillis(ms)

    def setSocketTimeoutInMillis(self, ms):
        """
        ms: timeout for transferring data over the open connection
            (milliseconds)
        """
        self.client.setSocketTimeoutInMillis(ms)

    def setAttributes(self, attr):
        """
        attr: dict{str: str|int}

        Copies whichever of ``spd``, ``pit``, ``vol`` and ``per`` are present
        in ``attr`` onto ``self.attributes`` as strings; other keys are
        ignored.
        """
        for key in ("spd", "pit", "vol", "per"):
            if key in attr:
                setattr(self.attributes, key, str(attr[key]))

    def synthesis(self, text):
        """
        Simple synthesis of a sentence.

        Returns the binary audio result. If the service returns an error
        payload (a dict), it is printed and ``Exception("Stop")`` is raised.
        """
        attr = {}
        for key in ("spd", "pit", "vol", "per"):
            value = getattr(self.attributes, key)
            if value:  # only forward attributes that have been set
                attr[key] = value
        result = self.client.synthesis(text, 'zh', 1, attr)
        if isinstance(result, dict):
            print("Error:", result)
            raise Exception("Stop")
        return result

    def synthesisLongString(self, text):
        """
        Divide text into pieces at sentence-ending indicators so that every
        synthesis request is at most 1024 UTF-8 bytes.

        Yields the audio pieces in text order. Pieces are cut at sentence
        delimiters first, then at any punctuation, then at single characters.
        """
        max_bytes = 1024
        if utf8len(text) <= max_bytes:
            yield self.synthesis(text)
            return

        # NOTE(review): the line break is written as an escape here; the
        # original literal contained a raw line-break character — confirm
        # which character was intended.
        first_level_indicator = ".;!? \n?。!」「"
        second_level_indicator = string.punctuation + ","

        def split_pieces(chunk, level):
            # Split `chunk` into consecutive pieces whose concatenation is
            # `chunk`; below level 3 each piece ends right after an indicator.
            if level >= 3:
                return list(chunk)
            indicators = (first_level_indicator if level == 1
                          else second_level_indicator)
            # re.escape keeps ']', '^', '-' and '\\' literal inside the class.
            char_class = re.escape(indicators)
            pattern = "[^{0}]*[{0}]|[^{0}]+".format(char_class)
            return re.findall(pattern, chunk)

        def iter_chunks(chunk, level=1):
            # Greedily pack pieces into chunks of at most `max_bytes` bytes.
            cur = ""
            cur_bytes = 0
            for piece in split_pieces(chunk, level):
                piece_bytes = utf8len(piece)
                if piece_bytes > max_bytes:
                    if cur_bytes > 0:
                        yield cur
                        # Reset so the buffer is not emitted a second time.
                        cur = ""
                        cur_bytes = 0
                    # Too long on its own: split with the next, finer level.
                    for sub_chunk in iter_chunks(piece, level + 1):
                        yield sub_chunk
                elif piece_bytes <= max_bytes - cur_bytes:
                    cur += piece
                    cur_bytes += piece_bytes
                else:
                    yield cur
                    cur = piece
                    cur_bytes = piece_bytes
            if cur_bytes > 0:
                # Emit the final buffered chunk (previously dropped).
                yield cur

        for chunk in iter_chunks(text):
            yield self.synthesis(chunk)

    def synthesisFile(self, fp, storeFp, Range=None):
        """
        fp: input file pointer (binary; content is decoded as UTF-8)
        storeFp: output file pointer (binary; receives the audio pieces)
        Range: optional (FROM, TO) pair of 1-based, inclusive line numbers;
            the whole file when None

        Consecutive non-empty lines are joined with newlines into requests of
        at most 1023 bytes; longer content goes through synthesisLongString.
        Progress is written to stdout.
        """
        data = fp.read().decode("utf8").splitlines()
        FROM = 1
        TO = len(data)
        if Range is not None:
            FROM, TO = Range
        total_line = TO - FROM + 1

        def flush(content):
            # Synthesize `content` and append every audio piece to storeFp.
            for audio_piece in self.synthesisLongString(content):
                storeFp.write(audio_piece)

        cur = ""
        cur_bytes = 0
        # Slice [FROM-1:TO] covers lines FROM..TO inclusive; the original
        # range(FROM - 1, TO + 1) ran one line past TO.
        for done, line in enumerate(data[FROM - 1:TO], 1):
            if len(line) > 0:
                line_bytes = utf8len(line)
                if line_bytes + cur_bytes > 1023:
                    if cur_bytes == 0:
                        flush(line)
                    else:
                        flush(cur)
                        cur = line
                        cur_bytes = line_bytes
                elif cur_bytes == 0:
                    cur = line
                    cur_bytes = line_bytes
                else:
                    cur += "\n" + line
                    cur_bytes += line_bytes + 1
            sys.stdout.write("\rTransited: {0}/{1}%".format(done, total_line))
            sys.stdout.flush()

        if cur_bytes > 0:
            # Synthesize whatever is still buffered (previously lost).
            flush(cur)
        print("Finished.")
class BaiduSpeech(object):
    """Callable wrapper around ``AipSpeech`` that retries until it succeeds.

    An instance is bound to one mode, ``'tts'`` or ``'asr'``; calling the
    instance dispatches to the matching method.
    """

    def __init__(self, APP_ID, API_KEY, SECRET_KEY, mode='tts'):
        """Create the client with 10 s connection and socket timeouts.

        Args:
            APP_ID, API_KEY, SECRET_KEY: Baidu credentials.
            mode: ``'tts'`` or ``'asr'``; selects what ``__call__`` runs.

        Raises:
            KeyError: If ``mode`` is neither ``'tts'`` nor ``'asr'``.
        """
        call_dict = {'tts': self.tts, 'asr': self.asr}
        self.client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
        self.client.setConnectionTimeoutInMillis(10000)
        self.client.setSocketTimeoutInMillis(10000)
        # Initial delay in seconds before each request; doubled after every
        # failed attempt.
        self.wait_time = 0.001
        self.call_func = call_dict[mode]

    @staticmethod
    def get_file_content(filePath):
        """Return the whole content of ``filePath`` as bytes."""
        with open(filePath, 'rb') as fp:
            return fp.read()

    def tts(self, texts, param=None):
        """Synthesize every text in ``texts``.

        Each text is retried, with a doubling delay, until the SDK returns
        audio rather than an error dict; there is no retry limit.

        Args:
            texts: Iterable of strings.
            param: Optional dict of synthesis options passed to the SDK.

        Returns:
            List of audio results, one per text, in input order.
        """
        options = {} if param is None else param
        audios = []
        for text in texts:
            wait_time = self.wait_time
            while True:
                time.sleep(wait_time)
                try:
                    audio = self.client.synthesis(text, "zh", 1, options)
                except Exception as e:
                    wait_time *= 2
                    print("Exception occur:{}".format(e))
                    continue
                if not isinstance(audio, dict):
                    audios.append(audio)
                    break
                # A dict is an error payload; back off before retrying so a
                # persistent error does not hammer the service.
                print("return error:{}".format(audio))
                wait_time *= 2
        return audios

    def asr(self, audio_file, param=None):
        """Recognize the speech stored in ``audio_file``.

        The request is retried, with a doubling delay, until the service
        answers with ``err_no == 0``; there is no retry limit.

        Args:
            audio_file: Path of the audio file.
            param: Optional dict. ``format`` (default ``'wav'``) and ``rate``
                (default 16000) are taken out and passed as the audio format
                and sample rate; the remaining entries are sent as SDK
                options. The caller's dict is not modified.

        Returns:
            The ``result`` entry of the successful response (``''`` if that
            entry is absent).
        """
        # Work on a copy: popping from the argument would mutate the caller's
        # dict (or a shared default).
        options = dict(param) if param else {}
        audio_format = options.pop('format', 'wav')
        audio_rate = options.pop('rate', 16000)

        # Read the audio once; the same bytes are reused for every retry.
        audio_data = self.get_file_content(audio_file)

        wait_time = self.wait_time
        while True:
            time.sleep(wait_time)
            try:
                text_json = self.client.asr(
                    speech=audio_data,
                    format=audio_format,
                    rate=audio_rate,
                    options=options,
                )
            except Exception as e:
                wait_time *= 2
                print("Exception occur:{}".format(e))
                continue

            # The response is a dict; the original tested
            # `not isinstance(..., dict)`, so success was never detected and
            # the loop never ended.
            if isinstance(text_json, dict):
                if text_json.get('err_no', 0) == 0:
                    return text_json.get('result', '')
                print("error occur", text_json)
            else:
                print("return error:{}".format(text_json))
            wait_time *= 2

    def __call__(self, data, param=None):
        """Run the method selected by ``mode`` on ``data``."""
        return self.call_func(data, param)