def get_text_from_audio(audio_input_name: str, working_directory: str = WORKING_DIRECTORY):
    """Transcribe an audio file using the pocketsphinx-python library.

    Args:
        audio_input_name: File name of the raw audio inside ``working_directory``.
        working_directory: Directory that contains the audio file.

    Return:
        list: words recognized in the audio file, or a fallback message
        string when no words were decoded.
    """
    # Configure a decoder with the bundled US-English models.
    cfg = Decoder.default_config()
    cfg.set_string('-hmm', os.path.join(SPEECH_MODEL_PATH, 'en-us'))
    cfg.set_string('-lm', os.path.join(SPEECH_MODEL_PATH, 'en-us.lm.bin'))
    cfg.set_string('-dict', os.path.join(SPEECH_MODEL_PATH, 'cmudict-en-us.dict'))
    decoder = Decoder(cfg)

    # Stream the raw audio into the decoder in 1 KiB chunks.
    decoder.start_utt()
    with open(os.path.join(working_directory, audio_input_name), 'rb') as stream:
        while True:
            chunk = stream.read(1024)
            if not chunk:
                break
            decoder.process_raw(chunk, False, False)
    decoder.end_utt()

    text_from_audio = [segment.word for segment in decoder.seg()]
    return text_from_audio if text_from_audio else 'Audio file doesn\'t contain words'
def recog_wav(MODELDIR, wavfile):
    """Decode an audio file with pocketsphinx, print the decode duration
    and the recognized word segments.

    Args:
        MODELDIR: Directory containing the 'en-us' acoustic model,
            'en-us.lm.bin' language model and 'cmudict-en-us.dict'.
        wavfile: Path to the raw audio file to decode.
    """
    # print(MODELDIR)
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(MODELDIR, 'en-us'))
    config.set_string('-lm', os.path.join(MODELDIR, 'en-us.lm.bin'))
    config.set_string('-dict', os.path.join(MODELDIR, 'cmudict-en-us.dict'))

    # Decode streaming data.
    decoder = Decoder(config)
    start = time.time()
    decoder.start_utt()
    # Fix: the original opened the file and never closed it (handle leak);
    # the context manager guarantees closure even if decoding raises.
    with open(wavfile, "rb") as wav_stream:
        while True:
            buffer = wav_stream.read(1024)
            if not buffer:
                break
            decoder.process_raw(buffer, False, False)
    decoder.end_utt()

    duration = time.time() - start
    print("Duration: " + str(duration))  # Benchmarking
    for seg in decoder.seg():
        print(seg.word)
def get_phonemes(file):
    """Decode an audio file and return the list of recognized segment words.

    NOTE(review): relies on a module-level ``config`` (pocketsphinx decoder
    configuration) defined elsewhere in this file — confirm it is set up
    before this function is called.

    Args:
        file: Path to the raw audio file to decode.

    Returns:
        list: the ``word`` of each decoded segment.
    """
    decoder = Decoder(config)  # 'config' comes from module scope
    decoder.start_utt()
    # Fix: close the file deterministically (the original leaked the handle);
    # also dropped the unused 'i = 0' counter and the unused 'Hypothesis'
    # binding, which had no effect on the result.
    with open(file, 'rb') as stream:
        while True:
            buf = stream.read(1024)
            if not buf:
                break
            decoder.process_raw(buf, False, False)
    decoder.end_utt()
    return [seg.word for seg in decoder.seg()]
'''
Created on Dec 29, 2013

@author: Mindaugas Greibus
'''
import sys, os
from pocketsphinx import Decoder

MODELDIR = "../models"

# Create a decoder with the Lithuanian acoustic model, a JSGF grammar
# and its matching dictionary.
config = Decoder.default_config()
config.set_string('-hmm', os.path.join(MODELDIR, 'hmm/lt.cd_cont_200/'))
config.set_string('-jsgf', os.path.join(MODELDIR, 'lm/robotas.gram'))
config.set_string('-dict', os.path.join(MODELDIR, 'dict/robotas.dict'))
decoder = Decoder(config)

# Fix: the original passed an open file object to decode_raw() and never
# closed it; the context manager guarantees the handle is released.
with open(os.path.join(MODELDIR, '../test/audio/varyk_pirmyn-16k.wav'), 'rb') as audio_file:
    decoder.decode_raw(audio_file)

# Retrieve hypothesis.
hypothesis = decoder.hyp()
print('Best hypothesis: ', hypothesis.best_score, hypothesis.hypstr)
print('Best hypothesis segments: ', [seg.word for seg in decoder.seg()])
'''
Created on Dec 29, 2013

@author: Mindaugas Greibus
'''
import sys, os
from pocketsphinx import Decoder

MODELDIR = "../models"

# Create a decoder with the Lithuanian acoustic model, a JSGF grammar
# and its matching dictionary.
config = Decoder.default_config()
config.set_string('-hmm', os.path.join(MODELDIR, 'hmm/lt.cd_cont_200/'))
config.set_string('-jsgf', os.path.join(MODELDIR, 'lm/robotas.gram'))
config.set_string('-dict', os.path.join(MODELDIR, 'dict/robotas.dict'))
decoder = Decoder(config)

# Fix: the original passed an open file object to decode_raw() and never
# closed it; the context manager guarantees the handle is released.
with open(os.path.join(MODELDIR, '../test/audio/varyk_pirmyn-16k.wav'), 'rb') as audio_file:
    decoder.decode_raw(audio_file)

# Retrieve hypothesis.
hypothesis = decoder.hyp()
print ('Best hypothesis: ', hypothesis.best_score, hypothesis.hypstr)
print ('Best hypothesis segments: ', [seg.word for seg in decoder.seg()])
import os
import sys

from pocketsphinx import DefaultConfig, Decoder, get_model_path, get_data_path

model_path = get_model_path()
data_path = get_data_path()

# Create a decoder with a certain model
config = DefaultConfig()
config.set_string('-hmm', os.path.join(model_path, 'en-us'))
config.set_string('-lm', os.path.join(model_path, 'en-us.lm.bin'))
config.set_string('-dict', 'aviation.dict')
# set log level
# config.set_string("-logfn", "null")
decoder = Decoder(config)

# Decode streaming data in fixed-size chunks.
buf = bytearray(1024)
view = memoryview(buf)
with open('subject.wav', 'rb') as f:  # should be raw format with right timing
    decoder.start_utt()
    while True:
        n = f.readinto(buf)
        if not n:
            break
        # Fix: feed only the n bytes actually read. The original passed the
        # whole 1024-byte buffer, so a short final read re-processed stale
        # tail bytes left over from the previous chunk.
        decoder.process_raw(view[:n], False, False)
    decoder.end_utt()

print('Best hypothesis segments:')
for seg in decoder.seg():
    if seg.word != '<sil>':  # skip silence markers
        sys.stdout.write(seg.word)
        sys.stdout.write(' ')
class KeywordSpotting(threading.Thread):
    """Daemon thread that performs keyword spotting on pooled audio frames.

    Pulls fixed-size chunks from the global processed-input pool, runs a
    pocketsphinx keyphrase search ('alexa') over them and, on detection,
    resamples and zero-pads the chunk before pushing it to the keyword pool.
    """

    def __init__(self, in_fs, out_fs, mute_period_length, kws_frame_length):
        threading.Thread.__init__(self)
        # Initial configuration.
        self.daemon = True
        self.exit_flag = False
        self.in_fs = in_fs    # input sample rate (Hz)
        self.out_fs = out_fs  # output sample rate (Hz)
        self.mute_period_frames_count = int(in_fs * mute_period_length)
        self.kws_frames_count = int(in_fs * kws_frame_length)

        model_path = get_model_path()
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(model_path, 'en-us'))  # acoustic model path
        # config.set_string('-lm',"./tests/7567.lm")
        config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))  # dictionary path
        config.set_string('-keyphrase', 'alexa')
        config.set_float('-kws_threshold', 1e-20)
        config.set_string('-logfn', './logs/tmp')  # redirect INFO output elsewhere
        self.decoder = Decoder(config)
        self.decoder.start_utt()

        self.start()

    def run(self):
        while not self.exit_flag:
            # 1. Read a fixed amount of data from the input pool; this may
            #    block until enough data is available.
            processed_input_frames = global_var.processed_input_pool.get(
                self.kws_frames_count)
            # 2. If keyword spotting detects the keyphrase in this segment,
            #    resample it, pad it, and store it in the keyword pool.
            if self._kws(processed_input_frames):
                global_var.keyword_pool.put(
                    self._padding(
                        Resampler.resampling(processed_input_frames,
                                             self.in_fs, self.out_fs), 0,
                        self.mute_period_frames_count))

    def stop(self):
        """Signal the worker loop to exit and wait for it to finish."""
        self.exit_flag = True
        self.join()

    def _kws(self, frames):
        # Feed the raw bytes to the decoder; a non-None hypothesis means the
        # keyphrase was spotted, after which the search is restarted.
        buf = frames.tobytes()
        if buf:
            self.decoder.process_raw(buf, False, False)
        if self.decoder.hyp() is not None:  # fix: identity check instead of '!= None'
            print([(seg.word, seg.prob, seg.start_frame, seg.end_frame)
                   for seg in self.decoder.seg()])
            print("Detected keyphrase, restarting search")
            self.decoder.end_utt()
            self.decoder.start_utt()
            return True
        return False

    def _padding(self, frames, padding_value, padding_num):
        # Append padding_num copies of padding_value to the end of frames.
        res = np.pad(frames, (0, padding_num), 'constant',
                     constant_values=(padding_value, padding_value))
        return res
config.set_string('-allphone', os.path.join(model_path, 'en-us-phone.lm.bin')) config.set_string('-lm', os.path.join(model_path, 'en-us.lm.bin')) config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict')) config.set_float('-lw', 2.0) config.set_float('-beam', 1e-10) config.set_float('-pbeam', 1e-10) decoder = Decoder(config) # Decode streaming data buf = bytearray(1024) with open(path.join(FilePath, 'amol.wav'), 'rb') as f: decoder.start_utt() while f.readinto(buf): decoder.process_raw(buf, False, False) decoder.end_utt() print('Phonemes: ', [seg.word for seg in decoder.seg()]) print('-' * 28) print('| %5s | %3s | %4s |' % ('start', 'end', 'word')) print('-' * 28) for s in decoder.seg(): s.start_frame print('| %4ss | %4ss | %8s |' % (s.start_frame / fps, s.end_frame / fps, s.word)) print('-' * 28) #hypothesis = decoder.hyp() #print(hypothesis) # plot the graph '''fig = plt.figure(figsize=(12, 6)) plt.subplots_adjust(hspace=0.5) for index, filename in enumerate(recordings, start=1):