import os
from os.path import expanduser

from pocketsphinx import Pocketsphinx, get_model_path, get_data_path


def transform_audio_to_text(filename):
    user = expanduser("~")
    path = user + "/DTAI_Internship/src/speech_recognizer_node/data/"
    lm_file = path + "generated_language_model.lm"
    dict_file = path + "generated_dictionary.dic"
    model_path = get_model_path()
    data_path = get_data_path()
    # lm_file and dict_file are absolute paths and are used as-is;
    # only the acoustic model comes from the bundled model directory.
    config = {
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': lm_file,
        'dict': dict_file
    }
    ps = Pocketsphinx(**config)
    ps.decode(audio_file=os.path.join(data_path, filename),
              buffer_size=2048,
              no_search=False,
              full_utt=False)
    text = ps.hypothesis()
    print(text)
    return text
def get_words_from_file(file_path):
    """
    :param file_path: audio file (must be raw 16 kHz 16-bit)
    :return: a list of phrases made of words
    """
    model_path = get_model_path()
    config = {
        'verbose': False,
        'audio_file': file_path,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')
    }
    audio = AudioFile(**config)
    return [str(phrase) for phrase in audio]
def get_phonemes_from_file(file_path):
    """
    :param file_path: audio file (must be raw 16 kHz 16-bit)
    :return: a list of phrases made of phonemes
    """
    model_path = get_model_path()
    config = {
        'verbose': False,
        'audio_file': file_path,
        'buffer_size': 2048,
        'no_search': False,
        'full_utt': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'allphone': os.path.join(model_path, 'en-us/en-us-phone.lm.dmp'),
        'beam': 1e-20,
        'pbeam': 1e-20,
        'lw': 2.0
    }
    audio = AudioFile(**config)
    return [str(phrase) for phrase in audio]
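# Both helpers share the same AudioFile pattern; only the model driving the
# search differs (word language model vs. allphone model). A minimal usage
# sketch follows -- 'hello.raw' is a hypothetical placeholder, and the audio
# must be raw mono 16 kHz 16-bit, as the docstrings above require.
words = get_words_from_file('hello.raw')
phonemes = get_phonemes_from_file('hello.raw')
print(words)     # e.g. ['hello world'] -- one string per detected utterance
print(phonemes)  # e.g. ['SIL HH AH L OW SIL'] -- phoneme sequences instead of words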
def recognize(wav_file):
    """Run speech recognition on a given file."""
    BASE_PATH = os.path.dirname(os.path.realpath(__file__))
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(BASE_PATH, "hmm"))
    config.set_string('-lm', os.path.join(BASE_PATH, "lm", "en-us.lm.bin"))
    config.set_string('-dict', os.path.join(BASE_PATH, "dict", "en_in.dic"))
    speech_rec = Decoder(config)
    print("Decoder Initialized")
    # decode_raw() expects a binary file object, not a wave.Wave_read,
    # so the file is opened with plain open() here.
    with open(wav_file, 'rb') as audio:
        print("AudioFile Loaded")
        speech_rec.decode_raw(audio)
    print("Audio file decoded")
    result = speech_rec.get_hyp()
    print("Result Ready\n")
    return result
def __init__(self, mode):
    # state
    self.micbuf = np.zeros((0, 4), 'uint16')
    self.outbuf = None
    self.buffer_stuff = 0
    self.mode = mode
    self.playchan = 0
    self.playsamp = 0

    # check mode
    if mode not in ("echo", "record", "record4"):
        error("argument not recognised")

    # robot name
    topic_base_name = "/" + os.getenv("MIRO_ROBOT_NAME")

    # publish
    topic = topic_base_name + "/control/stream"
    print("publish", topic)
    self.pub_stream = rospy.Publisher(topic, Int16MultiArray, queue_size=0)

    # subscribe
    topic = topic_base_name + "/sensors/stream"
    print("subscribe", topic)
    self.sub_stream = rospy.Subscriber(topic, UInt16MultiArray,
                                       self.callback_stream,
                                       queue_size=1, tcp_nodelay=True)

    # subscribe
    topic = topic_base_name + "/sensors/mics"
    print("subscribe", topic)
    self.sub_mics = rospy.Subscriber(topic, Int16MultiArray,
                                     self.callback_mics,
                                     queue_size=5, tcp_nodelay=True)

    # report
    print("recording from 4 microphones for", RECORD_TIME, "seconds...")

    ####### Speech Recognition using PocketSphinx #########
    model_path = get_model_path()
    data_path = get_data_path()

    config = {
        'hmm': os.path.join(model_path, 'en-us'),               # hidden Markov acoustic model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),         # language model
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
    }

    ps = Pocketsphinx(**config)
    ps.decode(audio_file="/tmp/input.wav",  # temporary recording written elsewhere
              buffer_size=2048,
              no_search=False,
              full_utt=False)

    print("Recognized: ")
    print(ps.hypothesis())  # output
    print("END")
def __init__(self):
    # state
    self.micbuf = np.zeros((0, 4), 'uint16')
    self.spkrbuf = None
    self.buffer_stuff = 0

    # robot name
    topic_base = "/" + os.getenv("MIRO_ROBOT_NAME") + "/"

    # publish
    topic = topic_base + "control/stream"
    print("publish", topic)
    self.pub_stream = rospy.Publisher(topic, Int16MultiArray, queue_size=0)

    # subscribe
    topic = topic_base + "sensors/stream"
    print("subscribe", topic)
    self.sub_stream = rospy.Subscriber(topic, UInt16MultiArray, self.callback_stream)

    # subscribe
    topic = topic_base + "sensors/mics"
    print("subscribe", topic)
    self.sub_mics = rospy.Subscriber(topic, Int16MultiArray, self.callback_mics)

    # report
    print("recording on 4 microphone channels...")

    ####### Speech Recognition using PocketSphinx #########
    # Obtain audio from a microphone (kept here for reference):
    # r = sr.Recognizer()
    # with sr.callback_mics() as source:
    #     print("Say Hello")
    #     audio = r.listen(source)
    # Write audio as a wav file:
    # with open("./tmp/input.wav", "wb") as f:
    #     f.write(audio.get_wav_data())

    model_path = get_model_path()
    data_path = get_data_path()

    config = {
        'hmm': os.path.join(model_path, 'en-us'),               # hidden Markov acoustic model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),         # language model
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
    }
def getPockerSphinxDecoder():
    model_path = get_model_path()
    config = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')
    }
    return Pocketsphinx(**config)
def getPockerSphinxDecoder():
    model_path = get_model_path()
    config = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),
        'lm': os.path.join(model_path, 'en-us.lm.bin'),
        'dict': os.path.join(model_path, 'cmudict-en-us.dict'),
        # optional speed/accuracy trade-off parameters:
        # 'topn': 2,
        # 'ds': 2,
        # 'maxwpf': 5,
        # 'maxhmmpf': 3000
    }
    return Pocketsphinx(**config)
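# The second variant keeps commented-out tuning parameters (topn, ds, maxwpf,
# maxhmmpf) that trade recognition accuracy for decoding speed. A minimal
# sketch of how such a factory is typically used -- 'sample.wav' is a
# hypothetical placeholder for a 16 kHz 16-bit mono recording:
ps = getPockerSphinxDecoder()
ps.decode(audio_file='sample.wav',
          buffer_size=2048,
          no_search=False,
          full_utt=False)
print(ps.hypothesis())  # best transcription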
def __init__(self):
    model_path = get_model_path()
    print(model_path)
    data_path = get_data_path()

    config = {
        'hmm': os.path.join(model_path, 'en-us'),          # hidden Markov acoustic model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),    # language model
        'dict': os.path.join(model_path, 'testdict.dict')  # custom pronunciation dictionary
    }

    # start the PocketSphinx decoder
    self.ps = Pocketsphinx(**config)

    # variables for audio
    self.micbuf = np.zeros((0, 4), 'uint16')
    self.outbuf = None
    self.buffer_stuff = 0
    self.audio_level = 0
    self.timeofclap = 0
    self.playchan = 0
    self.playsamp = 0
    self.startTime = 0
    self.TimeSinceLast = 0
    self.DemoPause = False
    self.PID = ''
    self.velocity = TwistStamped()

    # variables for illumination
    self.illum = UInt32MultiArray()
    self.illum.data = [0xFFFFFFFF] * 6
    self.illumInt = 0
    self.illumState = 0

    # robot name
    topic_base_name = "/" + os.getenv("MIRO_ROBOT_NAME")

    # publisher for illum to control LEDs while we are processing requests
    topic = topic_base_name + "/control/illum"
    self.pub_illum = rospy.Publisher(topic, UInt32MultiArray, queue_size=0)
    self.velocity_pub = rospy.Publisher(topic_base_name + "/control/cmd_vel",
                                        TwistStamped, queue_size=0)

    # subscribe
    topic = topic_base_name + "/sensors/mics"
    self.sub_mics = rospy.Subscriber(topic, Int16MultiArray, self.callback_mics,
                                     queue_size=1, tcp_nodelay=True)
# Code retested by KhalsaLabs
# You can use your own audio file in this code
# Raw or wav files will work perfectly
# For mp3 files, you need to modify the code (add a codec)

from __future__ import print_function
import os

from pocketsphinx import Pocketsphinx, get_model_path, get_data_path

model_path = get_model_path()
data_path = get_data_path()

config = {
    'hmm': os.path.join(model_path, 'en-us'),
    'lm': os.path.join(model_path, 'en-us.lm.bin'),
    'dict': os.path.join(model_path, 'cmudict-en-us.dict')
}

ps = Pocketsphinx(**config)
ps.decode(
    audio_file=os.path.join(data_path, 'test1.wav'),  # add your audio file here
    buffer_size=2048,
    no_search=False,
    full_utt=False)

print(ps.hypothesis())
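# As the header comment notes, MP3 input needs decoding first. One possible
# approach, sketched below with the pydub library (an assumption -- any MP3
# decoder, including a direct ffmpeg call, works equally well), converts the
# file to the 16 kHz mono 16-bit WAV the bundled models expect. 'speech.mp3'
# is a hypothetical input file.
from pydub import AudioSegment  # pip install pydub; requires ffmpeg

seg = AudioSegment.from_mp3('speech.mp3')
seg = seg.set_frame_rate(16000).set_channels(1).set_sample_width(2)
seg.export('speech.wav', format='wav')  # now decodable by ps.decode(...)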
# Python 3
from os import path

from pocketsphinx import get_model_path, get_data_path

MODELDIR = get_model_path()
DATADIR = get_data_path()
LOGS_DIR = "logs"
KEYPHRASES_DIR = path.join(DATADIR, 'keyphrases')

POCKET_DICTIONARY = path.join(MODELDIR, 'en-us/cmudict-en-us.dict')
POCKET_LANGUAGE_MODEL = path.join(MODELDIR, 'en-us/en-us.lm.bin')
POCKET_HMM_ACOUSTIC_MODEL = path.join(MODELDIR, 'en-us')

# Not yet implemented
# POCKET_KEYPHRASES = path.join(KEYPHRASES_DIR, "keyphrases.txt")
# POCKET_LOG = path.join(LOGS_DIR, 'pocketsphinx-listen.log')

PROFILE_LOCATION = "custom"
PROFILE_FILE = path.join(PROFILE_LOCATION, "profile.json")

# Dict profile containing data
global user_profile

AUDIO_DIR = "res/audio"
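# The module only declares where the profile lives. A plausible loader is
# sketched below, assuming profile.json holds a JSON object; load_profile is a
# hypothetical helper, not part of the original constants module.
import json

def load_profile():
    """Read PROFILE_FILE into the module-level user_profile dict."""
    global user_profile
    with open(PROFILE_FILE) as f:
        user_profile = json.load(f)
    return user_profile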
from pocketsphinx import LiveSpeech, get_model_path, get_data_path

modeldir = 'models/'
datadir = get_data_path()
includedir = 'include/'
SOUND_DIR = 'sounds/'
CONFIG_FILE = 'config.conf'

import sys, os
sys.path.append(os.path.join(includedir))

from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import pyaudio

from include.WindowsInput import *


def setup():
    global config, conf_speak
    config = Decoder.default_config()
    config.set_string('-hmm', os.path.join(modeldir, 'en-us'))
    config.set_string('-dict', os.path.join(modeldir, 'cmudict-en-us.dict'))
    config.set_string('-kws', 'numbers.conf')
    config.set_string('-logfn', 'nul')  # suppress decoder logging (Windows null device)
    # config.set_string('-keyphrase', 'r')
    # config.set_float('-kws_threshold', 1e-10)

    conf_speak = True
    try:
        conf = open(CONFIG_FILE, 'r')
        # speak
        conf_speak = conf.readline().split("=")[1].replace(" ", "") == "true"
    except (IOError, IndexError):
        # the original snippet is truncated here; falling back to the default
        # when the config file is missing or malformed is the natural completion
        pass
def listenRoutine(self):
    r = sr.Recognizer()
    # 'files' is expected to be defined in the enclosing scope
    for each_file in files:
        print(each_file)
        model_path = get_model_path()
        data_path = get_data_path()
        config = {
            'verbose': False,
            'audio_file': os.path.join(os.getcwd(), 'audioFiles', each_file),
            'buffer_size': 2048,
            'no_search': False,
            'full_utt': False,
            'hmm': os.path.join(model_path, 'en-us'),
            'lm': os.path.join(os.getcwd(), "TAR9991/TAR9991/9991.lm"),
            'dict': os.path.join(os.getcwd(), "TAR9991/TAR9991/9991.dic")
        }
        # print(config)
        audio = AudioFile(**config)
        for phrase in audio:
            print(phrase)
        with sr.AudioFile(os.path.join(os.getcwd(), "audioFiles", each_file)) as source2:
            recording = r.record(source2)
            # note: 'en-US' is the standard tag; 'en-EN' is kept from the original
            print(r.recognize_google(recording, language="en-EN", show_all=True))
        exit()  # debug short-circuit: stops after the first file

        if 0:  # disabled alternative path, kept for reference
            eFile = sr.AudioFile(each_file)
            with eFile as source:
                audio = r.record(source)
            print(each_file, type(audio))
            print(r.recognize_google(audio, language="en-EN", show_all=True))
            # print(r.recognize_sphinx(audio, grammar="TAR9991/TAR9991/"))
            exit()

    print("\r\n\r\n*****\r\nr", r)
    # note: multi-word entries like 'hey Lumpy' can never match, since the
    # comparison below checks individual words from text.split(" ")
    list_text = ['a lumpy', 'hey Lumpy', 'lamp', 'Halen', 'Hayden',
                 'listen', 'Listen', 'Lampe', 'lampe']
    stop_flag = True
    duration = 5
    while stop_flag:
        config = {
            'color': {'hue': self.hue, 'saturation': self.saturation},
            'brightness': self.brightness,
            'on': self.on_off,
            'client': 'local'
        }
        print(" - mqtt saved:", config)
        our_device = getaudiodevices()
        print("Detected our mic:", our_device)
        with sr.Microphone(device_index=our_device, sample_rate=48000) as source:
            print("Microphone source:", source, source.__dict__.keys(), source.device_index)
            print(" - Call lampi (", duration, "seconds ) ...")
            print("Set minimum energy threshold to {}".format(r.energy_threshold))
            r.adjust_for_ambient_noise(source)
            audio_data = r.record(source, duration=duration)
        # print(type(audio_data))
        filename = "pre_filtered_" + datetime.now().strftime("%H:%M:%S") + ".wav"
        with open(filename, "wb") as audio_file:
            audio_file.write(audio_data.get_wav_data())
        exit()  # debug short-circuit: the recognition below never runs

        # print(" - Recognizing...")
        # convert speech to text
        try:
            text = r.recognize_google(audio_data, language="en-EN")
            print(" - heard: ", text)
            text = text.split(" ")
            for item in text:
                if item in list_text:
                    print(" - LAMPI detected")
                    pygame.init()
                    pygame.mixer.music.load('this_is_lampi.mp3')
                    pygame.mixer.music.play()
                    time.sleep(3)
                    pygame.mixer.music.fadeout(5)
                    # stop_flag = False
                    self.commandRoutine()
                    break
        except Exception:
            print(" - no word recognized!")
def recognize_sphinx(audio_data, keyword_entries=None, grammar=None):
    """
    Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using CMU Sphinx.

    This version is hard-wired to the bundled US English model. See `Notes on using PocketSphinx
    <https://github.com/Uberi/speech_recognition/blob/master/reference/pocketsphinx.rst>`__ for
    information about installing other languages. This document is also included under
    ``reference/pocketsphinx.rst``.

    If specified, the keywords to search for are determined by ``keyword_entries``, an iterable
    of tuples of the form ``(keyword, sensitivity)``, where ``keyword`` is a phrase and
    ``sensitivity`` is how sensitive to this phrase the recognizer should be, on a scale of 0
    (very insensitive, more false negatives) to 1 (very sensitive, more false positives)
    inclusive. If not specified or ``None``, no keywords are used and Sphinx will simply
    transcribe whatever words it recognizes. Specifying ``keyword_entries`` is more accurate
    than just looking for those same keywords in non-keyword-based transcriptions, because
    Sphinx knows specifically what sounds to look for.

    Sphinx can also handle FSG or JSGF grammars. The parameter ``grammar`` expects a path to
    the grammar file. Note that if a JSGF grammar is passed, an FSG grammar will be created at
    the same location to speed up execution on the next run. If ``keyword_entries`` are passed,
    the content of ``grammar`` is ignored.

    Returns the Sphinx ``pocketsphinx.pocketsphinx.Decoder`` object resulting from the
    recognition; the best transcription is available as ``decoder.hyp().hypstr``.

    Raises a ``speech_recognition.RequestError`` exception if there are any issues with the
    Sphinx installation.
    """
    assert isinstance(audio_data, AudioData), "``audio_data`` must be audio data"
    assert keyword_entries is None or all(
        isinstance(keyword, (type(""), type(u""))) and 0 <= sensitivity <= 1
        for keyword, sensitivity in keyword_entries
    ), "``keyword_entries`` must be ``None`` or a list of pairs of strings and numbers between 0 and 1"

    # import the PocketSphinx speech recognition module
    try:
        from pocketsphinx import pocketsphinx, Jsgf, FsgModel, get_model_path, get_data_path
    except ImportError:
        raise RequestError("missing PocketSphinx module: ensure that PocketSphinx is set up correctly.")
    except ValueError:
        raise RequestError("bad PocketSphinx installation; try reinstalling PocketSphinx version 0.0.9 or better.")
    if not hasattr(pocketsphinx, "Decoder") or not hasattr(pocketsphinx.Decoder, "default_config"):
        raise RequestError("outdated PocketSphinx installation; ensure you have PocketSphinx version 0.0.9 or better.")

    # create decoder object
    model_path = get_model_path()
    data_path = get_data_path()

    # create a decoder with the bundled US English model
    config = pocketsphinx.Decoder.default_config()
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    config.set_string('-lm', os.path.join(model_path, 'en-us.lm.bin'))
    config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
    config.set_string("-logfn", os.devnull)  # disable logging (logging causes unwanted output in terminal)
    decoder = pocketsphinx.Decoder(config)

    # obtain audio data
    # the included language models require audio to be 16-bit mono 16 kHz in little-endian format
    raw_data = audio_data.get_raw_data(convert_rate=16000, convert_width=2)

    # obtain recognition results
    if keyword_entries is not None:  # explicitly specified set of keywords
        with PortableNamedTemporaryFile("w") as f:
            # generate a keywords file - Sphinx documentation recommends thresholds between 1e-50 and 1e-5
            f.writelines("{} /1e{}/\n".format(keyword, 100 * sensitivity - 110)
                         for keyword, sensitivity in keyword_entries)
            f.flush()

            # perform the speech recognition with the keywords file
            # (this is inside the context manager so the file isn't deleted until we're done)
            decoder.set_kws("keywords", f.name)
            decoder.set_search("keywords")
    elif grammar is not None:  # a path to a FSG or JSGF grammar
        if not os.path.exists(grammar):
            raise ValueError("Grammar '{0}' does not exist.".format(grammar))
        grammar_path = os.path.abspath(os.path.dirname(grammar))
        grammar_name = os.path.splitext(os.path.basename(grammar))[0]
        fsg_path = "{0}/{1}.fsg".format(grammar_path, grammar_name)
        if not os.path.exists(fsg_path):  # create FSG grammar if not available
            jsgf = Jsgf(grammar)
            rule = jsgf.get_rule("{0}.{0}".format(grammar_name))
            fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
            fsg.writefile(fsg_path)
        else:
            fsg = FsgModel(fsg_path, decoder.get_logmath(), 7.5)
        decoder.set_fsg(grammar_name, fsg)
        decoder.set_search(grammar_name)

    decoder.start_utt()  # begin utterance processing
    # process audio data with recognition enabled (no_search = False), as a full utterance (full_utt = True)
    decoder.process_raw(raw_data, False, True)
    decoder.end_utt()  # stop utterance processing

    return decoder
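# The keywords file maps the 0-1 sensitivity onto a Sphinx threshold via
# 1e^(100*s - 110): a sensitivity of 0.8 yields /1e-30/, inside the
# 1e-50..1e-5 band cited above. A hedged usage sketch, assuming this function
# sits alongside the speech_recognition package (AudioData, RequestError, and
# PortableNamedTemporaryFile come from there); 'hello.wav' and the keyword
# ("hey lampi", 0.8) are hypothetical:
import speech_recognition as sr

r = sr.Recognizer()
with sr.AudioFile('hello.wav') as source:
    audio = r.record(source)

decoder = recognize_sphinx(audio, keyword_entries=[("hey lampi", 0.8)])
hyp = decoder.hyp()
print(hyp.hypstr if hyp is not None else "(no keyword detected)")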
def loop(self):
    # loop
    while not rospy.core.is_shutdown():
        # if recording finished
        if self.outbuf is not None:
            # write output file
            print("writing output file")
            outfilename = '/tmp/input.wav'
            f = wave.open(outfilename, 'wb')
            # 1 channel, 4-byte frames at 20 kHz: each frame carries two
            # interleaved 16-bit copies of microphone channel 0
            f.setparams((1, 4, 20000, 0, 'NONE', 'not compressed'))

            print("Starting Reshape")
            x = np.reshape(self.outbuf[:, [0, 0]], (-1))
            print("writing frames")
            print(len(x))
            values = [struct.pack('<h', s) for s in x]
            f.writeframes(b''.join(values))
            print("Closing file")
            f.close()

            model_path = get_model_path()
            data_path = get_data_path()

            config = {
                'hmm': os.path.join(model_path, 'en-us'),               # hidden Markov acoustic model
                'lm': os.path.join(model_path, 'en-us.lm.bin'),         # language model
                'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # pronunciation dictionary
                # 'samprate': 16000
            }

            # earlier conversion attempts, kept for reference:
            # cmd = "ffmpeg -y -i /tmp/output.wav -ar 8000 -af asetrate=16000*" + pitch + ",aresample=16000,atempo=" + tempo + " -ac 1 /tmp/outputConv.wav"
            # cmd = "ffmpeg -y -i /tmp/input.wav -f s32le -acodec pcm_s32le -ar 16000 -ac 1 /tmp/inputConv.wav"
            # cmd = "sox /tmp/input.wav -r 16000 inputConv.wav"
            # cmd = "ffmpeg -i /tmp/input.wav -ar 16000 /tmp/inputConv.wav"

            # resample to the 16 kHz 16-bit mono the bundled models expect
            print("Converting via FFMPEG")
            cmd = ("ffmpeg -y -i /tmp/input.wav -f s16le -acodec pcm_s16le "
                   "-ar 16000 -af 'aresample=20000' -ac 1 /tmp/inputConv.wav -loglevel quiet")
            os.system(cmd)

            print("Decoding Via Pocketsphinx")
            ps = Pocketsphinx(**config)
            ps.decode(audio_file="/tmp/inputConv.wav",  # converted temp file
                      buffer_size=8192,
                      no_search=False,
                      full_utt=False)

            print("Recognized: ")
            print(ps.hypothesis())  # output

            # speech analysis (what to start?)
            if ps.hypothesis() == "hello":
                mml.say("Hello there human")  # change this to whatever
            elif ps.hypothesis().find("how are you") >= 0:
                mml.say("I'm always good")

            print("END")

            # reset state for the next recording
            self.micbuf = np.zeros((0, 4), 'uint16')
            self.outbuf = None
            self.buffer_stuff = 0
            self.playchan = 0
            self.playsamp = 0

        # state
        time.sleep(0.02)
""" Constants, used by the app Attributes: _PACKAGE_LOCATION (str): location of the application WORKING_DIRECTORY (str): location of the working directory within the application SPEECH_MODEL_PATH (str): location of the speech recognition model SPEECH_DATA_PATH (str): location of the speech recognition data, used by the pocketsphinx library """ import os from pocketsphinx import get_data_path, get_model_path _PACKAGE_LOCATION = '/'.join(os.path.realpath(__file__).split('/')[:-2]) WORKING_DIRECTORY = _PACKAGE_LOCATION SPEECH_MODEL_PATH = get_model_path() SPEECH_DATA_PATH = get_data_path() if __name__ == '__main__': print(WORKING_DIRECTORY)