Пример #1
0
class SpeechDetector:
    def __init__(self):
        # Microphone stream config.
        self.CHUNK = 1024  # CHUNKS of bytes to read each time from mic
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000

        self.SILENCE_LIMIT = 1  # Silence limit in seconds. The max ammount of seconds where
        # only silence is recorded. When this time passes the
        # recording finishes and the file is decoded

        self.PREV_AUDIO = 0.5  # Previous audio (in seconds) to prepend. When noise
        # is detected, how much of previously recorded audio is
        # prepended. This helps to prevent chopping the beginning
        # of the phrase.

        self.THRESHOLD = 3500
        self.num_phrases = -1

        self.MODULES = ["light", "time", "alarm", "remind", "song", "article"]
        self.STATES = [
            "tell", "on", "off", "set", "unset", "play", "stop", "use"
        ]
        self.light = Light()
        self.speaker = speaker.Speaker()
        self.song = Song(self.speaker)
        # These will need to be modified according to where the pocketsphinx folder is
        MODELDIR = "en-adapt"

        # Create a decoder with certain model
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(MODELDIR, 'en-us-adapt'))
        config.set_string('-lm', os.path.join(MODELDIR, 'alfred/alfred.lm'))
        config.set_string('-dict', os.path.join(MODELDIR, 'alfred/alfred.dic'))

        # Creaders decoder object for streaming data.
        self.decoder = Decoder(config)

    def setup_mic(self, num_samples=50):
        """ Gets average audio intensity of your mic sound. You can use it to get
            average intensities while you're talking and/or silent. The average
            is the avg of the .2 of the largest intensities recorded.
        """
        print "Getting intensity values from mic."
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)

        values = [
            math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
            for x in range(num_samples)
        ]
        values = sorted(values, reverse=True)
        r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
        print " Finished "
        print " Average audio intensity is ", r
        stream.close()
        p.terminate()

        if r < 5000:
            self.THRESHOLD = 5500
        else:
            self.THRESHOLD = r + 1000

    def save_speech(self, data, p):
        """
        Saves mic data to temporary WAV file. Returns filename of saved
        file
        """
        filename = 'output_' + str(int(time.time()))
        # writes data to WAV file
        data = ''.join(data)
        wf = wave.open(filename + '.wav', 'wb')
        wf.setnchannels(self.CHANNELS)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(
            self.RATE)  # TODO make this value a function parameter?
        wf.writeframes(data)
        wf.close()
        return filename + '.wav'

    def google_decode(self, filename):
        r = sr.Recognizer()

        with sr.AudioFile(filename) as source:
            audio = r.record(source)

        try:
            # for testing purposes, we're just using the default API key
            # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
            # instead of `r.recognize_google(audio)`
            text_from_speech = r.recognize_google(audio)
            print("Google Speech Recognition thinks you said " +
                  text_from_speech)
            return text_from_speech
            #process_text(text_from_speech)
        except sr.UnknownValueError:
            print("Google Speech Recognition could not understand audio")
        except sr.RequestError as e:
            print(
                "Could not request results from Google Speech Recognition service; {0}"
                .format(e))
        return None

    def mic_listen(self):
        """
        Listens to Microphone, extracts phrases from it and calls pocketsphinx
        to decode the sound
        """
        #self.setup_mic()

        #Open stream
        print(1)
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
        print "* Mic set up and listening. "

        audio2send = []
        cur_data = ''  # current chunk of audio data
        rel = self.RATE / self.CHUNK
        slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
        #Prepend audio from 0.5 seconds before noise was detected
        prev_audio = deque(maxlen=self.PREV_AUDIO * rel)
        started = False

        while True:
            cur_data = stream.read(self.CHUNK)
            slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
            if sum([x > self.THRESHOLD for x in slid_win]) > 0:
                if started == False:
                    print "Tell what you want to search"
                    started = True
                audio2send.append(cur_data)

            elif started:
                print "Finished recording, decoding phras"
                filename = self.save_speech(list(prev_audio) + audio2send, p)
                tempp_r = self.decode_phrase(filename)

                if not "search" in time_r:
                    self.speaker.say("say again")
                    continue
                r = self.google_decode(filename)

                os.remove(filename)

                stream.close()
                p.terminate()

                return r

            else:
                prev_audio.append(cur_data)

    def decode_phrase(self, wav_file):
        self.decoder.start_utt()
        stream = open(wav_file, "rb")
        while True:
            buf = stream.read(1024)
            if buf:
                self.decoder.process_raw(buf, False, False)
            else:
                break
        self.decoder.end_utt()
        words = []
        [words.append(seg.word) for seg in self.decoder.seg()]
        return words

    def remove_element(self, obj, array):
        pos = array.index(obj)
        array.pop(pos)

    def clean_transcribed_data(self, data):
        new_data = []
        for tag in data:
            if tag == "<sil>":
                continue

            elif tag == "<s>":
                continue

            elif tag == "[SPEECH]":
                continue

            elif tag == "</s>":
                continue

            elif tag == "[NOISE]":
                continue

            else:
                new_data.insert(len(new_data), tag)

        for tag in new_data:
            start_pos = tag.find("(")

            if start_pos == -1:
                continue

            pos = new_data.index(tag)
            tag = tag[:-3]
            new_data[pos] = tag

        for word in new_data:
            temp = word
            word = word.lower()
            pos = new_data.index(temp)

            new_data[pos] = word

        return new_data

    def load_module(self, tag, data):
        print(tag)
        if tag == "light":
            self.light.check_command(data)
        elif tag == "song":
            self.song.check_command(data)
        elif tag == "article":
            val = wiki.check_command(data, self.speaker)
            if val == True:
                r = self.mic_listen()
                if r == None:
                    self.speaker.say("repeat the query")
                    return
                wiki.handle(r, self.speaker)

    def check_for_modules(self, data):
        modules = self.MODULES

        for tag in data:
            if tag in modules:
                self.load_module(tag, data)

    def key_phrase_checker(self, data):
        data = self.clean_transcribed_data(data)
        print(data)

        if "alfred" in data:
            self.remove_element("alfred", data)
            self.check_for_modules(data)

    def run(self):
        """
        Listens to Microphone, extracts phrases from it and calls pocketsphinx
        to decode the sound
        """
        self.setup_mic()

        #Open stream
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
        print "* Mic set up and listening. "

        audio2send = []
        cur_data = ''  # current chunk of audio data
        rel = self.RATE / self.CHUNK
        slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
        #Prepend audio from 0.5 seconds before noise was detected
        prev_audio = deque(maxlen=self.PREV_AUDIO * rel)
        started = False

        while True:
            cur_data = stream.read(self.CHUNK)
            slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
            if sum([x > self.THRESHOLD for x in slid_win]) > 0:
                if started == False:
                    print "Starting recording of phrase"
                    started = True
                audio2send.append(cur_data)

            elif started:
                print "Finished recording, decoding phrase"
                filename = self.save_speech(list(prev_audio) + audio2send, p)
                r = self.decode_phrase(filename)

                self.key_phrase_checker(r)

                # Removes temp audio file
                os.remove(filename)
                # Reset all
                started = False
                slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
                prev_audio = deque(maxlen=0.5 * rel)
                audio2send = []
                print "Listening ..."

            else:
                prev_audio.append(cur_data)

        print "* Done listening"
        stream.close()
        p.terminate()