def evaluate(self, tokens):
    """
    @see Service.evaluate()
    """
    # Normalise the tokens into words which we can match against
    words = self._words(tokens)

    # The phrasings which introduce a spelling request
    prefices = (('spell', ),
                ('how', 'do', 'you', 'spell'))

    # Remember where the strongest prefix match ended, and how good it was
    best_end = None
    best_score = None
    for prefix in prefices:
        try:
            (start, end, score) = fuzzy_list_range(words, prefix)
        except ValueError:
            # This prefix isn't in the words at all
            continue
        else:
            LOG.debug("%s matches %s with from %d to %d with score %d",
                      prefix, words, start, end, score)

        # Only a match right at the start counts, and a later prefix only
        # displaces an earlier one if it scores strictly higher
        if start != 0:
            continue
        if best_score is None or best_score < score:
            best_end = end
            best_score = score

    # Nothing matched, so this request is not for us
    if best_score is None:
        return None

    # Everything after the prefix is what we have been asked to spell
    return _SpellingHandler(self,
                            tokens,
                            best_score / 100.0,
                            words[best_end:])
def evaluate(self, tokens):
    """
    @see Service.evaluate()

    Looks for a request to define a single word, phrased as a prefix, the
    word itself, and an optional suffix (e.g. "what does <word> mean").
    Returns a C{_DictionaryHandler} for the best-matching phrasing, or
    C{None} if no phrasing matches.
    """
    # Get the words, ready to match with
    words = self._words(tokens)

    # This is how it could be phrased, as (prefix, suffix) pairs
    fixes = ((('define', ), tuple()),
             (('what', 'is', 'the', 'meaning', 'of'), tuple()),
             (('what', 'does'), ('mean', )))

    # The best match so far, and its combined score. The combined score is
    # tracked explicitly; the previous code compared against an undefined
    # name which raised a NameError as soon as a second phrasing matched.
    match = None
    match_score = None
    for (prefix, suffix) in fixes:
        try:
            # Look for the prefix and suffix in the words. An empty prefix
            # or suffix trivially matches at the relevant end of the words.
            if len(prefix) > 0:
                (pre_start, pre_end, pre_score) = \
                    fuzzy_list_range(words, prefix)
            else:
                (pre_start, pre_end, pre_score) = (0, 0, 100)
            if len(suffix) > 0:
                (suf_start, suf_end, suf_score) = \
                    fuzzy_list_range(words, suffix)
            else:
                (suf_start, suf_end, suf_score) = \
                    (len(words), len(words), 100)
        except ValueError:
            # One of the parts was not found, try the next phrasing
            continue

        LOG.debug(
            "%s matches %s with from %d to %d with score %d, "
            "and %s matches from %d to %d with score %d",
            prefix, words, pre_start, pre_end, pre_score,
            suffix, suf_start, suf_end, suf_score)

        # Rank candidates by the same combination used for the belief below
        score = sqrt(pre_score * pre_score + suf_score * suf_score)

        # We expect there to be only one word in the middle of the prefix
        # and suffix when we match
        if (pre_start == 0 and
                pre_end + 1 == suf_start and
                suf_end == len(words) and
                (match is None or match_score < score)):
            match = (pre_start, pre_end, pre_score,
                     suf_start, suf_end, suf_score)
            match_score = score

    # Did we get anything?
    if match is None:
        # Nope, we got nothing
        return None

    # Pull back the values which we need
    (_, pre_end, pre_score, _, _, suf_score) = match

    # The belief is the geometric distance of the scores
    belief = sqrt(pre_score * pre_score + suf_score * suf_score) / 100.0

    # The word is the one at pre_end (since it's non-inclusive)
    word = words[pre_end]

    # And give back the handler
    return _DictionaryHandler(self, tokens, belief, word, self._limit)
def _build_from_dirname(self, dirname):
    '''
    Populate the index from every audio file found under a directory.

    @type  dirname: str
    @param dirname:
        The root directory to walk when building the index.
    '''
    for (subdir, _, files) in os.walk(dirname):
        LOG.info("Indexing %s", subdir)

        for filename in files:
            try:
                # Let mutagen work out what sort of file this is
                path = os.path.join(subdir, filename)
                info = mutagen.File(path)

                # Turn the formats which we understand into entries, and
                # skip anything else
                if isinstance(info, mutagen.mp3.MP3):
                    entry = AudioEntry.from_mp3(info)
                elif isinstance(info, mutagen.flac.FLAC):
                    entry = AudioEntry.from_flac(info)
                else:
                    LOG.debug("Ignoring %s", path)
                    continue

                self._add_entry(entry)

            except Exception as error:
                # One bad file should not stop the rest being indexed
                LOG.warning("Failed to index %s: %s", path, error)
def _decode(self):
    """
    @see AudioInput._decode()
    """
    # Pull in whatever the recognizer still has, then reset it so that it
    # is clean for the next utterance
    self._add_result(self._recognizer.FinalResult())
    self._recognizer.Reset()

    # Turn each usable result into a token. A result needs both a non-empty
    # word and a non-zero confidence to be kept.
    LOG.debug("Decoding: %s" % self._results)
    tokens = []
    for entry in self._results:
        text = entry.get('word', '').strip()
        confidence = entry.get('conf', 0.0)
        if not text or not confidence:
            continue
        tokens.append(Token(text, confidence, True))

    # The accumulated results have been consumed now
    self._results = []

    # Hand back what we decoded
    LOG.debug("Got: %s" % ' '.join(str(i) for i in tokens))
    return tokens
def run(self):
    """
    The main worker.
    """
    LOG.info("Starting the system")
    self._start()

    LOG.info("Entering main loop")
    while self._running:
        try:
            # Move any timer events which have come due onto the main event
            # queue, earliest first.
            LOG.debug("Timer event queue length is %d",
                      len(self._timer_events))
            while (len(self._timer_events) > 0 and
                   self._timer_events[0].schedule_time <= time.time()):
                self._events.put(heapq.heappop(self._timer_events))

            # Drain the event queue. An event may hand back a follow-up
            # event, which is queued in turn.
            while not self._events.empty():
                event = self._events.get()
                try:
                    followup = event.invoke()
                    if followup is not None:
                        self._events.put(followup)
                except Exception as error:
                    LOG.error("Event %s raised exception: %s", event, error)

            # Poll each of the inputs; a read gives back None when there is
            # nothing available.
            for source in self._inputs:
                tokens = source.read()
                if tokens is None:
                    continue

                # We read something so try to handle it, and pass any
                # result back to the user
                LOG.info("Read from %s: %s" %
                         (source, [str(t) for t in tokens]))
                response = self._handle(tokens)
                if response is not None:
                    self._respond(response)

            # Pause briefly before the next pass
            time.sleep(0.1)

        except KeyboardInterrupt:
            LOG.warning("KeyboardInterrupt received")
            break

    # The main loop has finished so shut everything down
    LOG.info("Stopping the system")
    self._stop()
def evaluate(self, tokens):
    """
    @see Service.evaluate()
    """
    # The words to match against
    words = self._words(tokens)

    # The kinds of question which we answer
    prefices = (('what', 'is', 'a'),
                ('what', 'is', 'the'),
                ('what', 'is'),
                ('who', 'is', 'the'),
                ('who', 'is'))

    # Find the strongest prefix match which sits at the very start
    match = None
    for prefix in prefices:
        try:
            (start, end, score) = fuzzy_list_range(words, prefix)
            LOG.debug("%s matches %s with from %d to %d with score %d",
                      prefix, words, start, end, score)
        except ValueError:
            continue
        if start != 0:
            continue
        if match is None or match[2] < score:
            match = (start, end, score)

    # Without a prefix match this doesn't look like a query for us
    if not match:
        return None

    # Whatever follows the prefix is the thing being asked about
    thing = ' '.join(words[match[1]:]).strip().lower()

    # Ask Wikipedia what it knows about, and keep the closest title
    best = None
    try:
        self._notify(Notifier.ACTIVE)
        for result in wikipedia.search(thing):
            if result is None or len(result) == 0:
                continue
            score = fuzz.ratio(thing, result.lower())
            LOG.debug("'%s' matches '%s' with a score of %d",
                      result, thing, score)
            if best is None or best[1] < score:
                best = (result, score)
    except Exception as error:
        LOG.error("Failed to query Wikipedia for '%s': %s" % (thing, error))
    finally:
        self._notify(Notifier.IDLE)

    # No usable search result means that we can't help
    if best is None:
        return None

    (title, title_score) = best
    return _Handler(self, tokens, title_score / 100, title)
def evaluate(self, tokens):
    """
    @see Service.evaluate()
    """
    words = self._words(tokens)

    # Try every prefix in front of every handler's phrase; the first phrase
    # which covers the start of the words wins.
    for (what, handler) in self._HANDLERS:
        for prefix in self._PREFICES:
            phrase = prefix + what
            try:
                (start, end, _) = fuzzy_list_range(words, phrase)
                matched = (start == 0 and end == len(phrase))
                if matched:
                    return handler(self, tokens)
            except Exception as error:
                LOG.debug("Failed to handle '%s': %s" %
                          (' '.join(words), error))

    # None of the phrases matched
    return None
def _add_result(self, json_result):
    """
    Accumulate the contents of a JSON result string into our results.
    """
    parsed = json.loads(json_result)
    LOG.debug("Got %s" % json_result)

    # A full result is preferred, take its entries as they are
    if 'result' in parsed:
        self._results.extend(parsed['result'])
        return

    # Without plain text either there is nothing to add
    if 'text' not in parsed:
        return

    # Only text was given, so make an entry for each of its words
    self._results.extend(
        {'word': word, 'conf': 1.0}
        for word in parsed['text'].split()
        if word
    )
def _get_data(self):
    """
    @see Handler.handle()
    """
    # The data is cached on disk since hammering PurpleAir is unfriendly
    # and also results in getting back no data.
    sensor_id = self.service.get_sensor_id()
    filename = '/tmp/dexter_purpleair_%s' % (sensor_id, )
    now = time.time()

    # Use the cached copy if it is less than a minute old
    content = None
    try:
        age = now - os.stat(filename).st_ctime
        if age < 60:
            with open(filename, 'rb') as fh:
                content = fh.read()
    except IOError:
        pass

    # No usable cached copy, so fetch a fresh one and try to cache it
    if not content:
        url = "https://www.purpleair.com/json?show=%d" % (sensor_id,)
        client = httplib2.Http()
        resp, content = client.request(
            url,
            "GET",
            headers={'content-type':'text/plain'}
        )
        try:
            with open(filename, 'wb') as fh:
                fh.write(content)
        except IOError:
            pass

    # Parse whatever we ended up with
    raw = json.loads(content)

    # The first entry in the "results" section is the one which we want
    if 'results' not in raw or len(raw['results']) == 0:
        return {}

    LOG.debug("Got: %s", (raw['results'][0], ))
    return raw['results'][0]
def _handler(self):
    """
    Consume items from the decoder queue and act on each one. This is run
    in its own thread.
    """
    LOG.info("Started decoding handler")

    while True:
        try:
            # The queue is replaced by None when we are finished
            pending = self._decode_queue
            if pending is None:
                break

            if len(pending) > 0:
                item = pending.popleft()
                if isinstance(item, bytes):
                    # Raw audio for the decoder
                    LOG.debug("Feeding %d bytes" % len(item))
                    self._feed_raw(item)
                elif item is None:
                    # None marks the end of the audio, so decode what has
                    # been fed in so far
                    LOG.info("Decoding audio")
                    self._notify(Notifier.WORKING)
                    self._output.append(self._decode())
                    self._notify(Notifier.IDLE)
                else:
                    LOG.warning("Ignoring junk on decode queue: %r" % (item,))

                # There may be more waiting so look again right away
                continue

        except Exception as error:
            # Log it and carry on
            LOG.error("Got an error in the decoder queue: %s" % (error,))

        # Nothing was processed, so pause rather than spin
        time.sleep(0.001)

    LOG.info("Stopped decoding handler")
def evaluate(self, tokens):
    """
    @see Service.evaluate()

    Recognises requests to toss a coin, roll a six-sided die, or to pick a
    number in a spoken range ("give me a number between X and Y"). Returns
    the corresponding handler, or C{None} if the request is not for us.
    """
    # The incoming request
    words = self._words(tokens)

    # Binary random number. The phrases are split into words since
    # fuzzy_list_range() expects a sequence of words; a plain string would
    # be iterated over character by character.
    for phrase in ("toss a coin", "flip a coin"):
        try:
            fuzzy_list_range(words, phrase.split())
            return _CoinTossHandler(self, tokens)
        except ValueError:
            pass

    # A regular die
    for phrase in ("roll a die", "roll a dice"):
        try:
            fuzzy_list_range(words, phrase.split())
            return _DiceHandler(self, tokens, 6)
        except ValueError:
            pass

    # A generic request
    try:
        prefix = ('give', 'me', 'a', 'number', 'between')
        (_, offset, _) = fuzzy_list_range(words, prefix)

        # We need at least "<start> and <end>" after the prefix
        if len(words) >= offset + 3:
            # Only look for the 'and' after the prefix, so that one which
            # appears earlier in the request can't confuse the split
            and_index = words.index('and', offset)

            # parse_number() works on strings, so join the words up
            start = parse_number(' '.join(words[offset:and_index]))
            end = parse_number(' '.join(words[and_index + 1:]))
            if start is not None and end is not None:
                return _RangeHandler(self, tokens, start, end)
    except Exception as e:
        # Report the request itself, not a stale phrase from the loops above
        LOG.debug("Failed to handle '%s': %s" % (' '.join(words), e))

    # Not for us
    return None
def _feed_raw(self, data):
    """
    @see AudioInput._feed_raw()

    Sends the given chunk of data to the remote side, opening the
    connection (and sending the header) first if there is not one already.
    Each chunk is sent as a network-order 8-byte length followed by the
    data. Any failure is logged and the connection is dropped so that the
    next call starts afresh; nothing is raised to the caller.
    """
    # Handle funny inputs
    if data is None or len(data) == 0:
        return

    # Don't let exceptions kill the thread
    try:
        # Connect?
        if self._sckt is None:
            # Connect and send the header information
            LOG.info("Opening connection to %s:%d" % (
                self._host,
                self._port,
            ))
            self._sckt = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self._sckt.connect((self._host, self._port))
            self._sckt.sendall(self._header)

        # Send off the chunk
        LOG.debug("Sending %d bytes of data to %s" %
                  (len(data), self._host))
        self._sckt.sendall(struct.pack('!q', len(data)))
        self._sckt.sendall(data)

    except Exception as e:
        # Don't kill the thread by throwing an exception, just grumble
        LOG.info("Failed to send to remote side: %s" % e)

        # Forget the socket first so that the next call reconnects
        sckt = self._sckt
        self._sckt = None

        if sckt is not None:
            # shutdown() fails on a socket which never connected, but we
            # must still close() it or else the descriptor is leaked
            try:
                sckt.shutdown(socket.SHUT_RDWR)
            except Exception:
                pass
            try:
                sckt.close()
            except Exception:
                pass
def _match_artist(self, artist):
    """
    @see MusicService._match_artist()
    """
    wanted = ' '.join(artist).lower()
    LOG.debug("Matching artist '%s'", wanted)

    # Ask Spotify, and give up if the reply has no artist items in it
    result = self._spotify.search(wanted, type='artist')
    if 'artists' not in result or 'items' not in result['artists']:
        return False

    candidates = result['artists']['items']
    LOG.debug("Checking %d results", len(candidates))

    # The first candidate whose name is close enough is a match
    for candidate in candidates:
        candidate_name = candidate.get('name', '').lower()
        LOG.debug("Matching against '%s'", candidate_name)
        if fuzz.ratio(candidate_name, wanted) > 80:
            return True

    return False
def evaluate(self, tokens):
    """
    @see Service.evaluate()
    """
    # What was said
    words = self._words(tokens)

    # Try each configured phrase in turn; the first acceptable one wins
    for (phrase, reply, is_prefix) in self._phrases:
        try:
            LOG.debug("Looking for %s in %s", phrase, words)
            (start, end, score) = fuzzy_list_range(words, phrase)
            LOG.debug("Matched [%d:%d] and score %d", start, end, score)

            # The match has to begin at the first word
            if start != 0:
                continue

            # For a prefix phrase the match must also end at the phrase's
            # own length
            if is_prefix and end != len(phrase):
                continue

            return _BespokeHandler(self, tokens, reply)

        except ValueError as error:
            LOG.debug("No match: %s", error)

    # None of the phrases matched
    return None
def _get_handler_for(self,
                     tokens,
                     platform_match,
                     genre,
                     artist,
                     song_or_album):
    """
    @see MusicService._get_handler_for()

    Searches Spotify for the given name as a track, then as an album and,
    if no artist was given and nothing was found, as an artist. Returns a
    C{_SpotifyServicePlayHandler} for the URIs of the best match, or
    C{None} if there was no name or nothing suitable was found.
    """
    # Do nothing if we have no name
    if song_or_album is None or len(song_or_album) == 0:
        return None

    # Normalise to strings
    name = ' '.join(song_or_album).lower()
    if artist is None or len(artist) == 0:
        artist = None
    else:
        artist = ' '.join(artist).lower()

    # We will put all the track URIs in here, along with the description
    # and score of what they are
    uris = []
    what = None
    score = 0.0

    # Search by track name then album name, these are essentially the same
    # logic
    for which in ('track', 'album'):
        LOG.info("Looking for '%s'%s as a %s",
                 name,
                 " by '%s'" % artist if artist else '',
                 which)

        # This is the key in the results
        plural = which + 's'

        # Try using the song_or_album as the name
        result = self._spotify.search(name, type=which)
        if not result:
            LOG.info("No results")
            continue

        # Did we get back any entries of the right type?
        if plural not in result:
            LOG.error("%s was not in result keys: %s",
                      plural, result.keys())
            continue

        # We got some results back, let's assign scores to them all
        matches = []
        for item in result[plural].get('items', []):
            # It must have a usable URI, else we can't play it. (The old
            # test here raised a KeyError for an item with no URI.)
            if not item.get('uri'):
                LOG.error("No URI in %s", item)
                continue

            # And it needs a name for us to match against
            if 'name' not in item:
                continue

            # See how well the name matches
            name_score = fuzz.ratio(name, item['name'].lower())
            LOG.debug("'%s' matches '%s' with score %d",
                      item['name'], name, name_score)

            # Check to make sure that we have an artist match as well
            if artist is None:
                # Treat as a wildcard
                artist_score = 100
            else:
                artist_score = 0
                for entry in item.get('artists', []):
                    entry_score = fuzz.ratio(
                        artist, entry.get('name', '').lower())
                    LOG.debug("Artist match score for '%s' was %d",
                              entry.get('name', ''), entry_score)
                    if entry_score > artist_score:
                        artist_score = entry_score
            LOG.debug("Artist match score was %d", artist_score)

            # Only consider cases where the scores look "good enough"
            if name_score > 75 and artist_score > 75:
                LOG.debug("Adding match")
                matches.append((item, name_score, artist_score))

        # Anything?
        if len(matches) == 0:
            continue
        LOG.debug("Got %d matches", len(matches))

        # Pick the highest scoring one. (Sorting ascending and taking the
        # first entry, as was done before, picked the lowest.)
        best = max(matches, key=lambda e: (e[1], e[2]))
        item = best[0]
        LOG.debug("Best match was: %s", item)

        # Extract the info
        item_name = item.get('name', None) or name
        artists = item.get('artists', [])
        artist_name = (artists[0].get('name', None)
                       if len(artists) > 0 else None) or artist

        # Description of what we are playing
        what = item_name if item_name else name
        if artist_name:
            what += " by " + artist_name
        what += " on Spotify"

        # The score is the geometric value of the two
        score = sqrt(best[1] * best[1] + best[2] * best[2]) / 100.0

        # Items with no URI were dropped above so this is always here
        uri = item['uri']

        # If we are an album then grab the track URIs
        if which == 'album':
            tracks = self._spotify.album_tracks(uri)
            if tracks and 'items' in tracks:
                uris = [track['uri'] for track in tracks['items']]
        else:
            # Just the track
            uris = [uri]

        # And we're done
        break

    # Otherwise assume that it's an artist
    if len(uris) == 0 and artist is None:
        LOG.info("Looking for '%s' as an artist", name)
        result = self._spotify.search(name, type='artist')
        LOG.debug("Got: %s", result)

        if result and 'artists' in result and 'items' in result['artists']:
            items = sorted(result['artists']['items'],
                           key=lambda entry: fuzz.ratio(
                               name, entry.get('name', '').lower()),
                           reverse=True)

            # Look at the best one, if any
            LOG.debug("Got %d matches", len(items))
            if len(items) > 0:
                match = items[0]
                who = match.get('name') or name
                what = "%s on Spotify" % (who, )

                # Scale to match the track and album scores above, which
                # are divided through by 100
                score = fuzz.ratio(who.lower(), name) / 100.0

                # Find all their albums
                if 'uri' in match:
                    LOG.debug("Got match: %s", match['uri'])
                    artist_albums = \
                        self._spotify.artist_albums(match['uri']) or {}
                    for album in artist_albums.get('items', []):
                        # Append all the tracks
                        LOG.debug("Looking at album: %s", album)
                        if 'uri' not in album:
                            continue
                        tracks = self._spotify.album_tracks(album['uri'])
                        if tracks and 'items' in tracks:
                            LOG.debug(
                                "Adding tracks: %s",
                                ' '.join(track['name']
                                         for track in tracks['items']))
                            uris.extend([
                                track['uri'] for track in tracks['items']
                            ])

    # And now we can give it back, if we had something
    if len(uris) > 0:
        return _SpotifyServicePlayHandler(self, tokens, what, uris, score)
    else:
        # We got nothing
        return None
def _run(self):
    '''
    Reads from the audio input stream and hands it off to be processed.

    Loops while C{self.is_running} is true, reading chunks from a PyAudio
    input stream and tracking their levels in a rolling window. A step up
    in level is taken as the start of speech and a step down (or a timeout)
    as the end; the audio captured in between is decoded with
    C{self._decode_raw()} and the resulting tokens are appended to
    C{self._output}.
    '''
    # This possibly takes a while so tell the system what we're doing.
    self._notify(Notifier.INIT)

    # The number of read calls which we expect per second. This corresponds
    # to how many entries in a buffer constitute a second's worth of data.
    read_rate = self._rate / self._chunk_size

    # The buffers of historical audio data and of its levels; these hold
    # 1.0 and 2.0 times read_rate entries respectively
    audio_buf = deque(maxlen=int(1.0 * read_rate))
    level_buf = deque(maxlen=int(2.0 * read_rate))

    # The index at which we cut the level buffer for the purposes of looking
    # for a change in the audio going from background to noisy, or vice
    # versa. This is what we are looking for when detecting speech. It is a
    # third of the level buffer's length.
    avg_idx = level_buf.maxlen // 3

    # Start pulling in the audio stream
    p = pyaudio.PyAudio()
    stream = p.open(format=self._format,
                    channels=self._channels,
                    rate=self._rate,
                    input=True,
                    frames_per_buffer=self._chunk_size)

    # State etc.
    talking = None  # True when we have detect talking
    speech = None  # What we will process as speech data
    min_secs = 4  # Talking is not deemed to have ended before this
    max_secs = 10  # Talking is forced to end after this
    last_log = 0  # When the levels were last logged at INFO

    # Keep listening until we are stopped
    while self.is_running:
        # We'll need this here and there below
        now = time.time()

        # Read in the next lump of data and get its level, which is the
        # square root of the absolute average sample value
        chunk = stream.read(self._chunk_size, exception_on_overflow=False)
        level = numpy.sqrt(abs(audioop.avg(chunk, self._width)))

        # Accumulate into our buffers
        audio_buf.append(chunk)
        level_buf.append(level)

        # If we have not yet filled up the level buffer then we're done
        # here. Any analysis etc. will be inaccurate.
        if len(level_buf) != level_buf.maxlen:
            continue

        # Get the averaging window as a numpy array so that we can cut it
        # and so forth
        levels = numpy.array(level_buf)

        # The first time that the level buffer is full we note that we have
        # become actively listening, and set up the talking state.
        if talking is None:
            LOG.info("Listening")
            self._notify(Notifier.IDLE)
            talking = False
            talking_start = 0

        # If the system itself is speaking while we think that someone is
        # talking then drop the capture, since otherwise we will likely
        # hear ourselves.
        if self._state.is_speaking() and talking:
            LOG.info("Ignoring talking since audio is being output")
            talking = False
            speech = None
            continue

        # We are looking for the background levels. If we think that someone
        # is talking then the background sound going to be at the end of the
        # levels, else it will be at the start.

        # Different detection based on what we are looking for
        if not talking:
            # Looking for a step up in the latter part. Here the window is
            # cut avg_idx entries from its end.
            from_levels = levels[:-avg_idx]  # All but the last avg_idx
            to_levels = levels[-avg_idx:]  # The last avg_idx
            from_pctl = numpy.sort(from_levels)[int(
                len(from_levels) * 0.5)]
            to_pctl = numpy.sort(to_levels)[int(len(to_levels) * 0.6)]
            LOG.debug("Levels are from=%0.2f to=%0.2f", from_pctl, to_pctl)
            if from_pctl * 1.5 < to_pctl:
                LOG.info("Detected start of speech "
                         "with levels going from %0.2f to %0.2f" %
                         (from_pctl, to_pctl))
                talking = True
                talking_start = now

                # Remember the level before the step up, it is used below
                # when looking for the end of the speech
                start_pctl = from_pctl
        else:
            # Looking for a step down in the latter part. Here the window
            # is cut avg_idx entries from its start.
            from_levels = levels[:avg_idx]  # From start to avg_idx
            to_levels = levels[avg_idx:]  # From avg_idx to end
            from_pctl = numpy.sort(from_levels)[int(
                len(from_levels) * 0.5)]
            to_pctl = numpy.sort(to_levels)[int(len(to_levels) * 0.5)]

            # Log at INFO at most every 0.2s, and at DEBUG otherwise
            if now - last_log > 0.2:
                LOG.info("Levels are from=%0.2f to=%0.2f",
                         from_pctl, to_pctl)
                last_log = now
            else:
                LOG.debug("Levels are from=%0.2f to=%0.2f",
                          from_pctl, to_pctl)

            # Once min_secs have passed the speech ends when either the
            # level steps down or it drops back close to where it started
            if (now - talking_start > min_secs and
                    (from_pctl > to_pctl * 1.25 or
                     to_pctl < start_pctl * 1.1)):
                LOG.info("Detected end of speech "
                         "with levels going from %0.2f to %0.2f" %
                         (from_pctl, to_pctl))
                talking = False

        # If the talking has been going on too long then just stop it. Quite
        # possibly the capture was fooled.
        if talking and now - talking_start > max_secs:
            LOG.info("Talking lasted over %ds; pushing to False" % max_secs)
            talking = False

        # Different behaviour depending on whether we think someone is
        # talking or not
        if talking:
            # If we don't yet have any audio then we're starting the
            # recording
            if speech is None:
                # Move the rolling window of recording to be the start of
                # the audio
                LOG.info("Starting recording")
                self._notify(Notifier.ACTIVE)
                speech = list(audio_buf)

            # Add on what we just recorded.
            # NOTE(review): chunk was already appended to audio_buf above,
            # so it is in speech twice when the recording has only just
            # started -- confirm whether this is intended.
            speech.append(chunk)

        elif speech is not None:
            # There's no talking but there is recorded audio. That means
            # someone just stopped talking.
            LOG.info("Finished recording")

            # Turn the audio data into text (hopefully!)
            self._notify(Notifier.WORKING)
            start = time.time()

            # Turn the list of chunks into a single bytes value and junk
            # the speech buffer
            audio = b''.join(speech)
            speech = None

            # Maybe save then as a wav file
            self._save_bytes(audio)

            # Now decode
            LOG.info("Decoding %0.2fs seconds of audio" %
                     (len(audio) / self._width / self._rate))
            tokens = self._decode_raw(audio)
            LOG.info("Decoded audio in %0.2fs: %s" %
                     (time.time() - start, ([str(x) for x in tokens])))

            # Add then to the output
            self._output.append(tokens)

            # Flush anything accumulated while we were parsing the phrase,
            # so that we don't fall behind
            available = stream.get_read_available()
            while (available > self._chunk_size):
                LOG.debug("Junking backlog of %d", available)
                stream.read(available)
                available = stream.get_read_available()

            # Clear out the level buffer so that it can settle again
            level_buf.clear()

            # And we're back to listening
            LOG.info("Listening")
            self._notify(Notifier.IDLE)

    # If we got here then is_running is no longer true and we're done
    LOG.info("Done listening")
    stream.close()
    p.terminate()
def _run(self):
    """
    Reads from the audio input stream and hands it off to be processed.

    Loops while C{self.is_running} is true, reading chunks from a PyAudio
    input stream and tracking their levels in a rolling window. A step up
    in level is taken as the start of speech and a step down (or a timeout)
    as the end. While speech is detected the chunks are appended to
    C{self._decode_queue}, and a C{None} is appended to mark the end of the
    speech. The queue is set to C{None} when the loop finishes.
    """
    # This possibly takes a while so tell the system what we're doing.
    self._notify(Notifier.INIT)

    # The number of read calls which we expect per second. This corresponds
    # to how many entries in a buffer constitute a second's worth of data.
    read_rate = self._rate / self._chunk_size

    # The buffers of historical audio data and of its levels; these hold
    # 1.0 and 2.0 times read_rate entries respectively
    audio_buf = deque(maxlen=int(1.0 * read_rate))
    level_buf = deque(maxlen=int(2.0 * read_rate))

    # The index at which we cut the level buffer for the purposes of looking
    # for a change in the audio going from background to noisy, or vice
    # versa. This is what we are looking for when detecting speech. It is a
    # third of the level buffer's length.
    avg_idx = level_buf.maxlen // 3

    # Start pulling in the audio stream
    p = pyaudio.PyAudio()
    stream = p.open(format=self._format,
                    channels=self._channels,
                    rate=self._rate,
                    input=True,
                    frames_per_buffer=self._chunk_size)

    # State
    talking = None  # True when we have detect talking
    speech = None  # What we will process as speech data

    # Limits on recording
    min_secs = 2  # <-- Enough for the key-phrase only
    max_secs = 10  # <-- Plenty?

    # Init is done, we start off idle
    self._notify(Notifier.IDLE)

    # Keep listening until we are stopped
    while self.is_running:
        # We'll need this here and there below
        now = time.time()

        # Read in the next lump of data and get its volume. It looks like
        # rms() is the best measure of this but I could be wrong.
        chunk = stream.read(self._chunk_size, exception_on_overflow=False)
        level = abs(audioop.rms(chunk, self._width))

        # Accumulate into our buffers
        audio_buf.append(chunk)
        level_buf.append(level)

        # If we have not yet filled up the level buffer then we're done
        # here. Any analysis etc. will be inaccurate.
        if len(level_buf) != level_buf.maxlen:
            continue

        # Get the averaging window as a numpy array so that we can cut it
        # and so forth
        levels = numpy.array(level_buf)

        # The first time that the level buffer is full we note that we have
        # become actively listening, and set up the talking state.
        if talking is None:
            LOG.info("Listening")
            self._notify(Notifier.IDLE)
            talking = False
            talking_start = 0

        # We are looking for the background levels. If we think that someone
        # is talking then the background sound going to be at the end of the
        # levels, else it will be at the start.

        # Get the median level either side of the cut, which is made
        # avg_idx entries from the end of the window
        from_levels = levels[:-avg_idx]  # All but the last avg_idx
        to_levels = levels[-avg_idx:]  # The last avg_idx
        from_median = numpy.sort(from_levels)[int(len(from_levels) * 0.5)]
        to_median = numpy.sort(to_levels)[int(len(to_levels) * 0.5)]
        LOG.debug("Levels are from=%0.2f to=%0.2f", from_median, to_median)

        # Different detection based on what we are looking for
        if not talking:
            # Looking for a step up in the latter part
            if from_median * 1.5 < to_median:
                LOG.info("Detected start of speech "
                         "with levels going from %0.2f to %0.2f" %
                         (from_median, to_median))
                talking = True
                talking_start = now

                # Remember the level before the step up, it is used below
                # when looking for the end of the speech
                start_median = from_median
        else:
            # Looking for a step down in the latter part. Once min_secs
            # have passed the speech ends when either the level steps down
            # or it drops back close to where it started.
            if (now - talking_start > min_secs and
                    (from_median > to_median * 1.25 or
                     to_median < start_median * 1.1)):
                LOG.info("Detected end of speech "
                         "with levels going from %0.2f to %0.2f" %
                         (from_median, to_median))
                talking = False

        # If the talking has been going on too long then just stop it. Quite
        # possibly the capture was fooled.
        if talking and now - talking_start > max_secs:
            LOG.info("Talking lasted over %ds; pushing to False" % max_secs)
            talking = False

        # Different behaviour depending on whether we think someone is
        # talking or not
        if talking:
            # If we don't yet have any audio then we're starting the
            # recording
            if speech is None:
                # Move the rolling window of recording to be the start of
                # the audio
                LOG.info("Starting recording")
                self._notify(Notifier.ACTIVE)
                speech = []

                # Push in everything that we have so far
                for prev in audio_buf:
                    speech.append(prev)
                    self._decode_queue.append(prev)

            # Add on what we just recorded.
            # NOTE(review): chunk was already appended to audio_buf above,
            # so it is pushed twice when the recording has only just
            # started -- confirm whether this is intended.
            speech.append(chunk)
            self._decode_queue.append(chunk)

        elif speech is not None:
            # There's no talking but there is recorded audio. That means
            # someone just stopped talking.
            LOG.info("Finished recording")

            # Turn the list of chunks into a single bytes value and junk
            # the speech buffer
            audio = b''.join(speech)
            speech = None

            # Maybe save then as a wav file
            self._save_bytes(audio)

            # Now decode. We do this by denoting the end of the audio with a
            # None.
            self._decode_queue.append(None)

            # Clear out the level buffer so that it can settle again
            level_buf.clear()

            # And we're back to listening
            LOG.info("Listening")

    # If we got here then is_running is no longer true and we're done.
    # The decode queue is replaced with None as part of finishing.
    LOG.info("Done listening")
    self._decode_queue = None
    stream.close()
    p.terminate()
def fuzzy_list_range(list_,
                     sublist,
                     start=0,
                     threshold=80,
                     homonize_words=True):
    """
    Find the slice range of a sublist of strings within a list, using fuzzy
    matching.

    Every slice of the list from C{start} onwards is compared with the
    sublist and the first of the best-scoring ones is given back.

    :type  list_: list<str> or tuple<str>
    :param list_:
        The list to look in.
    :type  sublist: list<str> or tuple<str>
    :param sublist:
        The list to look for.
    :type  start: int
    :param start:
        Where to start looking in the C{list}.
    :type  threshold: int
    :param threshold:
        The fuzzy matching percentage threshold which the sublist must
        match with.
    :type  homonize_words: bool
    :param homonize_words:
        Whether to homonize the words before fuzzing.

    :rtype: tuple
    :return:
        A tuple of C{start, end, score} where start and end are a
        half-inclusive slice and score is the matching score.

    :raise ValueError:
        If either list is C{None}, if the sublist is empty, or if no slice
        matches the sublist with at least the threshold score.

    >>> fuzzy_list_range('whot is a fash'.split(' '), 'a fish'.split(' '))
    (2, 4, 83)
    >>> fuzzy_list_range(['what', 'is', 'a', 'fish'], ('whit', 'is'))
    (0, 2, 86)
    >>> fuzzy_list_range(['format', 'c', 'colon'], ('format', 'sea', 'colon'))
    (0, 3, 100)
    """
    # Sanity
    if list_ is None:
        raise ValueError("list was None")
    if sublist is None:
        raise ValueError("sublist was None")

    # The empty list can't be in anything
    if len(sublist) == 0:
        raise ValueError("Empty sublist not in list")

    # Say what we got before normalisation occurs
    LOG.debug("Given '%s' to look for in '%s'",
              ' '.join(sublist), ' '.join(list_[start:]))

    # Since we're doing fuzzy matching let's make these into words
    def as_word(entry):
        """
        Perform normalisation on the given word.
        """
        try:
            # Spell out anything which looks like a number
            value = float(entry)
            if value == int(value):
                value = number_to_words(int(value))
            else:
                value = number_to_words(value)
        except Exception:
            # Not a number (or not one which we can spell out) so treat it
            # as a plain word
            value = to_alphanumeric(entry.lower())
        if homonize_words:
            value = homonize(value)
        return value

    # Turn the given lists into words
    subwords = tuple(as_word(e) for e in sublist)
    words = tuple(as_word(e) for e in list_)
    LOG.debug("Looking for '%s' in '%s'",
              ' '.join(subwords), ' '.join(words[start:]))

    # Look for the first of the best-matching slices. This is going to be
    # O(n^2) I'm afraid.
    #
    # There used to be a special case here for a list of length one. It
    # gave back a bare index rather than a tuple for an exact match, it
    # ignored the start value, and it only looked at the first word of the
    # sublist. The general search below handles that case correctly.
    best = None
    query = ' '.join(subwords)
    for s in range(start, len(words)):
        for e in range(s + 1, len(words) + 1):
            phrase = ' '.join(words[s:e])
            score = fuzz.ratio(query, phrase)
            LOG.debug("Checking '%s' in [%d:%d] '%s' gives %d",
                      query, s, e, phrase, score)
            if score >= threshold and (best is None or best[2] < score):
                best = (s, e, score)

                # A perfect match can't be beaten by a later one since
                # only a strictly better score replaces the best
                if score == 100:
                    return best

    # Did we get anything?
    if best is None:
        raise ValueError("'%s' not found in %s'" % (sublist, list_))
    return best
def _pick(self):
    """
    Choose a random fortune. This is the meat of this class.

    Fortune files are those under C{self._dir} which have an associated
    C{.dat} file alongside them. Within a file the fortunes are separated
    by lines containing only a C{%}.

    @rtype: str
    @return:
        The text of a fortune which is no longer than C{self._max_len}, or
        C{None} if there were no fortune files or if we failed to find a
        suitable fortune after a number of attempts.
    """
    # We do this all from scratch each time since it's not _that_ expensive
    # and it means we don't have to restart anything when new files are
    # added. We have a list of filenames and the start and end of their data
    # as part of the total count.
    #
    # We are effectively concatenating the files here so as to avoid
    # bias. Consider: if you have two files, with one twice the size of the
    # other, if we picked a random fortune from a random file then the
    # fortunes in the smaller file would be twice as likely to come up as
    # ones in the bigger one.
    file_info = []
    total_size = 0
    for (subdir, _, files) in os.walk(self._dir, followlinks=True):
        for filename in files:
            # The fortune files have an associated .dat file, this means we
            # can identify them by looking for that .dat file.
            path = os.path.join(subdir, filename)
            dat_path = path + '.dat'
            LOG.debug("Candidate: %s %s", path, dat_path)
            if not os.path.exists(dat_path):
                continue

            # Open it to make sure can do so
            try:
                with open(path, 'rt'):
                    # Get the file length to use it to accumulate into our
                    # running counter.
                    stat = os.stat(path)

                    # The start of the file is the current total_size and
                    # the end is that plus the file size
                    start = total_size
                    total_size += stat.st_size
                    end = total_size
                    file_info.append((path, start, end))
                    LOG.debug("Adding %s[%d:%d]", path, start, end)
            except Exception as e:
                LOG.debug("Failed to add %s: %s", path, e)

    # With no data at all there is nothing to pick from, so don't bother
    # with the attempts below
    if total_size <= 0:
        return None

    # Keep trying this until we get something, or until we give up. Most of
    # the time we expect this to work on the first go unless something weird
    # is going on.
    for tries in range(10):
        LOG.debug("Try #%d", tries)

        # Now that we have a list of files, pick one at random by choosing a
        # point somewhere in there. The upper bound is exclusive since an
        # offset equal to total_size lies beyond the last file.
        offset = random.randrange(total_size)
        LOG.debug("Picked offset %d", offset)

        # Now we look for the file which contains that offset
        for (filename, start, end) in file_info:
            if not (start <= offset < end):
                continue

            with open(filename, 'rt') as fh:
                # Jump to the appropriate point in the file, according to
                # the offset (relative to the file's start in the overall
                # set)
                seek_offset = offset - start
                if seek_offset > 0:
                    fh.seek(seek_offset)

                try:
                    # Now look for the bracketing '%'s. Read in a nice big
                    # chunk and hunt for it in there. A failure to decode
                    # the text here is a ValueError too, and so is handled
                    # below.
                    chunk = fh.read(min(10 * self._max_len, 1024 * 1024))

                    # The file could start with a bracketer and we want to
                    # catch that
                    if seek_offset == 0 and chunk.startswith('%\n'):
                        s = 2
                    else:
                        s = chunk.index('\n%\n') + 3

                    # Now look for the end. A properly-formed file should
                    # have a '%\n' as its last line.
                    e = chunk.index('\n%\n', s)

                    # We found a match. Is it small enough?
                    LOG.debug("Found section %s[%d:%d]", filename, s, e)
                    if (e - s) <= self._max_len:
                        # Yes!
                        return chunk[s:e]
                except ValueError:
                    # Failed to find a fortune here
                    pass

            # Only one file can contain the offset, so give up on this try
            # and go around again
            break

    # If we got here then we gave up trying
    return None
def parse_number(words):
    '''
    Turn a set of words into a number. These might be complex ("One thousand
    four hundred and eleven") or simple ("Seven").

    Input which already looks like a number (e.g. C{'42'} or C{'1.5'}) is
    converted directly. Otherwise a leading "minus" or "negative" negates the
    result, and anything after the word "point" is read as a sequence of
    single decimal digits.

    >>> parse_number('one')
    1
    >>> parse_number('one point eight')
    1.8
    >>> parse_number('minus six')
    -6
    >>> parse_number('minus four point seven eight nine')
    -4.789

    @type  words: str
    @param words:
        The string words to parse. E.g. C{'twenty seven'}. Anything which is
        not a string is converted to one first.

    @rtype: int or float
    @return: The parsed value, or C{None} if the input was empty or could not
             be understood as a number.
    '''
    # Sanity
    if words is None:
        return None

    # Make sure it's a string, without any surrounding whitespace. Not a lot
    # we can do if that leaves us with no words.
    words = str(words).strip()
    if not words:
        return None

    # First try to parse the string as an integer or a float directly. Both
    # conversions only ever raise ValueError for a str argument.
    if not re.search(r'\s', words):
        try:
            return int(words)
        except ValueError:
            pass
        try:
            return float(words)
        except ValueError:
            pass

    # Sanitise it since we're now going to attempt to parse it. Collapse
    # multiple spaces to one and strip out non-letters
    words = ' '.join(to_letters(s) for s in re.split(r'\s+', words))
    LOG.debug("Parsing '%s'", words)

    # Recheck for empty
    if words == '':
        return None

    # See if we have to negate the result
    mult = 1
    for neg in ("minus ", "negative "):
        if words.startswith(neg):
            words = words[len(neg):]
            mult = -1
            break

    # Without a "point" in there the words can be parsed directly
    if ' point ' not in words:
        try:
            return mult * _WORDS_TO_NUMBERS.parse(words)
        except Exception as e:
            LOG.error("Failed to parse '%s': %s", words, e)
            return None

    # Otherwise it might be "six point two" or something. Determine the
    # integer and decimal portions.
    (integer, decimal) = words.split(' point ', 1)
    LOG.debug("'%s' becomes '%s' and '%s'", words, integer, decimal)

    # Parsing the whole number is easy enough
    whole = parse_number(integer)
    if whole is None:
        return None

    # Split up the digits to parse them. Each one must be a single whole
    # digit, so things like "one point twenty" are rejected.
    digits = [parse_number(digit) for digit in decimal.split(' ')]
    try:
        if not all(d is not None and 0 <= d <= 9 and d == int(d)
                   for d in digits):
            raise ValueError("Decimal part was not a run of single digits")

        # Okay, use some cheese to parse into a float. This can fail if the
        # whole part came back as something like infinity or NaN.
        fraction = ''.join('%d' % d for d in digits)
        return mult * float('%d.%s' % (whole, fraction))

    except (TypeError, ValueError, OverflowError):
        LOG.error("'%s' was not a valid decimal", words)
        return None
def _handler(self): """ Pulls values from the decoder queue and handles them appropriately. Runs in its own thread. """ # Whether we are skipping the current input gobble = False LOG.info("Started decoding handler") while True: try: # Get a handle on the queue. This will be nulled out when we're # done. queue = self._decode_queue if queue is None: break # Anything? if len(queue) > 0: item = queue.popleft() if item is None: # A None denotes the end of the data so we look to # decode what we've been given if we're not throwing it # away. if gobble: LOG.info("Dropped audio") else: LOG.info("Decoding audio") self._notify(Notifier.WORKING) self._output.append(self._decode()) self._notify(Notifier.IDLE) elif isinstance(item, float): # This is the timestamp of the clip. If it's too old # then we throw it away. age = time.time() - item if int(age) > 0: LOG.info("Upcoming audio clip is %0.2fs old" % (age, )) gobble = age > self._GOBBLE_LIMIT elif isinstance(item, bytes): # Something to feed the decoder if gobble: LOG.debug("Ignoring %d bytes" % len(item)) else: LOG.debug("Feeding %d bytes" % len(item)) self._feed_raw(item) else: LOG.warning("Ignoring junk on decode queue: %r" % (item, )) # Go around again continue except Exception as e: # Be robust but log it LOG.error("Got an error in the decoder queue: %s" % (e, )) # Don't busy-wait time.sleep(0.001) # And we're done! LOG.info("Stopped decoding handler")