def record(self, audio_name: str, max_time: int):
    """Record roughly ``max_time`` seconds of microphone audio and save it.

    Waits for the user to press Enter, opens a 16-bit input stream using the
    instance's ``CHANNEL_NUM``, ``SAMPLE_RATE`` and ``SAMPLE_NUM`` settings,
    reads ``SAMPLE_NUM`` frames per iteration until ``max_time`` seconds have
    elapsed, then hands the collected chunks to ``self._save_wave_file``.

    Args:
        audio_name: Passed through unchanged to ``self._save_wave_file``.
        max_time: Recording duration in seconds.
    """
    input(f"press any key to begin to record {max_time} seconds voice >> ")
    # Keep a handle on the PyAudio instance so PortAudio can be released.
    audio = PyAudio()
    try:
        stream = audio.open(format=paInt16,
                            channels=self.CHANNEL_NUM,
                            rate=self.SAMPLE_RATE,
                            input=True,
                            frames_per_buffer=self.SAMPLE_NUM)
        try:
            my_buf = []
            # Monotonic clock: the duration must not jump if the wall clock
            # is adjusted while recording.
            time_start = time.monotonic()
            last_second = 0
            print(f"time: {last_second} s")
            while True:
                duration = time.monotonic() - time_start
                if duration >= max_time:
                    break
                # Report progress once per elapsed whole second.
                if int(duration) != last_second:
                    last_second = int(duration)
                    print(f"time: {last_second} s")
                my_buf.append(stream.read(self.SAMPLE_NUM))
        finally:
            # Close the stream even if a read fails part-way through.
            stream.close()
    finally:
        audio.terminate()
    self._save_wave_file(audio_name, my_buf)
class Ava(AvaSkills):
    """Voice assistant: waits for a wake phrase, then decodes and runs a command.

    Wake-phrase spotting uses a pocketsphinx keyphrase decoder fed from a
    PyAudio input stream. Commands are transcribed with the
    ``speech_recognition`` package, parsed by a Rasa ``Interpreter``, and
    dispatched to the method on this object named after the detected intent.
    """

    def __init__(self):
        """Load the models, open the microphone stream and start listening.

        Note that construction does not return under normal operation,
        because it ends by calling ``listen_for_wake``, which loops.
        """
        # NOTE(review): ``self`` is passed explicitly in addition to the
        # implicit binding done by super(); kept as-is because
        # AvaSkills.__init__ is not visible here -- confirm it really expects
        # an extra positional argument.
        super().__init__(self)
        self.interpreter = Interpreter.load(settings.RASA_MODEL_DIR)
        # NOTE(review): ``output_device_index`` is set on an input-only
        # stream; possibly ``input_device_index`` was intended -- confirm.
        self.stream = PyAudio().open(format=paInt16,
                                     channels=1,
                                     rate=16000,
                                     input=True,
                                     frames_per_buffer=1024,
                                     output_device_index=0)
        self.config = pocketsphinx.Decoder.default_config()
        self.config.set_string(
            '-hmm', path.join(settings.SPHINX_MODEL_DIR, 'en-us/en-us'))
        self.config.set_string(
            '-dict',
            path.join(settings.SPHINX_MODEL_DIR, 'en-us/cmudict-en-us.dict'))
        self.config.set_string('-keyphrase', settings.WAKE_PHRASE)
        self.config.set_float('-kws_threshold', 1e+20)
        self.config.set_string('-logfn', 'text.log')
        self.decoder = pocketsphinx.Decoder(self.config)
        self.listen_for_wake()

    def listen_for_wake(self):
        """Feed microphone audio to the keyphrase decoder until the stream ends.

        Each time the decoder reports a hypothesis, the wake chime and
        greeting are played, one command is handled via ``listen_for_input``,
        and the decoder utterance is restarted. The process exits if the wake
        chime cannot be played.
        """
        self.stream.start_stream()
        self.decoder.start_utt()
        if not self.play_mp3("startup_greeting.mp3"):
            self.get_tts(
                text=
                f"Hi, my name is Ava. If you need help, just say the wake command: {settings.WAKE_PHRASE}.",
                file_name="startup_greeting.mp3")
        print("Listening for wake word...")
        while True:
            buf = self.stream.read(1024)
            if not buf:
                break
            self.decoder.process_raw(buf, False, False)
            if self.decoder.hyp() is not None:
                print(f"Key phrase '{settings.WAKE_PHRASE}' detected...")
                if not self.play_mp3("wake_chime.mp3"):
                    # Same effect as the site-provided exit(), but does not
                    # depend on the ``site`` module having been loaded.
                    raise SystemExit
                if not self.play_mp3("wake_greeting.mp3"):
                    self.get_tts(text="How can I help?",
                                 file_name="wake_greeting.mp3")
                self.listen_for_input()
                # Restart the utterance so the previous hypothesis is cleared.
                self.decoder.end_utt()
                print("Waiting for wakeup ...")
                self.decoder.start_utt()

    def get_tts(self, text, file_name, save=True):
        """Synthesize ``text`` with Amazon Polly, play it and optionally save it.

        Args:
            text: Text to speak.
            file_name: File name, appended to ``settings.MEDIA_DIR``, used
                when saving.
            save: When true, export the synthesized audio as an mp3 file.
        """
        print("Converting text to speech...")
        polly_client = boto3.Session(
            aws_access_key_id=keys.POLLY_ACCESS_KEY_ID,
            aws_secret_access_key=keys.POLLY_SECRET_ACCESS_KEY,
            region_name='us-west-2').client('polly')
        response = polly_client.synthesize_speech(VoiceId='Joanna',
                                                  OutputFormat='pcm',
                                                  SampleRate="16000",
                                                  Text=text)
        data = response['AudioStream'].read()
        self.play_byte(data)
        if save:
            print("Saving to mp3 file...")
            AudioSegment(data=data,
                         sample_width=2,
                         frame_rate=16000,
                         channels=1).export(out_f=settings.MEDIA_DIR +
                                            file_name,
                                            format="mp3")

    def play_byte(self, stream):
        """Play raw audio bytes as 16-bit, 16 kHz, mono; log playback errors.

        Args:
            stream: Raw PCM bytes to play.
        """
        try:
            print("Playing byte stream...")
            play(
                AudioSegment(data=stream,
                             sample_width=2,
                             frame_rate=16000,
                             channels=1))
        except Exception as e:
            # Best effort: a playback failure is reported, not propagated.
            print("Error playing file...", e)

    def play_mp3(self, audio_file, save=True):
        """Play an mp3 file from ``settings.MEDIA_DIR``.

        Args:
            audio_file: File name, appended to ``settings.MEDIA_DIR``.
            save: Unused; kept so existing callers keep working.

        Returns:
            True if the file existed and was played, False if it is missing
            or playback raised.
        """
        file_path = settings.MEDIA_DIR + audio_file
        if not path.isfile(file_path):
            print("Audio file not found...")
            return False
        try:
            print("Playing audio...")
            play(AudioSegment.from_mp3(file_path))
        except Exception as e:
            print("Error playing file...", e)
            return False
        return True

    def _play_decode_error(self):
        """Play the cached "could not understand" message, synthesizing it if absent."""
        if not self.play_mp3("decode_error.mp3"):
            self.get_tts(
                text=
                "I'm sorry, but I was not able to understand that command.",
                file_name="decode_error.mp3")

    def listen_for_input(self):
        """Capture one spoken command, transcribe it and process its intent.

        Google speech recognition is tried first. If the request itself fails
        (``RequestError``), the offline Sphinx recognizer is used as a backup.
        Any successfully obtained transcription, from either recognizer, is
        passed to ``process_input_intent``.
        """
        sr = speech_recognition.Recognizer()
        mic = speech_recognition.Microphone()
        hyp = None
        with mic as source:
            sr.adjust_for_ambient_noise(source)
            try:
                print("Listening...")
                audio = sr.listen(source, timeout=2)
            except speech_recognition.WaitTimeoutError:
                print("No input detected timeout...")
                return
            print("Decoding...")
            try:
                hyp = sr.recognize_google(audio)
            except speech_recognition.UnknownValueError:
                self._play_decode_error()
            except speech_recognition.RequestError as e:
                print("Google request error: ", e)
                print("Running backup decode Sphinx...")
                try:
                    # The backup must use a different recognizer: retrying
                    # Google here would only raise the same RequestError.
                    hyp = sr.recognize_sphinx(audio)
                except (speech_recognition.UnknownValueError,
                        speech_recognition.RequestError) as e:
                    print("Sphinx recognition error: ", e)
                    self._play_decode_error()
            # Checked after the handlers so that a successful backup decode
            # is processed as well.
            if hyp is not None:
                t0 = time()
                self.process_input_intent(hyp)
                print(f"Time to process command: {time() - t0}")

    def process_input_intent(self, hypothesis):
        """Parse a transcription and run the method named after its intent.

        Args:
            hypothesis: Transcribed command text.
        """
        print("Processing intent...")
        print(f"HYPOTHESIS:{hypothesis}")
        result = self.interpreter.parse(hypothesis)
        intent = result['intent']['name']
        confidence = result['intent']['confidence']
        print(f"INTENT: {intent}")
        print(f"CONFIDENCE_SCORE: {confidence}")
        try:
            print("Executing intent action...")
            # The intent name selects the handler method on this object.
            response = getattr(self, intent)(result)
            print(f"RESULT: {response}")
            if response:
                self.respond_intent_result(response)
        except Exception as e:
            print(f"Failed intent action...\n\tERROR: {e} ")

    def respond_intent_result(self, result):
        """Speak an intent handler's result, preferring a cached mp3.

        Args:
            result: Mapping with ``'tts'`` (text to speak), ``'file'`` (mp3
                file name) and ``'save'`` (whether to save synthesized audio).
        """
        print(f"RESPONSE: {result['tts']}")
        if not self.play_mp3(result['file']):
            self.get_tts(text=result['tts'],
                         file_name=result['file'],
                         save=result['save'])
def audio_freq():
    """Continuously estimate the dominant microphone pitch and record its note.

    Audio is read in frames of ``frame_size`` samples into a sliding buffer of
    ``samples_per_fft`` samples. After every frame the windowed buffer is
    transformed with a real FFT, the strongest bin within the configured note
    range is converted to a frequency, and ``freq_save(get_note(freq))`` is
    called. The loop runs for as long as the stream reports itself active;
    the audio resources are released on exit, including on error.
    """
    # Boundary notes (original author's note: six-string guitar in D tuning).
    note_min = 60  # C of the 4th octave
    note_max = 71  # B of the 4th octave
    sample_freq = 22050  # Sampling frequency in hertz

    # Increasing these constants changes how quickly the frequency updates.
    frame_size = 2048  # Samples per frame
    frames_per_fft = 16  # Frames covered by each FFT
    samples_per_fft = frame_size * frames_per_fft  # Samples per FFT
    freq_step = sample_freq / samples_per_fft  # Frequency step per FFT bin

    def note_to_fftbin(n):
        # Note number -> frequency (note 69 = 440 Hz) -> fractional FFT bin.
        return 440 * 2.0 ** ((n - 69) / 12.0) / freq_step

    # FFT bin range covering the notes of interest, one note of margin on
    # each side.
    imin = max(0, int(numpy.floor(note_to_fftbin(note_min - 1))))
    imax = min(samples_per_fft, int(numpy.ceil(note_to_fftbin(note_max + 1))))

    # Sliding sample buffer for the FFT.
    buf = numpy.zeros(samples_per_fft, dtype=numpy.float32)

    # Hann window function.
    window = 0.5 * (1 - numpy.cos(
        numpy.linspace(0, 2 * numpy.pi, samples_per_fft, False)))

    # Keep the PyAudio instance: terminate() lives on it, not on the stream.
    audio = PyAudio()
    stream = None
    try:
        # Open the audio stream.
        stream = audio.open(format=paInt16,
                            channels=1,
                            rate=sample_freq,
                            input=True,
                            frames_per_buffer=frame_size)
        stream.start_stream()

        # Keep reading while the stream is open.
        while stream.is_active():
            # Shift the buffer left and append the new frame.
            buf[:-frame_size] = buf[frame_size:]
            # frombuffer replaces the deprecated binary mode of fromstring.
            buf[-frame_size:] = numpy.frombuffer(stream.read(frame_size),
                                                 numpy.int16)

            # Run the FFT on the windowed buffer.
            fft = numpy.fft.rfft(buf * window)

            # Strongest frequency within the note range.
            freq = (numpy.abs(fft[imin:imax]).argmax() + imin) * freq_step

            # Save the note representation (get_note); per the original
            # comment, freq_save writes it to freqs.txt.
            freq_save(get_note(freq))
    finally:
        # Release the audio resources properly.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()
class SoundData:
    '''
    Capture microphone audio and extract dominant frequencies and notes.
    '''

    def __init__(self, chunk=1024, rate=44100):
        '''
        Initialize a SoundData object.

        Args:
            chunk (int) : number of samples grouped together
                default: 1024
            rate (int) : sampling frequency in Hz
                default: 44100
        '''
        self.chunk = chunk
        self.rate = rate
        self.buffer = None
        # Keep the PyAudio handle so it can be reused and released later
        # instead of creating untracked instances.
        self._pa = PyAudio()
        # Create an audio stream object from the microphone using PyAudio
        self.audio_stream = self._pa.open(format=paInt16,
                                          channels=1,
                                          rate=rate,
                                          input=True,
                                          frames_per_buffer=chunk)

    def close(self):
        '''
        Close the microphone stream and release the PyAudio handle.
        '''
        self.audio_stream.close()
        self._pa.terminate()

    def _write_stream_to_file(self, filename, data):
        '''
        Write contents of data to a Wave file.

        Args:
            filename (str) : name of Wave file to be written to
            data (list) : mono audio signal
        '''
        # Open the Wave file in binary write mode; the context manager closes
        # it even if writing fails.
        with wave.open(f'./assets/{filename}.wav', 'wb') as wave_file:
            # Set details of the data being written
            wave_file.setnchannels(1)
            wave_file.setsampwidth(self._pa.get_sample_size(paInt16))
            wave_file.setframerate(self.rate)
            # Convert the list into a binary string and (over)write to the
            # Wave file
            wave_file.writeframes(b''.join(data))

    def _framing(self, data):
        '''
        Transform audio signal into a series of overlapping frames.
        A frame (sample) is the amplitude at a point in time.

        Args:
            data (list) : mono audio signal

        Returns:
            frames (list) : all the frames
            frame_length (int) : length of each frame
        '''
        # Frame length = (window length) * (rate), .025 secs chosen arbitrarily
        frame_length = int(.025 * self.rate)
        # Used to convert from seconds to samples, .01 secs between windows
        # chosen arbitrarily
        frame_step = int(.01 * self.rate)
        signal_length = len(data)
        # Number of frame starts needed to span the signal. This is zero when
        # the signal is exactly one frame long.
        number_of_frames = int(
            np.ceil(abs(signal_length - frame_length) / frame_step))

        # Find indices: each row holds the sample positions of one frame,
        # i.e. (offsets within a frame) + (start position of that frame).
        index_a = np.tile(np.arange(0, frame_length), (number_of_frames, 1))
        index_b = np.transpose(
            np.tile(np.arange(0, number_of_frames * frame_step, frame_step),
                    (frame_length, 1)))
        indices = index_a + index_b

        # Pad out the signal with zeros so every index is valid
        padding_amount = number_of_frames * frame_step + frame_length
        padding = np.zeros((padding_amount - signal_length))
        padded_buffer = np.append(data, padding)

        frames = padded_buffer[indices.astype(np.int32, copy=False)]
        return frames, frame_length

    def _get_dominant_frequency(self, frame):
        '''
        Find the dominant frequency of a single frame.

        Args:
            frame (numpy.ndarray) : amplitude information at a point in time

        Returns:
            (float) : dominant frequency value (see note below on scaling)
        '''
        nfft = 2**14  # Fast fourier transform points to be calculated
        # Perform a fast fourier transform on a real input
        fourier_transform = np.fft.rfft(frame, nfft)
        magnitude_spectrum = (1 / nfft) * abs(fourier_transform)
        power_spectrum = (1 / nfft)**2 * magnitude_spectrum**2
        # NOTE(review): the axis is built from fftfreq over the rfft output
        # length, then floor-halved and shifted by 1, rather than from
        # rfftfreq(nfft). This is kept exactly as it was because
        # get_note_from_frequency treats the value 1.0 (the zero-frequency
        # bin after the +1 shift) as 'rest' -- confirm before changing.
        frequencies = np.fft.fftfreq(len(power_spectrum), 1 / self.rate)
        # Filter out negative frequencies, floor-divide by 2 and add 1
        frequencies = (frequencies[np.where(frequencies >= 0)] // 2) + 1
        # Keep only the spectrum bins that have a matching frequency entry
        power_spectrum = power_spectrum[:len(frequencies)]
        # .argmax() returns the index of the maximum value
        peak_index = np.argmax(power_spectrum)
        return frequencies[peak_index]

    def stream(self, time=.1):
        '''
        Update audio stream buffer.

        Args:
            time (float) : length of audio stream buffer in seconds
                default: 0.1
        '''
        # To record (time) seconds into the buffer, we must take (rate)*(time)
        # samples. In each iteration (chunk) samples are taken, so we must
        # loop (rate)*(time)/(chunk) times.
        chunk_count = int(self.rate / self.chunk * time)
        buffer_hex = [
            self.audio_stream.read(self.chunk) for _ in range(chunk_count)
        ]
        self._write_stream_to_file('buffer', buffer_hex)
        self.rate, self.buffer = wavfile.read('./assets/buffer.wav')

    def get_dominant_frequencies(self):
        '''
        Analyse the buffer data to find the dominant frequencies.

        Returns:
            dominant_frequencies (list) : list of the dominant frequencies
                identified
        '''
        # Perform framing on the signal
        frames, frame_length = self._framing(self.buffer)
        # Perform Hamming window function on the frames
        # w(n) = .54 - .46*cos((2*(pi)*n)/(M-1)) , 0 <= n <= M-1
        # where M = number of points in the output window
        windows = frames * np.hamming(frame_length)
        # Find the dominant frequency for each frame
        dominant_frequencies = np.array(
            [self._get_dominant_frequency(window) for window in windows])
        # Round to three decimal places
        dominant_frequencies = np.round(dominant_frequencies, 3)
        # Remove all duplicate values
        return np.unique(dominant_frequencies)

    def get_note_from_frequency(self, notes_dict, frequencies):
        '''
        Convert a list of frequencies into their likeliest music note.

        Args:
            notes_dict (dict) : dictionary of notes and their associated
                frequencies
            frequencies (list) : list of frequencies

        Returns:
            note (str) : single note or None if no note identified
        '''
        frequencies = np.asarray(frequencies)
        # If 1.0 is a dominant frequency assume it is background noise
        if 1.0 in frequencies.tolist():
            return 'rest'

        closest_note = None
        closest_weight = None
        for note, target in notes_dict.items():
            weight = 0
            for freq in frequencies:
                # The distance is 0 when freq and value differ by a whole
                # number of octaves (their ratio is a power of two).
                min_distance_from_target = min(
                    abs(100 * round(
                        np.sin((np.pi / np.log(2)) * np.log(freq / value)),
                        4)) for value in target)
                # Reward an exact match instead of merely not penalising it
                if not min_distance_from_target:
                    min_distance_from_target = -100
                weight += min_distance_from_target
            # The lowest total weight wins; ties keep the earlier note
            if closest_weight is None or weight < closest_weight:
                closest_note, closest_weight = note, weight
        return closest_note