def signalMixer(data1: np.ndarray, data2: np.ndarray, saveFilePath="./mixed.wav"):
    """Combine two signals into one segment and save the result as a WAV file.

    Each input is wrapped in a ``pydub.AudioSegment`` using the sample width,
    frame rate and channel count taken from ``Cai``. The two segments are
    joined with ``+`` and the result is exported to ``saveFilePath``.

    Args:
        data1: Samples of the first signal.
        data2: Samples of the second signal.
        saveFilePath: Destination path of the exported WAV file.
    """
    audio_params = dict(
        sample_width=Cai.sampleWidthInBytes,
        frame_rate=Cai.frameRate,
        channels=Cai.numberOfChannels,
    )
    first = pydub.AudioSegment(data1, **audio_params)
    second = pydub.AudioSegment(data2, **audio_params)
    # NOTE(review): the arrays are handed to pydub unconverted, and pydub
    # documents ``+`` between segments as concatenation -- confirm that the
    # result is the intended "mix" before relying on it.
    (first + second).export(saveFilePath, format="wav")
    Logger.info("saved mixed audio file at {}".format(saveFilePath))
def record(*, target_segment_duration: int = 5, output_queue: multiprocessing.Queue):
    """Record audio continuously and emit fixed-length segments on a queue.

    Audio is read from the stream returned by ``make_stream`` in chunks of a
    tenth of a second at 44.1 kHz, mono. Every ``target_segment_duration``
    seconds' worth of chunks is joined into one ``pydub.AudioSegment``
    (interpreted as 16-bit samples) and put on ``output_queue``.

    This function never returns; it loops until an exception is raised (for
    example by ``stream.read`` on input overflow).

    NOTE(review): an earlier docstring said segments are written as .ts files
    with a master.m3u8 playlist. Nothing in this function writes files --
    presumably the consumer of ``output_queue`` does; confirm.

    Args:
        target_segment_duration: Length of each emitted segment, in seconds.
        output_queue: Queue that receives the recorded segments.
    """
    rate = 44100
    channels = 1
    chunk_size = rate // 10
    stream = make_stream(chunk_size, rate, channels=channels)
    # The number of chunks per segment never changes, so compute it once.
    chunks_per_segment = round(target_segment_duration / (chunk_size / rate))
    while True:
        frames = [
            stream.read(chunk_size, exception_on_overflow=True)
            for _ in range(chunks_per_segment)
        ]
        segment = pydub.AudioSegment(
            data=b"".join(frames),
            sample_width=2,
            frame_rate=rate,
            channels=channels,
        )
        output_queue.put(segment)
def _cached_get_segment_audio_data(audio_file_name, database_id, fs, start, end):
    """Return an HTTP response containing a compressed excerpt of a WAV file.

    The excerpt between ``start`` and ``end`` is read (mono, not normalised)
    from ``audio/wav/<database_id>/<audio_file_name>.wav``, passed through
    ``_match_target_amplitude`` and encoded in
    ``settings.AUDIO_COMPRESSED_FORMAT``.

    Args:
        audio_file_name: Name of the audio file without the ``.wav`` suffix.
        database_id: Identifier used as the sub-directory of ``audio/wav``.
        fs: Frame rate assigned to the excerpt.
        start: Start of the excerpt, forwarded to ``wavfile.read_segment``.
        end: End of the excerpt, forwarded to ``wavfile.read_segment``.

    Returns:
        An ``HttpResponse`` whose body is the encoded audio, with
        ``Content-Type`` and ``Content-Length`` headers set.
    """
    wav_dir = 'audio/wav/{}'.format(database_id)
    wav_name = '{}.wav'.format(audio_file_name)
    samples = wavfile.read_segment(
        data_path(wav_dir, wav_name), start, end, normalised=False, mono=True
    )

    excerpt = _match_target_amplitude(
        pydub.AudioSegment(
            samples.tobytes(),
            frame_rate=fs,
            sample_width=samples.dtype.itemsize,
            channels=1,
        )
    )

    with io.BytesIO() as buffer:
        excerpt.export(buffer, format=settings.AUDIO_COMPRESSED_FORMAT)
        payload = buffer.getvalue()

    response = HttpResponse()
    response.write(payload)
    response['Content-Type'] = 'audio/' + settings.AUDIO_COMPRESSED_FORMAT
    response['Content-Length'] = len(payload)
    return response
def make_spectrogram(self, export=False, filename="{}/spectrogram.png".format(FIGURE_DIR)):
    """Compute the log-mel spectrogram of ``self.tape``.

    The tape is wrapped in a mono ``pydub.AudioSegment``, converted to a
    float32 waveform, transformed with an STFT and projected onto
    ``NUM_MEL_BINS`` mel bands before taking the logarithm.

    Args:
        export: When true, also render the spectrogram as a heatmap and save
            it to ``filename``.
        filename: Target path of the exported figure.

    Returns:
        The log-mel spectrogram tensor (leading dimension of size 1).
    """
    tape = self.tape
    segment = pydub.AudioSegment(
        tape.tobytes(),
        frame_rate=FRAME_RATE,
        channels=1,
        sample_width=tape.dtype.itemsize,
    )
    samples = np.array(segment.get_array_of_samples(), dtype=np.float32)

    # Add a batch dimension of one before the STFT.
    batched = tf.reshape(samples, [1, -1])
    magnitudes = tf.abs(
        tf.contrib.signal.stft(
            batched,
            frame_length=FFT_FRAME_LENGTH,
            frame_step=FFT_FRAME_STEP,
            fft_length=FFT_LENGTH,
        )
    )

    mel_weights = tf.contrib.signal.linear_to_mel_weight_matrix(
        NUM_MEL_BINS,
        magnitudes.shape[-1].value,
        FRAME_RATE,
        LOWER_EDGE_HERTZ,
        UPPER_EDGE_HERTZ,
    )
    mel = tf.tensordot(magnitudes, mel_weights, 1)
    # The epsilon keeps the logarithm finite for empty bins.
    log_mel = tf.log(mel + tf.keras.backend.epsilon())

    if export:
        heatmap = sns.heatmap(np.swapaxes(log_mel.numpy()[0], 0, 1))
        heatmap.get_figure().savefig(filename)
        plt.close('all')
    return log_mel
def start(self):
    """Run the metering loop until a graceful stop is requested.

    Sets ``self.num_frames`` from the segment length, optionally arms an
    interval timer (``self.seconds``), then repeatedly advances the
    ``self.record()`` generator, measures the RMS of the recorded data and
    feeds it to the collector, meter, trigger action and monitor.
    ``self.stop()`` is called both on normal termination and when
    ``StopException`` is raised.
    """
    segment_length = self.segment or AUDIO_SEGMENT_LENGTH
    self.num_frames = int(RATE / FRAMES_PER_BUFFER * segment_length)

    if self.seconds:
        signal.setitimer(signal.ITIMER_REAL, self.seconds)
    if self.verbose:
        self._timer = time.time()
    if self.collect:
        print('Collecting RMS values...')
    if self.action:
        # Interpret threshold
        self.get_threshold()

    try:
        self.is_running = True
        recorder = self.record()
        while not self._graceful:
            # Each send() records one segment into self.output.
            recorder.send(True)
            rms = pydub.AudioSegment(self.output.getvalue()).rms
            if self.collect:
                self.collect_rms(rms)
            self.meter(rms)
            if self.action and self.is_triggered(rms):
                self.execute(rms)
            self.monitor(rms)
        self.is_running = False
        self.stop()
    except self.__class__.StopException:
        self.is_running = False
        self.stop()
def write(path, sr, x, codec, normalized=False):
    """Export a numpy array as a mono audio file in the given codec.

    Args:
        path: Destination passed to ``AudioSegment.export``.
        sr: Frame rate of the audio.
        x: Array whose raw bytes are used as the samples; the sample width is
            taken from ``x.dtype.itemsize``.
        codec: Output format passed to ``AudioSegment.export``.
        normalized: Accepted for signature compatibility; not used here.
    """
    pcm_bytes = x.tobytes()
    bytes_per_sample = x.dtype.itemsize
    mono = pydub.AudioSegment(
        pcm_bytes,
        frame_rate=sr,
        sample_width=bytes_per_sample,
        channels=1,
    )
    mono.export(path, format=codec)
def toAudio(self, rate, signal, channels):
    """Build a ``pydub.AudioSegment`` from the first column of ``signal``.

    Args:
        rate: Frame rate of the audio.
        signal: Two-dimensional array; only ``signal[:, 0]`` is used.
        channels: Channel count declared on the resulting segment.

    Returns:
        The constructed ``pydub.AudioSegment``.
    """
    first_column = signal[:, 0]
    # NOTE(review): a single column is exported while ``channels`` is passed
    # through unchanged -- confirm callers only use channels=1.
    return pydub.AudioSegment(
        first_column.tobytes(),
        frame_rate=rate,
        sample_width=first_column.dtype.itemsize,
        channels=channels,
    )
def write(f, sr, x, normalized=False):
    """Export a numpy array as a 320 kbit/s MP3.

    Args:
        f: Destination passed to ``AudioSegment.export``.
        sr: Frame rate of the audio.
        x: Samples. An array of shape ``(n, 2)`` is written as stereo,
            anything else as mono.
        normalized: When true, ``x`` holds floats nominally in [-1, 1) that
            are scaled to the int16 range; otherwise ``x`` is cast to int16
            as is.
    """
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:
        # Clip before casting: a sample at exactly 1.0 (or slightly beyond)
        # scales to 32768, which does not fit in int16 and would otherwise
        # wrap around to a full-scale negative value.
        y = np.clip(x * 2 ** 15, -(2 ** 15), 2 ** 15 - 1).astype(np.int16)
    else:
        y = np.int16(x)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    song.export(f, format="mp3", bitrate="320k")
def create_audio_segments_udf(
    audio_bytes_series: pd.Series,
    audio_type_series: pd.Series,
    audio_name_series: pd.Series,
    start_ms_array_series: pd.Series,
    end_ms_array_series: pd.Series,
    output_audio_codec_series: pd.Series,
) -> pd.DataFrame:
    """Cut each audio file into segments and re-encode every segment.

    For every row the MP3 bytes are decoded with ``DecodeToRawPipe``, wrapped
    in a mono, 16 kHz, 16-bit ``pydub.AudioSegment`` and sliced at the
    ``(start_ms, end_ms)`` pairs. Each slice is encoded with
    ``EncodeFromRawPipe`` using the row's output codec. Segment names come
    from ``create_audio_segment_names_udf``.

    Args:
        audio_bytes_series: Encoded audio bytes, one entry per file.
        audio_type_series: Audio type per file; only ``"mp3"`` is accepted.
        audio_name_series: Name of each file.
        start_ms_array_series: Per file, the segment start offsets (ms).
        end_ms_array_series: Per file, the segment end offsets (ms).
        output_audio_codec_series: Per file, the codec for the segments.

    Returns:
        A DataFrame with one row per input file and the columns
        ``audio_name`` and ``audio`` (lists of equal length).

    Raises:
        AssertionError: If the series differ in length, a type is not
            ``"mp3"``, or a slice's duration differs from the requested
            duration by more than 1 ms in either direction.
    """
    output_array = []
    assert (
        len(audio_bytes_series)
        == len(audio_type_series)
        == len(audio_name_series)
        == len(start_ms_array_series)
        == len(end_ms_array_series)
        == len(output_audio_codec_series)
    )
    for (
        audio_bytes,
        audio_type,
        _audio_name,
        start_ms_array,
        end_ms_array,
        output_audio_codec,
    ) in zip(
        audio_bytes_series,
        audio_type_series,
        audio_name_series,
        start_ms_array_series,
        end_ms_array_series,
        output_audio_codec_series,
    ):
        assert audio_type == "mp3"
        decoded_bytes = DecodeToRawPipe(audio_bytes, audio_type)
        audio_segment = pydub.AudioSegment(
            decoded_bytes, frame_rate=16_000, sample_width=2, channels=1
        )
        segmented_audio = {"audio_name": [], "audio": []}
        for start_ms, end_ms in zip(start_ms_array, end_ms_array):
            chunk = audio_segment[start_ms:end_ms]
            # bytes / frame rate / bytes per sample -> seconds -> milliseconds
            chunk_ms = (len(chunk.raw_data) / 16_000 / 2) * 1000
            # The absolute value must wrap the whole difference; previously it
            # wrapped only the first operand, so slices that came out too
            # short were never detected.
            assert abs(chunk_ms - (end_ms - start_ms)) <= 1.0, (
                f"{chunk_ms} vs. {end_ms - start_ms}"
            )
            segmented_audio["audio"].append(
                EncodeFromRawPipe(chunk.raw_data, output_audio_codec)
            )
        output_array.append(segmented_audio)

    audio_segment_names_series = create_audio_segment_names_udf.func(
        audio_name_series, end_ms_array_series.transform(len), output_audio_codec_series
    )
    assert len(output_array) == len(audio_segment_names_series)
    for i, audio_segment_names in enumerate(audio_segment_names_series):
        output_array[i]["audio_name"] = audio_segment_names
        assert len(output_array[i]["audio_name"]) == len(output_array[i]["audio"])
    return pd.DataFrame(output_array)
def write(self, f, sr, x, normalized=False):
    """Export a numpy array as a 320 kbit/s MP3.

    Args:
        f: Destination passed to ``AudioSegment.export``.
        sr: Frame rate of the audio.
        x: Samples. An array of shape ``(n, 2)`` is written as stereo,
            anything else as mono.
        normalized: When true, ``x`` holds floats nominally in [-1, 1) that
            are scaled to the int16 range; otherwise ``x`` is cast to int16
            as is.
    """
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:
        # Clip before casting: a sample at exactly 1.0 (or slightly beyond)
        # scales to 32768, which does not fit in int16 and would otherwise
        # wrap around to a full-scale negative value.
        y = np.clip(x * 2**15, -(2**15), 2**15 - 1).astype(np.int16)
    else:
        y = np.int16(x)
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    song.export(f, format="mp3", bitrate="320k")
def save_output(output_filepath, stereo_array):
    '''Writes a stereo array to the output filepath.

    Accepts .wav, .mp3, .ogg, or anything else supported by ffmpeg; the
    export format is the text after the last "." in ``output_filepath``.

    The array is reshaped to two columns, one per channel. Each channel is
    scaled by 32768 and stored as 16-bit samples at 44.1 kHz.
    NOTE(review): this presumes float samples in roughly [-1, 1] -- confirm
    against callers.

    Args:
        output_filepath: Destination path; its extension selects the format.
        stereo_array: Array of samples reshapeable to ``(-1, 2)``.
    '''
    per_channel = stereo_array.reshape((-1, 2)).T
    channel_segments = [
        pydub.AudioSegment(
            # Clip before casting so a sample at exactly 1.0 saturates at the
            # int16 maximum instead of overflowing to a negative value.
            np.clip(channel * 32768, -32768, 32767).astype(np.int16).tobytes(),
            frame_rate=44100,
            sample_width=np.dtype(np.int16).itemsize,
            channels=1,
        )
        for channel in per_channel
    ]
    stereo = pydub.AudioSegment.from_mono_audiosegments(*channel_segments)
    stereo.export(output_filepath, format=output_filepath.split('.')[-1])
def play_byte_stream(self, data):
    """Play raw PCM bytes, stopping any playback already in progress.

    The bytes are interpreted as 16-bit stereo PCM at 16 kHz.

    Args:
        data: Raw PCM bytes to play.

    Returns:
        ``0`` when no audio device is configured, otherwise ``None``.
    """
    if not self._audio_device:
        return 0

    self.stop_playback()
    pcm_format = {
        "sample_width": 2,    # 16 bit pcm
        "frame_rate": 16000,  # sample rate
        "channels": 2,        # stereo signal
    }
    play(pydub.AudioSegment(data=data, **pcm_format))
def _write_wave_file(np_audio, path):
    """Write the samples in `np_audio` to `path` as a WAV file.

    A 2-D array is written with ``np_audio.shape[1]`` channels, anything else
    as a single channel. The file is written with 16-bit samples and a frame
    rate of 1.
    """
    num_channels = 1 if len(np_audio.shape) != 2 else np_audio.shape[1]
    # An empty segment only carries the audio parameters; the real samples
    # are attached through _spawn below.
    template = pydub.AudioSegment(
        b'',
        sample_width=2,
        channels=num_channels,
        frame_rate=1,
    )
    # See documentation for _spawn usage:
    # https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples
    flat_samples = array.array(template.array_type, np_audio.reshape((-1, )))
    template._spawn(flat_samples).export(path, format='wav')
def get_audiosegment(audio_file, desired_framerate, format="mp3"):
    """Load an audio file and rebuild it at the desired frame rate.

    The file is decoded with pydub, converted by ``convert_samplerate`` and
    wrapped again as a mono segment with a sample width of 2 bytes.

    Args:
        audio_file: File to load, passed to ``AudioSegment.from_file``.
        desired_framerate: Frame rate of the returned segment.
        format: Container/codec of ``audio_file``.

    Returns:
        A tuple ``(segment, duration_in_seconds)``.
    """
    source = pydub.AudioSegment.from_file(audio_file, format=format)
    logger.debug(
        f"Audio duration before framerate change: {source.duration_seconds}")
    converted = convert_samplerate(source, desired_framerate)
    # Drop the decoded original before building the converted segment.
    del source

    resampled = pydub.AudioSegment(
        converted,
        sample_width=2,
        channels=1,
        frame_rate=desired_framerate,
    )
    duration = resampled.duration_seconds
    logger.debug(f"Audio duration after change: {duration}")
    return resampled, duration
def ApplyImpulseResponse(cls, signal, impulse_response):
    """Convolves a mono signal with an impulse response.

    Args:
      signal: AudioSegment instance (must have exactly one channel).
      impulse_response: list or numpy vector of float values.

    Returns:
      AudioSegment instance holding the full convolution, with the same
      sample width, frame rate, frame width and channel count as `signal`.
    """
    assert signal.channels == 1, (
        'multiple-channel recordings not supported')
    samples = signal.get_array_of_samples()

    logging.info(
        'applying %d order impulse response to a signal lasting %d ms',
        len(impulse_response), len(signal))
    convolution = scipy.signal.fftconvolve(
        in1=samples, in2=impulse_response, mode='full')
    # NOTE(review): the cast to int16 does not clip -- confirm the impulse
    # responses in use cannot push samples out of range.
    convolution = convolution.astype(np.int16)
    logging.info('convolution computed')

    # Repack as an array of the signal's native sample type.
    convolved_samples = array.array(signal.array_type, convolution)

    logging.debug('signal length: %d samples', len(samples))
    logging.debug('convolved signal length: %d samples',
                  len(convolved_samples))
    assert len(convolved_samples) > len(samples)

    metadata = {
        'sample_width': signal.sample_width,
        'frame_rate': signal.frame_rate,
        'frame_width': signal.frame_width,
        'channels': signal.channels,
    }
    convolved_signal = pydub.AudioSegment(
        data=convolved_samples, metadata=metadata)
    assert len(convolved_signal) > len(signal)

    return convolved_signal
def stream_utterance(self, audio_stream):
    """Yield voiced utterances found in a stream of audio frames.

    Every incoming frame is converted to a mono, 16-bit, 16 kHz
    ``pydub.AudioSegment`` and classified with ``is_frame_voice``. Voiced
    frames are accumulated in a voice buffer, the others in a silence buffer.
    The voice buffer is yielded when

    * its duration reaches ``self.max_utt`` milliseconds, or
    * the silence reaches ``self.max_sil`` milliseconds and the voice buffer
      holds at least ``self.min_utt`` milliseconds, or
    * the stream ends while the voice buffer is non-empty.

    Args:
        audio_stream: Iterable of frames exposing ``to_ndarray()``,
            ``sample_rate``, ``layout.channels`` and ``format.bytes``.

    Yields:
        ``pydub.AudioSegment`` objects containing voiced audio.
    """
    new_buffer = pydub.AudioSegment.empty
    silence_buffer = new_buffer()
    voice_buffer = new_buffer()
    # NOTE(review): this flag is written below but never influences control
    # flow in this method.
    silence_threshold = False

    for frame in audio_stream:
        frame_bytes = frame.to_ndarray().tobytes()
        raw_segment = pydub.AudioSegment(
            data=frame_bytes,
            frame_rate=frame.sample_rate,
            channels=len(frame.layout.channels),
            sample_width=frame.format.bytes,
        )
        chunk = raw_segment.set_channels(1).set_sample_width(2).set_frame_rate(16000)

        if is_frame_voice(self.vad, chunk, self.chunk_dur):
            silence_threshold = False
            voice_buffer += chunk
            silence_buffer = new_buffer()
        else:
            silence_buffer += chunk

        # Both durations are measured once, before any buffer is reset.
        voiced_ms = voice_buffer.duration_seconds * 1000
        silent_ms = silence_buffer.duration_seconds * 1000

        if voiced_ms >= self.max_utt:
            yield voice_buffer
            voice_buffer = new_buffer()

        if silent_ms >= self.max_sil:
            if voiced_ms >= self.min_utt:
                yield voice_buffer
            # Clear the voice buffer whenever the silence limit is reached,
            # whether or not it was long enough to be yielded.
            voice_buffer = new_buffer()
            if not silence_threshold:
                silence_threshold = True

    if voice_buffer:
        yield voice_buffer
def Copy(cls, signal):
    """Makes a copy of a signal.

    Args:
      signal: AudioSegment instance.

    Returns:
      An AudioSegment instance built from the samples of `signal`, with the
      same sample width, frame rate, frame width and channel count.
    """
    samples = signal.get_array_of_samples()
    metadata = {
        'sample_width': signal.sample_width,
        'frame_rate': signal.frame_rate,
        'frame_width': signal.frame_width,
        'channels': signal.channels,
    }
    return pydub.AudioSegment(data=samples, metadata=metadata)
def write(f, x, sr=44100, normalized=True):
    """Export a numpy array as a 64 kbit/s MP3.

    Args:
        f: Destination passed to ``AudioSegment.export``.
        x: Samples. An array of shape ``(n, 2)`` is written as stereo,
            anything else as mono.
        sr: Frame rate of the audio.
        normalized: When true, ``x`` holds floats nominally in [-1, 1) that
            are scaled to the int16 range; otherwise ``x`` is cast to int16
            as is.
    """
    # Check the amount of channels that we need to convert the file to
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:
        # Scale to int16. Clip before casting: a sample at exactly 1.0 (or
        # slightly beyond) scales to 32768, which does not fit in int16 and
        # would otherwise wrap around to a full-scale negative value.
        y = np.clip(x * 2**15, -(2**15), 2**15 - 1).astype(np.int16)
    else:
        y = np.int16(x)
    # Convert the array to mp3 with pydub.
    song = pydub.AudioSegment(y.tobytes(), frame_rate=sr, sample_width=2, channels=channels)
    song.export(f, format="mp3", bitrate="64k")
def speed_pydub(samples, min_speed=0.9, max_speed=1.1):
    """
    Change the playback speed of audio with pydub by a random factor.

    A factor is drawn uniformly from ``[min_speed, max_speed]``. The samples
    are declared to have a frame rate scaled by that factor and are then
    resampled back to the nominal rate, so a factor above 1 shortens the
    audio and a factor below 1 lengthens it (pitch shifts accordingly).
    Only the ratio of the two rates matters, so the true sample rate of the
    audio is not needed.

    The earlier implementation could not run: it built an ``AudioSegment``
    from an array without audio parameters, ignored the random factor and
    called ``astype`` on an ``AudioSegment``.

    :param samples: one-dimensional numpy array of integer PCM samples
        (assumes an itemsize of 1, 2 or 4 bytes -- TODO confirm with callers)
    :param min_speed: lower bound of the factor; do not go below 0.9, lower
        values give poor results
    :param max_speed: upper bound of the factor; do not go above 1.1, higher
        values give poor results
    :return: the speed-changed samples as an array of the input dtype
    """
    import numpy as np

    data_type = samples.dtype
    speed = random.uniform(min_speed, max_speed)

    nominal_rate = 16000
    retimed = pydub.AudioSegment(
        samples.tobytes(),
        sample_width=data_type.itemsize,
        frame_rate=int(nominal_rate * speed),
        channels=1,
    )
    resampled = retimed.set_frame_rate(nominal_rate)
    # np.array copies, so the returned array is writable.
    return np.array(resampled.get_array_of_samples(), dtype=data_type)
def soundcheck_kit(kit_name=None, kit_size=None):
    """Record one soundcheck sample per drum of a kit and save each as MP3.

    For every drum the user records with ``soundcheck_listener``; the most
    frequently hit MIDI note selects the instrument name, and the recording
    is written to ``./soundcheck/<kit_name>_<instrument_name>.mp3``.

    Args:
        kit_name: Name of the kit, used as the file-name prefix.
        kit_size: Number of drums to record.
    """
    for i in range(kit_size):
        print('record samples for drum: {}, press "r" to start recording, "s" to stop and save samples'.format(i))
        audio, midi_notes = soundcheck_listener()
        # get most frequently hit note
        note = int(midi_notes['key'].value_counts().idxmax())
        label = f_to_l_map[note]
        instrument_name = names_l_map[label]
        # Use the ``kit_name`` parameter; the previous code referenced an
        # undefined name ``kitname``.
        sc_audio_filename = './soundcheck/' + kit_name + '_' + instrument_name + '.mp3'
        audio_segment = pydub.AudioSegment(
            audio.tobytes(),
            frame_rate=RATE,
            sample_width=2,
            channels=1
        )
        audio_segment.export(sc_audio_filename, format='mp3')
    print('done')
def import_pcm(song, cur, audio_file, wav_file_path=None, compressed_url=None):
    """Materialise a song's PCM data as a WAV file and a compressed copy.

    If the WAV file does not exist yet, the raw PCM is fetched from the
    ``wavs`` table and written to ``wav_file_path`` (24-bit audio through
    ``wf.write_24b``, everything else through pydub). If the compressed file
    does not exist yet, it is produced from the WAV file in
    ``settings.AUDIO_COMPRESSED_FORMAT``.

    Args:
        song: Mapping with the keys ``songid``, ``stereo``, ``ssizeinbits``
            and ``samplerate``.
        cur: Database cursor used to fetch the PCM data.
        audio_file: Object from which the default paths are derived.
        wav_file_path: Target WAV path; defaults to ``wav_path(audio_file)``.
        compressed_url: Target compressed path; defaults to
            ``audio_path(audio_file, settings.AUDIO_COMPRESSED_FORMAT)``.

    Returns:
        A tuple ``(fs, length)``: the sample rate and the number of frames
        per channel (read back with ``get_wav_info`` when the WAV file
        already existed).
    """
    if wav_file_path is None:
        wav_file_path = wav_path(audio_file)
    if compressed_url is None:
        compressed_url = audio_path(audio_file, settings.AUDIO_COMPRESSED_FORMAT)

    if not os.path.isfile(wav_file_path):
        song_id = song['songid']
        # NOTE(review): the query is built by string formatting; song_id
        # comes from the caller's row, but a parameterised query in the
        # cursor's paramstyle would be safer.
        cur.execute('select wav from wavs where songid={};'.format(song_id))
        row = cur.fetchone()
        raw_pcm = str_to_bytes(row[0])

        nchannels = song['stereo']
        bitrate = int(song['ssizeinbits'])
        fs = int(song['samplerate'])

        byte_per_frame = int(bitrate / 8)
        nframes_all_channel = int(len(raw_pcm) / byte_per_frame)
        nframes_per_channel = int(nframes_all_channel / nchannels)
        length = nframes_per_channel

        ensure_parent_folder_exists(wav_file_path)

        if bitrate == 24:
            array1 = np.frombuffer(raw_pcm, dtype=np.ubyte)
            array2 = array1.reshape((nframes_per_channel, nchannels, byte_per_frame)).astype(np.uint8)
            wf.write_24b(wav_file_path, fs, array2)
        else:
            # Hand the raw bytes to pydub directly. Wrapping them in
            # array.array('i', ...) added nothing (pydub converts arrays back
            # to bytes) and raised ValueError whenever the byte count was not
            # a multiple of the platform's C int size.
            sound = pydub.AudioSegment(data=bytes(raw_pcm), sample_width=byte_per_frame,
                                       frame_rate=fs, channels=nchannels)
            sound.export(wav_file_path, 'wav')
    else:
        fs, length = get_wav_info(wav_file_path)

    if not os.path.isfile(compressed_url):
        ensure_parent_folder_exists(compressed_url)
        sound = pydub.AudioSegment.from_wav(wav_file_path)
        sound.export(compressed_url, format=settings.AUDIO_COMPRESSED_FORMAT)

    return fs, length
def callback(self, in_data, frame_count, time_info, status):
    """Stream callback that records audio while it is above the threshold.

    The RMS of the incoming buffer is compared with ``self.threshold``.
    Recording starts (with an empty buffer list) when the level rises above
    the threshold, and when it falls back ``self.trigger`` is called with the
    buffers collected so far.

    Args:
        in_data: Raw audio bytes of the current buffer.
        frame_count: Unused.
        time_info: Unused.
        status: Unused.

    Returns:
        ``(b'', pyaudio.paContinue)``.
    """
    buffer_segment = pydub.AudioSegment(
        in_data,
        sample_width=self.WIDTH,
        frame_rate=self.RATE,
        channels=self.CHANNELS,
    )
    # Has the level reached the threshold?
    above_threshold = buffer_segment.rms > self.threshold

    if above_threshold and not self.recording:
        # Start recording with a cleared raw-wave buffer.
        self.recording = True
        self.wav_buff = []

    if not above_threshold and self.recording:
        # The level fell back below the threshold: fire the trigger with the
        # recorded data and stop recording.
        self.trigger(self.wav_buff)
        self.recording = False

    if self.recording:
        self.wav_buff.append(in_data)

    return (b'', pyaudio.paContinue)
def start(self):
    """Run the metering loop until a graceful stop is requested.

    Sets ``self.num_frames`` from the segment length, then repeatedly
    advances the ``self.record()`` generator and passes the RMS and dBFS of
    the recorded data to ``self.meter``. ``self.stop()`` is called both on
    normal termination and when ``StopException`` is raised.
    """
    segment_length = self.segment_length or AUDIO_SEGMENT_LENGTH
    self.num_frames = int(RATE / FRAMES_PER_BUFFER * segment_length)

    try:
        self.is_running = True
        recorder = self.record()
        while not self._graceful:
            # Each step of the generator records one segment into self.output.
            next(recorder)
            recorded = pydub.AudioSegment(self.output.getvalue())
            level_rms = recorded.rms
            level_dbfs = recorded.dBFS
            self.meter(level_rms, level_dbfs)
        self.is_running = False
        self.stop()
    except self.__class__.StopException:
        self.is_running = False
        self.stop()
def tts_sdk(text, **kwargs):
    """Speech synthesis for long text, including a simple sentence splitter.

    The text is split into pieces of at most ``maxlen`` (default 30), each
    piece is synthesized with ``tts_sdk_base`` and the results are joined
    with 300 ms of silence before, between and after them.

    Args:
        text: Text to synthesize.
        **kwargs: Forwarded to ``tts_sdk_base``; ``maxlen`` and
            ``sampling_rate`` (default 22050) are also read here.

    Returns:
        The bytes of the combined audio exported as WAV.
    """
    pieces = split_text(text, kwargs.get('maxlen', 30))

    synthesized = []
    for piece in pieces:
        logger.info(f'Synthesizing: {piece}')
        synthesized.append(tts_sdk_base(piece, **kwargs))

    pause = pydub.AudioSegment.silent(
        300, frame_rate=kwargs.get('sampling_rate', 22050))

    combined = pause
    for wav_bytes in synthesized:
        combined = combined + pydub.AudioSegment(wav_bytes) + pause

    buffer = io.BytesIO()
    combined.export(buffer, format='wav')
    return buffer.getvalue()
def create_media():
    """Store the uploaded audio in the current user's upload directory.

    The request body is loaded with pydub and exported to
    ``<UPLOAD_FOLDER>/<github login>/<timestamp>.mp3``. The export is called
    without a format, so pydub's default applies.
    NOTE(review): the constructor is given no audio parameters, so the body
    is presumably expected to be WAV data -- confirm with the client.

    Returns:
        A JSON response with ``response`` set to ``"success"`` and the
        ``uri`` of the stored file.
    """
    data = request.data
    filename = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S.mp3")
    user = github.get("user")
    login = user.data['login']
    # NOTE(review): the login becomes a path component -- verify it cannot
    # contain path separators.
    userdir = os.path.join(app.config['UPLOAD_FOLDER'], login)
    # exist_ok avoids the check-then-create race between concurrent uploads
    # by the same user.
    os.makedirs(userdir, exist_ok=True)

    audio = pydub.AudioSegment(io.BytesIO(data))
    audio.export(os.path.join(userdir, filename))
    return jsonify({
        "response": "success",
        "uri": "{}/media/{}/{}".format(request.base_url, login, filename)
    })
def start(self, final_callback=None):
    """Publish the threshold state and run the metering loop.

    Stores ``final_callback``, publishes a ``ThresholdResponseMessage`` and
    sets ``self.num_frames`` from the segment length. It then repeatedly
    advances the ``self.record()`` generator, measures the RMS of the
    recorded data and feeds it to the collector, meter, trigger action and
    monitor. ``self.stop()`` is called both on normal termination and when
    ``StopException`` is raised.

    Args:
        final_callback: Stored on ``self.final_callback``; not called here.
    """
    self.final_callback = final_callback
    response = ThresholdResponseMessage(True, self.threshold, self.num)
    # TODO: update topic - get rid of audioinput
    self.publish("audioinput/threshold/response", response.to_json())

    cfg = self.config
    segment_length = self.segment or cfg.AUDIO_SEGMENT_LENGTH
    self.num_frames = int(cfg.RATE / cfg.FRAMES_PER_BUFFER * segment_length)

    if self.seconds:
        signal.setitimer(signal.ITIMER_REAL, self.seconds)
    if self.verbose:
        self._timer = time.time()
    if self.collect:
        print('Collecting RMS values...')
    if self.action:
        # Interpret threshold
        self.get_threshold()

    try:
        self.is_running = True
        recorder = self.record()
        while not self._graceful:
            # Each send() records one segment into self.output.
            recorder.send(True)
            rms = pydub.AudioSegment(self.output.getvalue()).rms
            if self.collect:
                self.collect_rms(rms)
            self.meter(rms)
            if self.action and self.is_triggered(rms):
                self.execute(rms)
            self.monitor(rms)
        self.is_running = False
        self.stop()
    except self.__class__.StopException:
        self.is_running = False
        self.stop()
def process_audio(frame: av.AudioFrame) -> av.AudioFrame:
    """Return a copy of ``frame`` with ``gain`` applied to its samples.

    The frame is converted to a ``pydub.AudioSegment``, amplified, split into
    mono channels and reassembled into an array of the original shape.
    ``gain`` is read from the enclosing scope.

    Args:
        frame: Input audio frame.

    Returns:
        A new frame with the same layout and sample rate as ``frame``.
    """
    original = frame.to_ndarray()
    amplified = pydub.AudioSegment(
        data=original.tobytes(),
        sample_width=frame.format.bytes,
        frame_rate=frame.sample_rate,
        channels=len(frame.layout.channels),
    ).apply_gain(gain)

    # Ref: https://github.com/jiaaro/pydub/blob/master/API.markdown#audiosegmentget_array_of_samples  # noqa
    per_channel = [mono.get_array_of_samples() for mono in amplified.split_to_mono()]
    restored: np.ndarray = np.array(per_channel).T
    restored = restored.reshape(original.shape)

    out_frame = av.AudioFrame.from_ndarray(restored, layout=frame.layout.name)
    out_frame.sample_rate = frame.sample_rate
    return out_frame
def apply_pydub(self, samples, sample_rate):
    """Degrade audio by a round trip through compressed encoding with pydub.

    The float samples are converted to int16, exported to a temporary
    ``.mp3`` file at the bitrate in ``self.parameters["bitrate"]`` and loaded
    back with librosa at ``sample_rate``. The temporary file is removed even
    if exporting or loading fails.

    Args:
        samples: One-dimensional float32 numpy array.
        sample_rate: Sample rate of ``samples``.

    Returns:
        The degraded samples as loaded by ``librosa.load``.

    Raises:
        ImportError: If pydub is not installed.
    """
    try:
        import pydub
    except ImportError:
        print(
            "Failed to import pydub. Maybe it is not installed? "
            "To install the optional pydub dependency of audiomentations,"
            " do `pip install audiomentations[extras]` instead of"
            " `pip install audiomentations`",
            file=sys.stderr,
        )
        raise

    assert len(samples.shape) == 1
    assert samples.dtype == np.float32

    int_samples = convert_float_samples_to_int16(samples)

    audio_segment = pydub.AudioSegment(
        int_samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=int_samples.dtype.itemsize,
        channels=1,
    )

    tmp_dir = tempfile.gettempdir()
    tmp_file_path = os.path.join(
        tmp_dir, "tmp_compressed_{}.mp3".format(str(uuid.uuid4())[0:12])
    )

    bitrate_string = "{}k".format(self.parameters["bitrate"])
    try:
        file_handle = audio_segment.export(tmp_file_path, bitrate=bitrate_string)
        file_handle.close()
        # Pass the sample rate by keyword: newer librosa releases accept it
        # only as a keyword argument, older ones accept both forms.
        degraded_samples, _ = librosa.load(tmp_file_path, sr=sample_rate)
    finally:
        # Always remove the temporary file; it may not exist if the export
        # failed before creating it.
        try:
            os.unlink(tmp_file_path)
        except FileNotFoundError:
            pass

    return degraded_samples
def save_audio(array, filename, sample_rate, dtype=np.int16, format=None):
    """Save a one-dimensional array as a mono audio file.

    Args:
        array: One-dimensional array of samples.
        filename: Destination path. When ``format`` is None the text after
            the last "." of the file's base name selects the format.
        sample_rate: Frame rate of the audio.
        dtype: Integer dtype the samples are converted to before export; its
            item size becomes the sample width.
        format: Output format; inferred from ``filename`` when None.

    Raises:
        TypeError: If ``dtype`` is not one of the allowed integer dtypes or
            ``array`` is not one-dimensional.
        ValueError: If ``format`` is None and the file name has no ".".
    """
    dtype = np.dtype(dtype)
    allowed_dtypes = [np.int8, np.int16, np.int32, np.int64]
    if dtype not in allowed_dtypes:
        raise TypeError("The dtype must be one of " + str(allowed_dtypes))

    if np.ndim(array) != 1:
        raise TypeError("Saving multi-channel audio is not supported!")

    if format is None:
        _, dot, extension = os.path.basename(filename).rpartition('.')
        if not dot:
            raise ValueError("Can not infer output format from the filename!")
        format = extension

    pcm = convert_dtype(array, dtype)
    pydub.AudioSegment(
        pcm.tobytes(),
        sample_width=dtype.itemsize,
        channels=1,
        frame_rate=sample_rate,
    ).export(filename, format)
def _build_audio(audio_file_id: int, applied_length: Optional[int] = None, volume: int = 255):
    """Load an audio file from Valar and prepare it for playback.

    The stored data is interpreted as mono, 16-bit audio at 44.1 kHz. The
    clip is repeated and truncated to ``applied_length`` when one is given,
    and its gain is set from ``volume`` using the configured audio floor and
    ceiling. A volume or applied length of zero yields a short silent clip
    instead.

    Args:
        audio_file_id: ID of the row in ``AudioFiles``.
        applied_length: Desired length, in the units of ``len(song)``; None
            keeps the clip as stored.
        volume: Volume from 0 to 255.

    Returns:
        An ``AudioInfo`` holding the ``sa.WaveObject``, the applied length
        and the volume.

    Raises:
        BadWriteError: If ``applied_length`` is negative or ``volume`` is
            outside 0-255.
        UnrecognizedKeyError: If no audio file has the given ID.
        AssertionError: If the clip length disagrees with the stored
            ``n_seconds``.
    """
    if applied_length is not None and applied_length < 0:
        raise BadWriteError(f"File {audio_file_id}: length {applied_length} < 0")
    if volume < 0 or volume > 255:
        raise BadWriteError(f"The volume is {volume} but must be 0–255")
    import valarpy.model as model

    valar_obj = model.AudioFiles.select().where(model.AudioFiles.id == audio_file_id).first()
    if valar_obj is None:
        # A stray "." after the f-string previously made this line (and so
        # the whole module) a syntax error.
        raise UnrecognizedKeyError(f"No audio file with ID {audio_file_id}")
    song = pydub.AudioSegment(data=valar_obj.data, sample_width=2, frame_rate=44100, channels=1)

    # Valar stores seconds; scale to match len(song).
    n_sec_valar = valar_obj.n_seconds * 1000
    length_delta = abs(len(song) - n_sec_valar)
    if length_delta > 0.00001:
        raise AssertionError(f"File {audio_file_id} is {len(song)}, but Valar says it’s {n_sec_valar}")

    if applied_length is None:
        resized = song
    else:
        # Repeat the clip often enough to cover the requested length, then cut.
        n_repeats = math.ceil(applied_length / len(song))
        resized = (song * n_repeats)[0:applied_length]

    if volume == 0 or applied_length == 0:
        final = pydub.AudioSegment.silent(duration=0.5)
    else:
        # noinspection PyTypeChecker
        volume_floor = config.get_float("sauron.hardware.stimuli.audio.audio_floor")
        volume_ceil = config.get_float("sauron.hardware.stimuli.audio.audio_ceil")
        # Adding a number to a segment applies gain: the volume is mapped
        # linearly from 0-255 onto [volume_floor, volume_ceil].
        final = resized + volume * (volume_ceil - volume_floor) / 255 + volume_floor

    play_obj = sa.WaveObject(final.raw_data, 1, 2, 44100)
    return AudioInfo(play_obj, applied_length, volume)