def spawn(self, data, overrides: dict = {}):  # just make pydub and PEP8 happy :P
    if isinstance(data, list):
        data = array(get_array_type(self.sample_width * 8), data)
    if isinstance(data, np.ndarray):
        data = Audio.get_flatten_samples(data)
        data = stretch_samples(data, self.sample_width).tolist()
        data = array(get_array_type(self.sample_width * 8), data)
    return self._spawn(data, overrides)
def createSpectrogram(sceneAudioSegment, freqResolution, timeResolution,
                      windowLength, windowOverlap):
    highestFreq = sceneAudioSegment.frame_rate / 2
    height = highestFreq // freqResolution
    width = sceneAudioSegment.duration_seconds * 1000 // timeResolution

    # Set figure settings to remove all axes
    spectrogram = plt.figure(frameon=False)
    spectrogram.set_size_inches(width / 100, height / 100)
    ax = plt.Axes(spectrogram, [0., 0., 1., 1.])
    ax.set_axis_off()
    spectrogram.add_axes(ax)

    # Generate the spectrogram
    # See https://matplotlib.org/api/_as_gen/matplotlib.pyplot.specgram.html?highlight=matplotlib%20pyplot%20specgram#matplotlib.pyplot.specgram
    Pxx, freqs, bins, im = ax.specgram(
        x=np.frombuffer(sceneAudioSegment._data,
                        dtype=get_array_type(8 * sceneAudioSegment.frame_width)),
        Fs=sceneAudioSegment.frame_rate,
        window=matplotlib.mlab.window_hanning,
        NFFT=windowLength,
        noverlap=windowOverlap,
        scale='dB')

    return spectrogram
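# Usage sketch (the file name and parameter values below are assumptions, not
# from the original). Note that frame_width equals sample_width only for mono
# audio, so the dtype computed above decodes the buffer correctly only for
# single-channel segments; downmixing first keeps the two equal.
from pydub import AudioSegment

seg = AudioSegment.from_file("scene.wav").set_channels(1)  # hypothetical file
fig = createSpectrogram(seg, freqResolution=50, timeResolution=10,
                        windowLength=512, windowOverlap=256)
fig.savefig("scene_spectrogram.png")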
def split_song(self, song):
    mydict = []
    convers = []
    for i in range(3000, len(song) + 3000, 3000):
        # print i
        try:
            splitting = song[i - 3000:i]  # first three seconds
            bit_depth = splitting.sample_width * self.BIT_PRECISION
            # print splitting.frame_rate
            array_type = get_array_type(bit_depth)
            print(len(splitting._data))
            print(array_type)
            numeric_array = array.array(array_type, splitting._data)
            numeric_array = numeric_array.tolist()
            # print(splitting.frame_rate)
            features = self.extract_features2(self.sampling_rate,
                                              np.asarray(numeric_array))[0]
            features_transformed = (features - self.mean_train) / self.sd_train
            convers.append(features_transformed)
        except:
            continue
        # if len(convers) == 3:
        #     prediction = self.my_attention_network.predict(np.array([convers]))[0]
        #     print prediction
        #     mydict.append({"Anger": prediction[0], "Disgust": prediction[1],
        #                    "Fear": prediction[3], "Happiness": prediction[5],
        #                    "Neutral": prediction[6], "Sadness": prediction[2],
        #                    "Surprise": prediction[4]})
        #     convers.pop(0)
    return convers
def split_song_get_emotion(self, song, len_sequence, byte_depth):
    mydict = []
    convers = []
    increment = len(song) // len_sequence  # integer division: range() needs ints
    for i in range(increment, len(song) + increment, increment):
        # print i
        splitting = song[i - increment:i]  # the current increment-long window
        bit_depth = splitting.sample_width * byte_depth
        # print splitting.frame_rate
        array_type = get_array_type(bit_depth)
        numeric_array = array.array(array_type, splitting._data)
        numeric_array = numeric_array.tolist()
        features = self.extract_features3(splitting.frame_rate,
                                          np.asarray(numeric_array))[0]
        features_transformed = (features - self.mean_train) / self.sd_train
        convers.append(features_transformed)
        # print len(convers)
        if len(convers) == len_sequence:
            prediction = self.my_attention_network.predict(np.array([convers]))[0]
            # print prediction
            mydict.append({"Anger": prediction[0], "Disgust": prediction[1],
                           "Fear": prediction[3], "Happiness": prediction[5],
                           "Neutral": prediction[6], "Sadness": prediction[2],
                           "Surprise": prediction[4]})
            convers.pop(0)
    data_frame_emotions = pd.DataFrame.from_dict(mydict)
    return data_frame_emotions
def split_song_get_features(self, song):
    mydict = []
    convers = []
    # increment = 3000
    # if len(song) < 9000:
    increment = int(float(len(song)) / 3)
    for i in range(increment, len(song) + increment, increment):
        # print i
        splitting = song[i - increment:i]  # the current increment-long window
        bit_depth = splitting.sample_width * 8
        # print splitting.frame_rate
        array_type = get_array_type(bit_depth)
        numeric_array = array.array(array_type, splitting._data)
        numeric_array = numeric_array.tolist()
        features = self.extract_features2(splitting.frame_rate,
                                          np.asarray(numeric_array))[0]
        features_transformed = (features - self.mean_train) / self.sd_train
        convers.append(features_transformed)
    return convers
def frequency_spectrum(sample, max_frequency=800):
    # Convert pydub.AudioSegment to raw audio data
    bit_depth = sample.sample_width * 8
    array_type = get_array_type(bit_depth)
    raw_audio_data = array.array(array_type, sample._data)
    n = len(raw_audio_data)

    # Compute FFT and frequency value for each index in FFT array
    freq_array = np.arange(n) * (float(sample.frame_rate) / n)  # two sides frequency range
    freq_array = freq_array[:(n // 2)]  # one side frequency range

    raw_audio_data = raw_audio_data - np.average(raw_audio_data)  # zero-centering
    freq_magnitude = np.fft.fft(raw_audio_data)  # fft computing
    freq_magnitude = freq_magnitude[:(n // 2)]  # one side

    if max_frequency:
        max_index = int(max_frequency * n / sample.frame_rate) + 1
        freq_array = freq_array[:max_index]
        freq_magnitude = freq_magnitude[:max_index]

    freq_magnitude = abs(freq_magnitude)
    freq_magnitude = freq_magnitude / np.sum(freq_magnitude)
    return freq_array, freq_magnitude
def frequency_spectrum(sample, max_frequency=800):
    """
    Derive the frequency spectrum of a signal (a pydub.AudioSegment).

    Returns an array of frequencies and an array of how prevalent each
    frequency is in the sample.
    """
    # Convert pydub.AudioSegment to raw audio data
    # Copied from Jiaaro's answer on https://stackoverflow.com/questions/32373996/pydub-raw-audio-data
    bit_depth = sample.sample_width * 8
    array_type = get_array_type(bit_depth)
    raw_audio_data = array.array(array_type, sample._data)
    n = len(raw_audio_data)

    # Compute FFT and frequency value for each index in FFT array
    # Inspired by Reveille's answer on https://stackoverflow.com/questions/53308674/audio-frequencies-in-python
    freq_array = np.arange(n) * (float(sample.frame_rate) / n)  # two sides frequency range
    freq_array = freq_array[:(n // 2)]  # one side frequency range

    raw_audio_data = raw_audio_data - np.average(raw_audio_data)  # zero-centering
    freq_magnitude = scipy.fft.fft(raw_audio_data)  # fft computing
    freq_magnitude = freq_magnitude[:(n // 2)]  # one side

    if max_frequency:
        max_index = int(max_frequency * n / sample.frame_rate) + 1
        freq_array = freq_array[:max_index]
        freq_magnitude = freq_magnitude[:max_index]

    freq_magnitude = abs(freq_magnitude)
    freq_magnitude = freq_magnitude / np.sum(freq_magnitude)
    return freq_array, freq_magnitude
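# Usage sketch (the input file and the plotting code are assumptions, not part
# of the original): plot the one-sided, normalised magnitude spectrum of a
# mono clip. Downmixing to mono matters because array.array over a stereo
# buffer would interleave the two channels.
import matplotlib.pyplot as plt
from pydub import AudioSegment

clip = AudioSegment.from_file("tone.wav").set_channels(1)  # hypothetical file
freqs, magnitudes = frequency_spectrum(clip, max_frequency=800)
plt.plot(freqs, magnitudes)
plt.xlabel("Frequency (Hz)")
plt.ylabel("Relative magnitude")
plt.show()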
def configure(self, sample_rate, buffer_size):
    self.mono = self.sound.split_to_mono()[0]
    self.sample_rate = sample_rate
    self.mono = self.mono.set_frame_rate(sample_rate)
    self.buffer_size = buffer_size
    bit_depth = self.mono.sample_width * 8
    array_type = get_array_type(bit_depth)
    self.numeric_array = np.array(array.array(array_type, self.mono._data))
def getMixedChannels(sound):
    # Combines the two channels of a loaded song into a single array
    left, right = sound.split_to_mono()
    bit_depth = left.sample_width * 8
    array_type = get_array_type(bit_depth)
    signalL = array.array(array_type, left._data)
    signalR = array.array(array_type, right._data)
    mix = [signalL[i] + signalR[i] for i in range(len(signalL))]
    return mix
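# Note (not from the original): summing raw samples can exceed the valid
# sample range, so the mix may clip when written back out. pydub can also
# downmix natively; set_channels(1) averages the two channels instead:
from pydub import AudioSegment

stereo = AudioSegment.from_file("song.mp3")  # hypothetical file
mono = stereo.set_channels(1)  # averages left and right, avoiding overflow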
def __init__(self, file):
    self.filename = os.path.splitext(os.path.basename(file))[0]
    sound = AudioSegment.from_file(
        file=file, format=file.split('.')[-1]).set_channels(1)  # [-1] so paths containing extra dots still yield the extension
    self.sound_raw = np.frombuffer(
        sound._data,
        dtype=get_array_type(sound.sample_width * 8)).astype(np.float64,
                                                             copy=False)
    self.sound_raw.setflags(write=1)
    self.raw_length = len(self.sound_raw)
    self.raw_increment = int(
        MS_INCREMENT * (len(self.sound_raw) / sound.duration_seconds / 1000))
    self.sample_rate = sound.frame_rate
    self.mpm = Mpm()
def generate_random_noise(duration, gain, frame_width, sample_rate):
    bit_depth = 8 * frame_width
    minval, maxval = get_min_max_value(bit_depth)
    sample_width = get_frame_width(bit_depth)
    array_type = get_array_type(bit_depth)

    gain = db_to_float(gain)
    sample_count = int(sample_rate * (duration / 1000.0))

    data = ((np.random.rand(sample_count, 1) * 2) - 1.0) * maxval * gain

    return AudioSegment(data=data.astype(array_type).tobytes(),
                        metadata={
                            "channels": 1,
                            "sample_width": sample_width,
                            "frame_rate": sample_rate,
                            "frame_width": sample_width,
                        })
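# Usage sketch (parameter values are assumptions): two seconds of white noise
# at -12 dBFS, 16-bit (frame_width=2), 44.1 kHz, exported as a WAV file.
noise = generate_random_noise(duration=2000, gain=-12.0,
                              frame_width=2, sample_rate=44100)
noise.export("noise.wav", format="wav")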
def split_song2(self, song, padding_length):
    bit_depth = song.sample_width * self.BIT_PRECISION
    array_type = get_array_type(bit_depth)
    numeric_array = array.array(array_type, song._data)
    numeric_array = numeric_array.tolist()
    features = self.extract_features3(song.frame_rate, np.asarray(numeric_array))
    # print("$$")
    # print(len(features))
    while len(features) < padding_length:
        features = np.append(features, np.zeros((1, 34)))  # padding
    if len(features) > padding_length:
        features = features[len(features) - padding_length:len(features)]
    # print(np.shape(features))
    print(len(features))
    # print("$$")
    return features
def mp3preprocess(path):
    try:
        print(path)
        sound = AudioSegment.from_file(file=path)
        mono = sound.split_to_mono()[0]  # TODO maybe concat both sides to have more mono data
        mono = mono.set_frame_rate(SAMPLE_RATE)
        bit_depth = mono.sample_width * 8
        array_type = get_array_type(bit_depth)
        numeric_array = np.array(array.array(array_type, mono._data))
        remainder = len(numeric_array) % NSAMPLES
        if remainder:  # guard: "[:-0]" would otherwise empty the array
            numeric_array = numeric_array[:-remainder]
        frames = np.array_split(numeric_array, len(numeric_array) // NSAMPLES)
        frames = map(partial(np.fft.fft, norm="ortho"), frames)
        frames = [f[:NSAMPLES // 2 + 1] for f in frames]
        frames = map(np.absolute, frames)
        frames = [f / NSAMPLES for f in frames]
        return np.array(list(frames))
    except:
        print("Error " + path)
        return None
def create_folder_raw_array(folder):
    folder_files = listdir(os.path.join(CUT_DIR, folder))
    folder_files = natural_sort(folder_files)
    create_directory(os.path.join(UNFILTERED_PATH))
    array_file = h5py.File(os.path.join(UNFILTERED_PATH, folder + '.hdf5'), 'w')
    number_of_images = len(folder_files)
    all_data = []
    for i, file in enumerate(folder_files):
        # print progress
        print('\rFolder: %s %d/%d\r' % (folder, i, number_of_images))
        # read in a wav file
        data = AudioSegment.from_file(os.path.join(CUT_DIR, folder, file), format='wav')
        bit_depth = data.sample_width * 8
        array_type = get_array_type(bit_depth)
        numeric_array = array.array(array_type, data._data)
        all_data.append(numeric_array)
    amplitudes = array_file.create_dataset('waveform', data=all_data, dtype='i')
    folder_index = get_folder_class_index(CUT_DIR, folder)
    if is_random_forest:
        # PY3 unicode
        dt = h5py.special_dtype(vlen=str)
        array_file.create_dataset('labels',
                                  data=np.transpose([folder.encode('utf8')] * len(all_data)),
                                  dtype=dt)
    else:
        array_file.create_dataset('labels',
                                  data=np.transpose([folder_index] * len(all_data)),
                                  dtype='i')
def split_single_song(self, song, splits):
    mydict = []
    convers = []
    # increment = 3000
    # if len(song) < 9000:
    increment = int(float(len(song)) / splits)
    for i in range(increment, len(song) + increment, increment):
        # print i
        splitting = song[i - increment:i]  # the current increment-long window
        bit_depth = splitting.sample_width * 8
        # print splitting.frame_rate
        array_type = get_array_type(bit_depth)
        numeric_array = array.array(array_type, splitting._data)
        numeric_array = numeric_array.tolist()
        features = self.extract_features2(splitting.frame_rate,
                                          np.asarray(numeric_array))[0]
        features_transformed = (features - self.mean_train) / self.sd_train
        convers.append(features_transformed)
        if len(convers) == 3:
            prediction = self.my_attention_network.predict(np.array([convers]))[0]
            mydict.append({
                "Anger": prediction[0],
                "Disgust": prediction[1],
                "Fear": prediction[3],
                "Happiness": prediction[5],
                "Neutral": prediction[6],
                "Sadness": prediction[2],
                "Surprise": prediction[4]
            })
            # convers.pop(0)
    data_frame_emotions = pd.DataFrame.from_dict(mydict)
    return data_frame_emotions
def load_sample(file_path, frame_rate_output_Hz=None):
    """
    :param file_path: Full path to the audio file.
    :param frame_rate_output_Hz: Change the frame rate of the audio. Keep the original if None.
    :return: Normalised raw waveform in [-1, 1].
    """
    audio_seg = AudioSegment.from_file(file_path)
    bit_depth = audio_seg.sample_width * 8
    array_type = get_array_type(bit_depth)

    if frame_rate_output_Hz is not None and frame_rate_output_Hz != audio_seg.frame_rate:
        audio_seg = audio_seg.set_frame_rate(frame_rate_output_Hz)

    raw = np.array(array.array(array_type, audio_seg.raw_data), dtype=np.float64)
    raw /= math.pow(2, bit_depth) / 2  # divide by the largest representable positive/negative value
    return raw
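# Usage sketch (the file name and target rate are assumptions): load a clip
# resampled to 16 kHz and check that the waveform is indeed normalised.
waveform = load_sample("speech.wav", frame_rate_output_Hz=16000)
assert waveform.min() >= -1.0 and waveform.max() <= 1.0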
def __init__(self, path, pitch_detector=None, plotter=None, ms_increment=100):
    self.filename = os.path.splitext(os.path.basename(path))[0]
    sound = AudioSegment.from_file(file=path, format="wav").set_channels(1)
    self.sound_raw = numpy.frombuffer(
        sound._data,
        dtype=get_array_type(sound.sample_width * 8)).astype(numpy.float64,
                                                             copy=False)
    self.sound_raw.setflags(write=1)
    self.raw_length = len(self.sound_raw)
    self.ms_increment = ms_increment
    self.raw_increment = int(
        self.ms_increment * (len(self.sound_raw) / sound.duration_seconds / 1000))
    self.sample_rate = sound.frame_rate
    self.pitch_detector = pitch_detector
    self.plotter = plotter
import array

from pydub import AudioSegment
from pydub.utils import get_array_type

sound = AudioSegment.from_file(file="a.mp3")
left = sound.split_to_mono()[0]
bit_depth = left.sample_width * 8
array_type = get_array_type(bit_depth)
numeric_array = array.array(array_type, left._data)
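# Round-trip sketch (not in the original snippet): after processing the
# samples, an array.array can be handed back to pydub via _spawn to build a
# new AudioSegment with the same audio parameters.
processed = array.array(left.array_type, numeric_array)
new_left = left._spawn(processed)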
def getNumbericArray(sound_clip):
    bit_depth = sound_clip.sample_width * 8
    array_type = get_array_type(bit_depth)
    numeric_array = array.array(array_type, sound_clip._data)
    return numeric_array
def cleanPredictions(array_type, pred):
    # clamp predictions to the valid 16-bit signed range
    if array_type == 'h':
        for i, x in enumerate(pred):
            if x < -32768:
                pred[i] = -32768
            elif x > 32767:
                pred[i] = 32767


# encoder matters!!
# load raw audio data
audio_X = AudioSegment.from_mp3("lq_spaceoddity.mp3")
audio_y = AudioSegment.from_mp3("hq_spaceoddity.mp3")
audio_Z = AudioSegment.from_mp3("lq_cc.mp3")
array_type = utils.get_array_type(audio_X.sample_width * 8)

# get the sample array and turn it into a numpy array
X = np.array(audio_X.get_array_of_samples())
y = np.array(audio_y.get_array_of_samples())

# make 2d
X = np.reshape(X, (-1, 1))
y = np.reshape(y, (-1, 1))

# split the data 60/40
split = np.round(X.size * .6).astype(int)  # we need to split so that we know how to reassemble
X_train = X[0:split]
X_test = X[split:]
y_train = y[0:split]
y_test = y[split:]
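# Sketch of where this split presumably leads (the model is an assumption; the
# original does not show one): fit a per-sample mapping from the low-quality
# signal to the high-quality one, then clamp predictions before re-encoding.
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X_train, y_train.ravel())
pred = model.predict(X_test)
cleanPredictions(array_type, pred)  # keep values inside the 16-bit range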
#!/usr/bin/env python3
import sys

import numpy
from pydub import AudioSegment
from pydub.utils import get_array_type

if __name__ == '__main__':
    try:
        inputfile = sys.argv[1]
        outputfile = sys.argv[2]
    except IndexError:
        print('usage: get_raw_audio.py infile.ext outfile.txt', file=sys.stderr)
        sys.exit(1)

    sound = AudioSegment.from_file(
        file=inputfile, format=inputfile.split('.')[-1]).set_channels(1)
    sound_raw = numpy.frombuffer(sound._data,
                                 dtype=get_array_type(sound.sample_width * 8))
    print('sample rate: {0}'.format(sound.frame_rate), file=sys.stderr)

    with open(outputfile, 'w') as fwrite:
        for sample in sound_raw:
            fwrite.write('{0}\n'.format(sample))
def _split_recording(self, segments: pd.DataFrame) -> list:
    # From raw sound data and the sampling rate, build the spectrogram as a
    # matplotlib figure and return it.
    def _create_spectrogram(data, sr):
        snd = Sound(data, sampling_frequency=sr)
        # These parameters were chosen to output a spectrogram useful for
        # zooniverse applications (short sounds from babies); we did not feel
        # the need to make them configurable.
        spectrogram = snd.to_spectrogram(
            window_length=0.0075,
            maximum_frequency=8000,
            time_step=0.0001,
            frequency_step=0.1,
            window_shape=SpectralAnalysisWindowShape.GAUSSIAN)
        # Size of the image; we chose 1200x675 pixels for a better display on zooniverse.
        fig = plt.figure(figsize=(12, 6.75))
        # Two plots (spectrogram 3x bigger than oscillogram).
        gs = fig.add_gridspec(2, hspace=0, height_ratios=[1, 3])
        axs = gs.subplots(sharex=True)

        # spectrogram plot
        dynamic_range = 65
        X, Y = spectrogram.x_grid(), spectrogram.y_grid()
        sg_db = 10 * log10(spectrogram.values)
        axs[1].pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range, cmap='Greys')
        axs[1].set_ylim([spectrogram.ymin, spectrogram.ymax])
        axs[1].set_xlabel("time [s]")
        axs[1].set_ylabel("frequency [Hz]")
        axs[1].tick_params(labelright=True)
        axs[1].set_xlim([snd.xmin, snd.xmax])

        # oscillogram plot
        axs[0].plot(snd.xs(), snd.values.T, linewidth=0.5)
        axs[0].set_xlim([snd.xmin, snd.xmax])
        axs[0].set_ylabel("amplitude")

        # remove overlapping labels
        ticks = axs[0].yaxis.get_major_ticks()
        if len(ticks):
            ticks[0].label1.set_visible(False)
        if len(ticks) > 1:
            ticks[1].label1.set_visible(False)

        fig.tight_layout()
        return fig

    segments = segments.to_dict(orient="records")
    chunks = []

    recording = segments[0]["recording_filename"]
    source = self.project.get_recording_path(recording, self.profile)
    audio = AudioSegment.from_file(source)

    print("extracting chunks from {}...".format(source))

    for segment in segments:
        original_onset = int(segment["segment_onset"])
        original_offset = int(segment["segment_offset"])
        onset = original_onset
        offset = original_offset

        if self.chunks_length > 0:
            onset, offset = pad_interval(onset, offset, self.chunks_length,
                                         self.chunks_min_amount)

            if onset < 0:
                print("skipping chunk with negative onset ({})".format(onset))
                continue

            intervals = [(a, a + self.chunks_length)
                         for a in range(onset, offset, self.chunks_length)]
        else:
            intervals = [(onset, offset)]

        for (onset, offset) in intervals:
            chunk = Chunk(
                segment["recording_filename"],
                onset,
                offset,
                original_onset,
                original_offset,
            )
            chunk_audio = audio[chunk.onset:chunk.offset].fade_in(10).fade_out(10)

            wav = os.path.join(self.destination, "chunks", chunk.getbasename("wav"))
            mp3 = os.path.join(self.destination, "chunks", chunk.getbasename("mp3"))

            if os.path.exists(wav) and os.path.getsize(wav) > 0:
                print("{} already exists, exportation skipped.".format(wav))
            else:
                chunk_audio.export(wav, format="wav")

            if os.path.exists(mp3) and os.path.getsize(mp3) > 0:
                print("{} already exists, exportation skipped.".format(mp3))
            else:
                chunk_audio.export(mp3, format="mp3")

            if self.spectro:
                png = os.path.join(self.destination, "chunks", chunk.getbasename("png"))
                # Convert pydub sound data into raw data that the parselmouth
                # library can use.
                bit_depth = chunk_audio.sample_width * 8
                array_type = get_array_type(bit_depth)
                sound = array.array(array_type, chunk_audio._data)
                sr = chunk_audio.frame_rate
                fig = _create_spectrogram(sound, sr)  # create the plot figure

                if os.path.exists(png) and os.path.getsize(png) > 0:
                    print("{} already exists, exportation skipped.".format(png))
                else:
                    fig.savefig(png)
                plt.close(fig)

            chunks.append(chunk)

    return chunks
def getAudioData(name):
    '''
    sound._data is a bytestring. I'm not sure what input Mpm expects, but you
    may need to convert the bytestring to an array like so:
    '''
    sound = AudioSegment.from_mp3(retrieveBeat[name])

    bytes_per_sample = sound.sample_width  # 1 means 8 bit, 2 means 16 bit
    print("BYTES PER SAMPLE: ")
    print(bytes_per_sample)

    bit_depth = sound.sample_width * 8

    frame_rate = sound.frame_rate
    print("FRAME RATE IS: " + str(frame_rate))

    number_of_frames_in_sound = sound.frame_count()
    number_of_frames_in_sound_200ms = sound.frame_count(ms=200)
    print("NUMBER OF FRAMES IS " + str(number_of_frames_in_sound))
    print("NUMBER OF FRAMES IN SOUND PER 200 MS: " + str(number_of_frames_in_sound_200ms))

    array_type = get_array_type(bit_depth)
    print(array_type)
    numeric_array = array.array(array_type, sound.raw_data)

    channel_count = sound.channels
    print("Number of channels in the audio is: ")
    print(channel_count)

    # audio get array of samples
    samples = sound.get_array_of_samples()
    print("SAMPLES ARE")
    print(len(samples))

    left_sound, right_sound = sound.split_to_mono()  # split it
    print("FRAMES IN LEFT SOUND " + str(left_sound.frame_count()))
    print("FRAMES IN RIGHT SOUND " + str(right_sound.frame_count()))
    print("Length of sample left: " + str(len(left_sound.get_array_of_samples())))
    print("Length of sample right: " + str(len(right_sound.get_array_of_samples())))

    # number_of_frames_in_sound_for_every_20s = sound.frame_count(ms=20000)
    # print("length of song is: " + str(len(samples)/number_of_frames_in_sound_for_every_20s * 20) + " seconds")

    '''
    COLLECTED DATA:
    BYTES PER SAMPLE:
    2
    FRAME RATE IS: 48000
    NUMBER OF FRAMES IS 7688495.0
    NUMBER OF FRAMES IN SOUND PER 200 MS: 9600.0
    h
    Number of channels in the audio is:
    2
    SAMPLES ARE
    15376990
    FRAMES IN LEFT SOUND 7688495.0
    FRAMES IN RIGHT SOUND 7688495.0
    Length of sample left: 7688495
    Length of sample right: 7688495
    15376990
    '''

    counter = 0
    for i in range(0, len(samples) - 1):
        if samples[i] + counter < 10000:
            samples[i] += counter
        if counter < 500:
            counter += 2
        else:
            counter = 0
        # print(samples[i])
        # if i % 2 == 0:
        #     samples[i] = samples[len(samples) - i]  # int(samples[i]/2)
        # else:
        #     samples[i] = samples[len(samples) - i]  # int(samples[i] - 0.7*samples[i])
        # samples[i] = 10000  # This mutes the sound
        # samples[i+1] = 500

    new_sound = sound._spawn(samples)
    new_sound.export("aaay", format='mp3')

    '''
    Note that when using numpy or scipy you will need to convert back to an
    array before you spawn:

    import array
    import numpy as np
    from pydub import AudioSegment

    sound = AudioSegment.from_file("sound1.wav")
    samples = sound.get_array_of_samples()
    shifted_samples = np.right_shift(samples, 1)

    # now you have to convert back to an array.array
    shifted_samples_array = array.array(sound.array_type, shifted_samples)

    new_sound = sound._spawn(shifted_samples_array)
    '''
    return numeric_array
def get_array_from_pydub_obj(pydub_obj):
    bit_depth = pydub_obj.sample_width * 8  # bit width, e.g. 2 * 8
    array_type = get_array_type(bit_depth)
    numeric_array = array.array(array_type, pydub_obj._data)
    return numeric_array
# decoding
decoded_bytes = dft_decode(
    DFT_marked_with_audio, {
        'type': 'DFT',
        'key': {
            'random_key': DFT_marked_with_audio.key['key']['random_key'],
            'original_audio': sound
        }
    })
mark_sample_width = DFT_marked_with_audio.key['key']['metadata']['sample_width']
mark_frame_rate = DFT_marked_with_audio.key['key']['metadata']['frame_rate']
mark_channels = DFT_marked_with_audio.key['key']['metadata']['channels']
mark_tags = mark.tags
decoded_samples = array(get_array_type(mark_sample_width * 8), decoded_bytes)
# should be packaged into Audio
audiowrite(decoded_samples, DFT_marked_with_audio.key, mark_frame_rate,
           mark_sample_width, mark_channels, "./test/DFT_AUDIO.json",
           "./test/extracted.flac", "flac", mark_tags)

# calculate BER
from adwtmk.utilities import get_all_bits

total_bits = get_all_bits(decoded_bytes)
total_bits_len = len(total_bits)
BER = np.sum(
    np.array(
        np.array(total_bits) -