def hear(self, title=None):
    """Play the (possibly sliced) signal in the notebook.

    Args:
        title: optional label printed before playback.

    Side effects: prints the slice bounds (in seconds of the original
    clip) when a slice is active, and displays an IPython Audio widget.
    """
    if title is not None:
        print("Label:", title)
    if self.start is not None or self.end is not None:
        # BUG FIX: resolve the defaults *before* printing — the original
        # divided self.start / self.end directly, raising TypeError when
        # only one of the two bounds was set.
        start = 0 if self.start is None else self.start
        end = len(self.sig) - 1 if self.end is None else self.end
        print(
            f"{round(start / self.sr, 2)}s-{round(end / self.sr, 2)}s of original clip"
        )
        display(Audio(data=self.data_signal[start:end], rate=self.sample_rate))
    else:
        # No slice: play the cached widget for the whole clip.
        display(self.ipy_audio)
def play_transformed(self):
    """Return an IPython Audio widget for the reconstructed waveform.

    Returns None (after printing a notice) when no reconstruction has
    been computed yet.
    """
    if self.recon is None:
        # BUG FIX: Python 2 print statement was a SyntaxError under
        # Python 3 (the rest of the file uses f-strings, i.e. Py3).
        print("No reconstructed waveform")
        return None
    return Audio(self.recon, rate=self.sr)
def play_original(self):
    """Return an IPython Audio widget for the original waveform."""
    waveform = self.orig
    sample_rate = self.sr
    return Audio(waveform, rate=sample_rate)
def make_audio(self):
    """Build an IPython Audio object from the real part of the samples."""
    # Only the real component is audible data; discard any imaginary part.
    return Audio(data=self.ys.real, rate=self.frame_rate)
def ipy_audio(self):
    """IPython Audio widget wrapping the full signal at its native rate."""
    signal = self.data_signal
    rate = self.sample_rate
    return Audio(data=signal, rate=rate)
def audio_creator():
    """Rebuild a PCM waveform from audio frames logged in a rosbag, then
    round-trip it through an MFCC encode/decode to test reconstruction.

    Side effects: reads 'data1.bag' and 'data1.wav', writes 'data1mfcc.wav',
    and prints the Griffin-Lim reconstruction of the MFCC matrix.
    """
    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    RECORD_SECONDS = 5
    WAVE_OUTPUT_FILENAME = "data1.wav"
    BAG_INPUT_FILENAME = 'data1.bag'
    WINLEN = float(RATE) / float(CHUNK)  # ~43 analysis windows per second

    # Collect the int16 audio frames (and their timestamps) from the bag.
    data_store = []
    time_store = []
    for topic, msg, t in rosbag.Bag(BAG_INPUT_FILENAME).read_messages():
        if msg._type == 'hrl_anomaly_detection/audio':
            data_store.append(np.array(msg.audio_data, dtype=np.int16))
            time_store.append(t)

    # Duplicate each logged frame so the stitched signal approximates the
    # expected recording length (the bag holds fewer frames than
    # RATE/CHUNK * RECORD_SECONDS).
    data_store_long = []
    baglen = len(data_store)
    # BUG FIX: use floor division — under Python 3 '/' yields floats here,
    # and range(recovered_len) below would raise TypeError.
    num_frames = RATE // CHUNK * RECORD_SECONDS
    recovered_len = num_frames // baglen
    for frame in data_store:
        for _ in range(recovered_len):
            data_store_long.append(frame)

    numpydata = np.hstack(data_store_long)

    # Read the WAV back; librosa resamples to its default sr (22050 = RATE/2).
    # NOTE(review): reading from the WAV reportedly behaved worse than using
    # numpydata directly ("invalid value encountered") — verify the file.
    filename = "data1.wav"
    y, sr = librosa.load(filename)

    # --- Convert to MFCC and reconstruct the waveform ---
    Y = librosa.stft(y)
    mfccs = librosa.feature.mfcc(y)
    # BUG FIX: Python 2 print statement was a SyntaxError under Python 3.
    print(griffinlim(mfccs))

    # Build the reconstruction mappings (DCT back to mel, mel back to STFT).
    n_mfcc = mfccs.shape[0]
    n_mel = 128
    dctm = librosa.filters.dct(n_mfcc, n_mel)
    n_fft = 2048
    mel_basis = librosa.filters.mel(sr, n_fft)

    # Empirical scaling of channels to get ~flat amplitude mapping.
    bin_scaling = 1.0 / np.maximum(
        0.0005, np.sum(np.dot(mel_basis.T, mel_basis), axis=0))

    # Reconstruct the approximate STFT squared-magnitude from the MFCCs.
    recon_stft = bin_scaling[:, np.newaxis] * np.dot(
        mel_basis.T, invlogamplitude(np.dot(dctm.T, mfccs)))

    # Impose the reconstructed magnitude on a white-noise STFT (random phase).
    excitation = np.random.randn(y.shape[0])
    E = librosa.stft(excitation)
    recon = librosa.istft(E / np.abs(E) * np.sqrt(recon_stft))

    # Persist and play the reconstruction.
    wav.write('data1mfcc.wav', sr, recon)
    Audio(recon, rate=sr)

    # BUG FIX: floor division — np.reshape rejects a float dimension in Py3.
    numpydata = np.reshape(numpydata, (len(numpydata) // CHANNELS, CHANNELS))
outputs.append(last_output)
sys.stdout.write('\n')


def invlogamplitude(S):
    """Invert librosa.logamplitude (which is 10*log10): return 10**(S/10).

    Args:
        S: log-amplitude values (scalar or ndarray) in dB.

    Returns:
        Linear amplitude(s) of the same shape as S.
    """
    # BUG FIX: the original returned 10.0*(S/10.0), which is just S (the
    # identity), not the inverse of 10*log10(x). The inverse is 10**(S/10).
    return 10.0 ** (S / 10.0)


# Reconstruct audio from the predicted MFCC frames:
# https://github.com/librosa/librosa/issues/424
mfccs = np.transpose(np.squeeze(np.concatenate(outputs, axis=1), 0))
n_mfcc = mfccs.shape[0]
n_mel = 128
dctm = librosa.filters.dct(n_mfcc, n_mel)
n_fft = 2048
sr = 22050
mel_basis = librosa.filters.mel(sr, n_fft)
# Empirical per-bin scaling for an ~flat amplitude mapping.
bin_scaling = 1.0 / np.maximum(
    0.0005, np.sum(np.dot(mel_basis.T, mel_basis), axis=0))
# Approximate STFT squared-magnitude recovered from the MFCCs.
recon_stft = bin_scaling[:, np.newaxis] * np.dot(
    mel_basis.T, invlogamplitude(np.dot(dctm.T, mfccs)))
# Impose the reconstructed magnitude on a white-noise STFT (random phase).
# NOTE(review): 2.325 s looks like the assumed clip duration — confirm it
# matches the length the model was trained on.
y_len = int(sr * 2.325)
excitation = np.random.randn(y_len)
E = librosa.stft(excitation)
print(np.shape(recon_stft))
print(np.shape(excitation))
print(np.shape(E))
print(recon_stft)
recon = librosa.istft(E / np.abs(E) * np.sqrt(recon_stft))
Audio(recon, rate=sr)