def combineandsplitsongs(path_to_audio, path_to_voice, path_of_output, path_to_output1, path_to_output2):
    song, sr = librosa.load(path_to_audio)
    voice, sr = librosa.load(path_to_voice)
    song[0:len(voice)] += voice
    librosa.output.write_wav(path_of_output, song, sr)
    split1len = splitty(song, path_to_output1, path_to_output2, sr)
    return split1len
def B3():
    w = 2048
    h = 1024
    path = '../audio/train/vio/'
    vio_64 = librosa.load(path + 'vio_64.wav', fs)[0][0:61000]
    vio_88 = librosa.load(path + 'vio_88.wav', fs)[0][0:61000]
    cla_64 = librosa.load('../audio/train/cla/cla_64.wav', fs)[0][0:61000]
    S_1 = NMF.extractTemplate(vio_64)
    S_2 = NMF.extractTemplate(vio_88)
    S_3 = NMF.extractTemplate(cla_64)
    librosa.display.specshow(S_1, y_axis='cqt_note', x_axis='frames', n_yticks=180)
    plt.axis([0, 2, 0, 100])
    plt.show()
    librosa.display.specshow(S_2, y_axis='cqt_note', x_axis='frames', n_yticks=10)
    # plt.axis([0, 2, 0, 100])
    plt.show()
    librosa.display.specshow(S_3, y_axis='cqt_note', x_axis='frames', n_yticks=180)
    plt.axis([0, 2, 0, 100])
    plt.show()
    S_1 = librosa.core.istft(S_1)
    librosa.display.waveplot(S_1, x_axis='time')
    plt.show()
    S_2 = librosa.core.istft(S_2)
    librosa.display.waveplot(S_2, x_axis='time')
    plt.show()
def run_nmf_mfcc_and_pickle_mir_evals(paths):
    """
    :return:
    """
    mix_path, (fore_name, back_name) = paths
    file_name = os.path.split(mix_path)[1]
    pickle_name = splitext(file_name)[0]
    pickle_output_path = os.path.join(output_folder, pickle_name)
    if os.path.exists(pickle_output_path):
        print(pickle_output_path + " exists! Skipping...")
        return
    back_path = os.path.join(background_folder, back_name)
    true_back = librosa.load(back_path)[0]
    fore_path = os.path.join(foreground_folder, fore_name)
    true_fore = librosa.load(fore_path)[0]
    estimated_sources, mfcc_clusters = do_nmf_and_clustering(mix_path, n_clusters)  # transpose?
    min_len = np.min((len(true_fore), len(true_back), estimated_sources.shape[1]))
    true_back = true_back[:min_len]
    true_fore = true_fore[:min_len]
    true_srcs = np.vstack([true_back, true_fore])
    estimated_sources = estimated_sources[:, :min_len]
    sdr_dict = run_bss_eval(true_srcs, estimated_sources)
    pickle_dict = {"file_name": file_name, "mfcc_clusters": mfcc_clusters, "sdr_dict": sdr_dict}
    pickle.dump(pickle_dict, open(os.path.join(output_folder, pickle_output_path + ".pick"), "wb"))
    print("pickled {}".format(pickle_name))
def B1():
    fs = 44100
    path = '../audio/validation/'
    a = librosa.load(path + '01_vio.wav', fs)[0]
    b = librosa.load(path + '01_cla.wav', fs)[0]
    c = librosa.load(path + '01_mix.wav', fs)[0]
    n = np.random.randn(len(a))
    print(a.shape)
    # print(evalSDR(np.array([c, c])/2, np.array([a, b])))  # correct one
    print("SDR [c;c]/2, [a;b]")
    print(evalSDR(np.array([a, b]), np.array([c, c]) / 2))
    print("SDR [a;b], [a;b]")
    print(evalSDR(np.array([a, b]), np.array([a, b])))
    print("SDR [b;a], [a;b]")
    print(evalSDR(np.array([a, b]), np.array([b, a])))
    print("SDR [2a;2b], [a;b]")
    print(evalSDR(np.array([a, b]), np.array([2 * a, 2 * b])))
    print("SDR (a+0.01*n), a")
    print(evalSDR(a, (a + 0.01 * n)))
    print("SDR (a+0.1*n), a")
    print(evalSDR(a, (a + 0.1 * n)))
    print("SDR (a+n), a")
    print(evalSDR(a, (a + n)))
    print("SDR (a+0.01*b), a")
    print(evalSDR(a, (a + 0.01 * b)))
    print("SDR (a+0.1*b), a")
    print(evalSDR(a, (a + 0.1 * b)))
    print("SDR (a+b), a")
    print(evalSDR(a, (a + b)))
def mashability(song1, song2):
    """
    Returns how well song1 transitions into song2 using cosine matrix
    similarity and FFT semitone bin approximation matrices
    """
    # If the tempos differ by more than thirty BPM we should never make that transition
    if abs(song1.bpm - song2.bpm) > 30:
        return 1
    sample_length = MIX_LENGTH  # beats per sample
    beats1 = song1.AudioFile.analysis.beats[song1.mix_out : song1.mix_out + sample_length]
    beats2 = song2.AudioFile.analysis.beats[song2.mix_in : song2.mix_in + sample_length]
    data1 = audio.getpieces(song1.AudioFile, beats1)
    data2 = audio.getpieces(song2.AudioFile, beats2)
    data1.encode("temp1.mp3")
    data2.encode("temp2.mp3")
    y1, sr1 = librosa.load("temp1.mp3")
    y2, sr2 = librosa.load("temp2.mp3")
    S1 = np.abs(librosa.stft(y1, n_fft=4096))
    chroma1 = librosa.feature.chroma_stft(S=S1, sr=sr1)
    S2 = np.abs(librosa.stft(y2, n_fft=4096))
    chroma2 = librosa.feature.chroma_stft(S=S2, sr=sr2)
    # im = librosa.display.specshow(chroma1, x_axis="time", y_axis="chroma")
    # im2 = librosa.display.specshow(chroma2, x_axis="time", y_axis="chroma")
    # plt.show()
    orthogonal_arr = []
    for i in range(min(chroma1.shape[1], chroma2.shape[1])):
        orthogonal_arr.append(dst.cosine(chroma1[:, i], chroma2[:, i]))
    return sum(orthogonal_arr) / len(orthogonal_arr)
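# A minimal sketch (not from the original code) isolating the chroma
# cosine-distance core of mashability() on two plain audio files, without the
# echonest-style AudioFile objects. 'a.mp3' and 'b.mp3' are hypothetical;
# dst is scipy.spatial.distance, as in the function above.
import numpy as np
import librosa
import scipy.spatial.distance as dst

y1, sr1 = librosa.load("a.mp3")
y2, sr2 = librosa.load("b.mp3")
chroma1 = librosa.feature.chroma_stft(S=np.abs(librosa.stft(y1, n_fft=4096)), sr=sr1)
chroma2 = librosa.feature.chroma_stft(S=np.abs(librosa.stft(y2, n_fft=4096)), sr=sr2)
# Mean cosine distance over aligned frames: 0 means identical harmonic content
n_frames = min(chroma1.shape[1], chroma2.shape[1])
print(sum(dst.cosine(chroma1[:, i], chroma2[:, i]) for i in range(n_frames)) / n_frames)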
def results(self):
    # Load audio files, extract MFCC features, and use DTW to compare
    # the distance between the two MFCC sequences
    y1, sr1 = librosa.load('output1.wav')
    y2, sr2 = librosa.load('output2.wav')
    mfcc1 = librosa.feature.mfcc(y1, sr1)  # Computing MFCC values
    mfcc2 = librosa.feature.mfcc(y2, sr2)
    dist, cost, path = dtw(mfcc1.T, mfcc2.T)
    # Set a threshold for our game's ranking system
    if dist <= 40:
        self.textEdit_2.setText("You did a great job! ^^")
    elif dist <= 50:
        self.textEdit_2.setText("You did well.")
    elif dist <= 60:
        self.textEdit_2.setText("You're fine.")
    else:
        self.textEdit_2.setText("You are poor at this game... TT")
    self.rank.append(dist)
    self.textEdit_3.setText(str(self.rank[self.count]))
    self.outputRank += "Player " + str(self.count) + " got " + str(self.rank[self.count]) + "\n\n"
    self.count = self.count + 1
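# A minimal alternative sketch (assumption: librosa >= 0.6, which ships
# librosa.sequence.dtw) computing the same MFCC + DTW comparison as results()
# without the external dtw package. Note the accumulated cost D[-1, -1] is not
# normalized like the dtw package's dist, so the thresholds above would need retuning.
import librosa

y1, sr1 = librosa.load('output1.wav')
y2, sr2 = librosa.load('output2.wav')
mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1)
mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2)
D, wp = librosa.sequence.dtw(X=mfcc1, Y=mfcc2)  # accumulated cost matrix and warping path
print(D[-1, -1])  # total alignment cost between the two recordings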
def alignment_helper(file_list, target_path):
    """Downsample and perform cross-correlation on files relative to a target
    file to test if they are correctly aligned.

    Parameters
    ----------
    file_list : list
        List of files (i.e. stem_files, raw_files)
    target_path : str
        Filepath to compare files in file_list to.

    Returns
    -------
    status : bool
        True if the cross-correlation values are within a threshold,
        demonstrating that the files are correctly aligned.
    """
    sr = 1000
    output_handle = tempfile.NamedTemporaryFile(suffix='.wav')
    output_path = output_handle.name
    if len(file_list) > 1:
        file_sum = sox.Combiner()
        file_sum.rate(sr, 'm')  # effects must be set before calling build()
        file_sum.build(file_list, output_path, 'mix')
    else:
        file_sum = sox.Transformer()
        file_sum.rate(sr, 'm')
        file_sum.build(file_list[0], output_path)

    target_handle = tempfile.NamedTemporaryFile(suffix='.wav')
    target_handle_path = target_handle.name
    target_sum = sox.Transformer()
    target_sum.rate(sr, 'm')
    target_sum.build(target_path, target_handle_path)

    dur = get_length(target_path)
    offset = (dur / 44100.0) / 2.0
    y_files, sr = librosa.load(output_path, sr=sr, offset=offset, duration=30.0)
    y_target, sr = librosa.load(target_handle_path, sr=sr, offset=offset, duration=30.0)

    correlation = np.correlate(y_files, y_target, 'full')
    N = len(y_target)
    a = np.arange(1, N + 1)
    a_rev = np.arange(1, N)
    b = a_rev[::-1]
    c = np.concatenate((a, b))
    c = c.astype(float)
    correlation = np.abs(correlation) / c
    center = N
    corr_index = np.argmax(correlation)
    if np.abs(corr_index - center) > 5:
        return False
    else:
        return True
def loadAudio(mixedAudioPath, stemsPathList, sr=44100):
    '''
    Function to load mix and stem audio for a song.

    Input:
    - mixedAudioPath (str): file path for mixed audio
    - stemsPathList (list): list of file paths for stems
    - sr (int): sample rate

    Returns:
    - mixAudio (np.Array): (length of audio,)
    - stemsAudio (np.Array): (length of audio, num stems)
    '''
    mixAudio, sr = librosa.load(mixedAudioPath, sr)
    mixAudio = np.array(mixAudio).T
    # load stems
    stems = []
    for path in stemsPathList:
        stems.append(librosa.load(path, sr)[0])
    stemsAudio = np.array(stems).T
    return mixAudio, stemsAudio
def run(self):
    # Initialize signals
    self.percentDoneSignal.emit(0)
    percent_scale = 1000.0 / 5
    self.doneSignal.emit(0)
    self.statusSignal.emit("")
    # Load in audio data
    self.statusSignal.emit("Loading {}".format(os.path.split(self.mix_file)[1]))
    mix, self.fs = librosa.load(self.mix_file, sr=None)
    self.percentDoneSignal.emit(1 * percent_scale)
    self.statusSignal.emit("Loading {}".format(os.path.split(self.source_file)[1]))
    source, self.fs = librosa.load(self.source_file, sr=self.fs)
    self.percentDoneSignal.emit(2 * percent_scale)
    # Fix any gross timing offset
    self.statusSignal.emit("Aligning...")
    mix, source = estimate.align(mix, source, self.fs)
    self.percentDoneSignal.emit(3 * percent_scale)
    self.statusSignal.emit("Subtracting...")
    source = estimate.reverse_channel(mix, source)
    mix, source = estimate.pad(mix, source)
    self.percentDoneSignal.emit(4 * percent_scale)
    self.statusSignal.emit("Enhancing...")
    self.subtracted = estimate.wiener_enhance(mix - source, source, self.wiener_threshold)
    self.percentDoneSignal.emit(5 * percent_scale)
    self.doneSignal.emit(1)
def read_audio(current_file, sample_rate=None, mono=True):
    """Read audio file

    Parameters
    ----------
    current_file : dict
        Dictionary given by pyannote.database.
    sample_rate: int, optional
        Target sampling rate. Defaults to using native sampling rate.
    mono : int, optional
        Convert multi-channel to mono. Defaults to True.

    Returns
    -------
    y : (n_samples, n_channels) np.array
        Audio samples.
    sample_rate : int
        Sampling rate.

    Notes
    -----
    In case `current_file` contains a `channel` key, data of this (1-indexed)
    channel will be returned.
    """
    # sphere files
    if current_file['audio'][-4:] == '.sph':
        # dump sphere file to a temporary wav file and load it from there...
        from sphfile import SPHFile
        sph = SPHFile(current_file['audio'])
        with tempfile.NamedTemporaryFile() as f:
            sph.write_wav(f.name)
            y, sample_rate = librosa.load(f.name, sr=sample_rate, mono=False)
    # all other files
    else:
        y, sample_rate = librosa.load(current_file['audio'], sr=sample_rate, mono=False)

    # reshape mono files to (1, n) [was (n, )]
    if y.ndim == 1:
        y = y.reshape(1, -1)

    # extract specific channel if requested
    channel = current_file.get('channel', None)
    if channel is not None:
        y = y[channel - 1, :]

    # convert to mono
    if mono:
        y = librosa.to_mono(y)

    return y.T, sample_rate
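# A hedged usage sketch for read_audio() above, with a hypothetical two-channel
# recording: a pyannote.database-style dict with 1-indexed channel selection.
current_file = {'audio': 'meeting.wav', 'channel': 2}
y, sample_rate = read_audio(current_file, sample_rate=16000, mono=True)
print(y.shape, sample_rate)  # mono samples from channel 2, resampled to 16 kHz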
def test_ndarray_to_file():
    apply_audio_effects(mono, outfile)
    y = lr.load(outfile, sr=None)[0]
    lr.output.write_wav('test_ndarray_to_file_mono.wav', y, sr)
    assert lr.util.valid_audio(y)

    apply_audio_effects(stereo, outfile)
    y = lr.load(outfile, sr=None, mono=False)[0]
    lr.output.write_wav('test_ndarray_to_file_stereo.wav', y, sr)
    assert lr.util.valid_audio(y, mono=False)
def test_segment_load():
    """
    Test loading a segment. Check size accuracy.
    """
    sample_len = 2003
    y, sr = librosa.load('data/test1_44100.wav', sr=None, mono=False,
                         offset=0., duration=sample_len / 44100.)
    assert y.shape[1] == sample_len

    y, sr = librosa.load('data/test1_44100.wav', sr=None, mono=False,
                         offset=2048 / 44100., duration=1.0)
    assert y.shape[1] == 44100
def remove_umms(ns):
    if ns.output is None:
        ns.output = '{0}-umdone{1}'.format(*os.path.splitext(ns.input))
    x, sr = librosa.load(ns.input, mono=True, sr=None)
    bounds = segment.boundaries(x, sr, window_length=ns.window_length,
                                threshold=ns.noise_threshold)
    mfccs, distances, categories = umdone.io.load(ns.train)
    matches = discover.match(x, sr, bounds, mfccs, distances, categories)
    del x, sr, bounds, mfccs, distances, categories
    # read back in to preserve mono/stereo and levels on output
    x, sr = librosa.load(ns.input, mono=False, sr=None)
    y = segment.remove_slices(x.T, matches)
    librosa.output.write_wav(ns.output, y.T, sr, norm=False)
def plot(self):
    ''' plot sound wave '''
    # fs, x = wavread('output1.wav')
    y1, sr1 = librosa.load('output1.wav')
    y2, sr2 = librosa.load('output2.wav')
    self.figure.set_size_inches(4.5, 2.0)
    # plt.axis('off')
    ax1 = self.figure.add_subplot(2, 1, 1)
    ax1.axis('off')
    ax1.plot(y1)
    self.canvas.draw()
def all_repet_params(fg_input_directory, fg_file_name_base, bg_input_directory,
                     bg_file_name_base, output_directory, sample_rate):
    '''
    Creates all combinations of foreground and background files and runs a
    large series of REPET parameters on them.

    Parameters
    ----------
    fg_input_directory : str
        input directory
    fg_file_name_base : str
        the base string for a file name
    bg_input_directory : str
        input directory
    bg_file_name_base : str
        the base string for a file name
    output_directory : str
        where the results will be stored
    sample_rate : int
        sample rate in number of samples per second
    '''
    window_sizes = [256, 512, 1024, 2048, 4096, 8192, 16384]
    window_types = [nussl.WindowType.HAMMING, nussl.WindowType.RECTANGULAR,
                    nussl.WindowType.HANN, nussl.WindowType.BLACKMAN]
    for i in range(0, 5):
        for j in range(1, 5):
            fg_file_name = fg_file_name_base + ('%02d.wav' % i)
            bg_file_name = bg_file_name_base + ('%02d.wav' % j)
            fg, sr = librosa.load(os.path.join(fg_input_directory, fg_file_name), sr=sample_rate)
            bg, sr = librosa.load(os.path.join(bg_input_directory, bg_file_name), sr=sample_rate)
            bg = bg[:fg.shape[0]]
            mix = fg + bg
            # create the directory for the output files
            # new_directory = os.path.join(output_directory, 'fg-%02d-bg-%02d' % (i, j))
            # if not os.path.exists(new_directory):
            #     os.makedirs(new_directory)
            for window_size in window_sizes:
                for window_type in window_types:
                    bg_simple, fg_simple, bg_complex, fg_complex = run_repet(
                        mix, window_size=window_size, window_type=window_type)
                    fg_simple_result = mir_eval.separation.bss_eval_sources(fg, fg_simple.audio_data)
                    bg_simple_result = mir_eval.separation.bss_eval_sources(bg, bg_simple.audio_data)
                    fg_complex_result = mir_eval.separation.bss_eval_sources(fg, fg_complex.audio_data)
                    bg_complex_result = mir_eval.separation.bss_eval_sources(bg, bg_complex.audio_data)
                    print 'window_size: ', window_size, 'window_type: ', window_type, \
                        'simple_bg_sdr: ', bg_simple_result[0][0], 'simple_fg_sdr: ', fg_simple_result[0][0], \
                        'complex_bg_sdr: ', bg_complex_result[0][0], 'complex_fg_sdr: ', fg_complex_result[0][0]
def all_nearest_neighbor(fg_input_directory, fg_file_name_base, bg_input_directory,
                         bg_file_name_base, output_directory, sample_rate):
    '''
    Creates all combinations of foreground and background files and runs a
    large series of REPET parameters on them.

    Parameters
    ----------
    fg_input_directory : str
        input directory
    fg_file_name_base : str
        the base string for a file name
    bg_input_directory : str
        input directory
    bg_file_name_base : str
        the base string for a file name
    output_directory : str
        where the results will be stored
    sample_rate : int
        sample rate in number of samples per second
    '''
    window_sizes = [256, 512, 1024, 2048, 4096, 8192]
    window_types = [nussl.WindowType.HAMMING, nussl.WindowType.RECTANGULAR,
                    nussl.WindowType.HANN, nussl.WindowType.BLACKMAN]
    # window_sizes = [256]
    # window_types = [nussl.WindowType.HAMMING]
    # period = [i for i in range(0, 1000)]
    for i in range(0, 1):
        for j in range(1, 2):
            fg_file_name = fg_file_name_base + ('%02d.wav' % i)
            bg_file_name = bg_file_name_base + ('%02d.wav' % j)
            fg, sr = librosa.load(os.path.join(fg_input_directory, fg_file_name), sr=sample_rate)
            bg, sr = librosa.load(os.path.join(bg_input_directory, bg_file_name), sr=sample_rate)
            bg = bg[:fg.shape[0]]
            mix = fg + bg
            for window_size in window_sizes:
                for window_type in window_types:
                    bs, suggested_period = compute_beat_spectrum_and_suggested_period(
                        mix, window_size=window_size, window_type=window_type)
                    period_min = suggested_period / 8
                    period_max = suggested_period
                    sd = beat_spectrum_std(bs)
                    tempo, beats = librosa.beat.beat_track(mix)
                    periods = [suggested_period / 8, suggested_period / 7,
                               suggested_period / 6, suggested_period / 5,
                               suggested_period / 4, suggested_period / 3,
                               suggested_period / 2, suggested_period]
                    for period in periods:
                        values = (window_size, window_type, period, sd, tempo,
                                  fg_file_name, bg_file_name, period_min,
                                  period_max, suggested_period)
                        print values
                        insert_nearest_neighbors(values)
def estimateValidSet(path, vio_W, cla_W, score_inf=None):
    for i in range(0, 5):
        p = path + '0'
        x = str(i + 1)
        print('round : ' + x)
        valid_v = librosa.load(p + x + '_vio.wav', 44100)[0]
        valid_c = librosa.load(p + x + '_cla.wav', 44100)[0]
        valid_m = librosa.load(p + x + '_mix.wav', 44100)[0]
        sc = None
        if score_inf is not None:
            sc = score_inf[i]
        estimate(valid_m, vio_W, cla_W, valid_v, valid_c, x, sc)
def get_features(collection='drum_samples_train',
                 features=('zero_crossing_rate', 'spectral_centroid'),
                 scaler=None):
    if collection == 'drum_samples_train':
        kick_filepaths, snare_filepaths = download_samples('drum_samples_train')
        kick_signals = [librosa.load(p)[0] for p in kick_filepaths]
        snare_signals = [librosa.load(p)[0] for p in snare_filepaths]
        kick_features = numpy.array([extract_features(x, features) for x in kick_signals])
        snare_features = numpy.array([extract_features(x, features) for x in snare_signals])
        feature_table = numpy.vstack((kick_features, snare_features))
        if scaler is None:
            scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
            scaler.fit(feature_table)
        training_features = scaler.transform(feature_table)
        kick_labels = numpy.zeros(10)
        snare_labels = numpy.ones(10)
        training_labels = numpy.concatenate((kick_labels, snare_labels))
        return training_features, training_labels, scaler
    elif collection == 'drum_samples_test':
        kick_filepaths, snare_filepaths = download_samples('drum_samples_test')
        kick_signals = [librosa.load(p)[0] for p in kick_filepaths]
        snare_signals = [librosa.load(p)[0] for p in snare_filepaths]
        kick_features = numpy.array([extract_features(x, features) for x in kick_signals])
        snare_features = numpy.array([extract_features(x, features) for x in snare_signals])
        feature_table = numpy.vstack((kick_features, snare_features))
        if scaler is None:
            scaler = sklearn.preprocessing.MinMaxScaler(feature_range=(-1, 1))
            scaler.fit(feature_table)
        test_features = scaler.transform(feature_table)
        kick_labels = numpy.zeros(30)
        snare_labels = numpy.ones(30)
        labels = numpy.concatenate((kick_labels, snare_labels))
        return test_features, labels, scaler
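# A hedged usage sketch of get_features() above (assumes the drum sample
# collections referenced by download_samples are available): fit the scaler on
# the training set, then reuse the same scaler unchanged on the test set so
# both sets share one feature range.
train_X, train_y, scaler = get_features('drum_samples_train')
test_X, test_y, _ = get_features('drum_samples_test', scaler=scaler)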
def test_resample_mono():

    def __test(y, sr_in, sr_out, res_type, fix):
        y2 = librosa.resample(y, sr_in, sr_out, res_type=res_type, fix=fix)
        # First, check that the audio is valid
        librosa.util.valid_audio(y2, mono=True)
        # If it's a no-op, make sure the signal is untouched
        if sr_out == sr_in:
            assert np.allclose(y, y2)
        # Check buffer contiguity
        assert y2.flags['C_CONTIGUOUS']
        # Check that we're within one sample of the target length
        target_length = y.shape[-1] * sr_out // sr_in
        assert np.abs(y2.shape[-1] - target_length) <= 1

    for infile in ['data/test1_44100.wav',
                   'data/test1_22050.wav',
                   'data/test2_8000.wav']:
        y, sr_in = librosa.load(infile, sr=None, duration=5)
        for sr_out in [8000, 22050]:
            for res_type in ['kaiser_best', 'kaiser_fast', 'scipy']:
                for fix in [False, True]:
                    yield (__test, y, sr_in, sr_out, res_type, fix)
def test_tonnetz():
    y, sr = librosa.load(librosa.util.example_audio_file())
    tonnetz_chroma = np.load(os.path.join('tests', "data", "feature-tonnetz-chroma.npy"))
    tonnetz_msaf = np.load(os.path.join('tests', "data", "feature-tonnetz-msaf.npy"))

    # Use cqt chroma
    def __audio():
        tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
        assert tonnetz.shape[0] == 6

    # Use pre-computed chroma
    def __stft():
        tonnetz = librosa.feature.tonnetz(chroma=tonnetz_chroma)
        assert tonnetz.shape[1] == tonnetz_chroma.shape[1]
        assert tonnetz.shape[0] == 6
        assert np.allclose(tonnetz_msaf, tonnetz)

    def __cqt():
        # Use high resolution cqt chroma
        chroma_cqt = librosa.feature.chroma_cqt(y=y, sr=sr, n_chroma=24)
        tonnetz = librosa.feature.tonnetz(chroma=chroma_cqt)
        assert tonnetz.shape[1] == chroma_cqt.shape[1]
        assert tonnetz.shape[0] == 6
        # Using stft chroma won't generally match cqt chroma
        # skip the equivalence check

    # Call the function with not enough parameters
    yield pytest.mark.xfail(librosa.feature.tonnetz, raises=librosa.ParameterError)
    yield __audio
    yield __stft
    yield __cqt
def test_ifgram_matches_stft():
    y, sr = librosa.load('data/test1_22050.wav')

    def __test(n_fft, hop_length, win_length, center, norm, dtype):
        D_stft = librosa.stft(y, n_fft=n_fft, hop_length=hop_length,
                              win_length=win_length, center=center, dtype=dtype)
        _, D_ifgram = librosa.ifgram(y, sr, n_fft=n_fft, hop_length=hop_length,
                                     win_length=win_length, center=center,
                                     norm=norm, dtype=dtype)
        if norm:
            # STFT doesn't do window normalization;
            # let's just ignore the relative scale to make this easy
            D_stft = librosa.util.normalize(D_stft, axis=0)
            D_ifgram = librosa.util.normalize(D_ifgram, axis=0)
        assert np.allclose(D_stft, D_ifgram)

    for n_fft in [1024, 2048]:
        for hop_length in [None, n_fft // 2, n_fft // 4]:
            for win_length in [None, n_fft // 2, n_fft // 4]:
                for center in [False, True]:
                    for norm in [False, True]:
                        for dtype in [np.complex64, np.complex128]:
                            yield (__test, n_fft, hop_length, win_length,
                                   center, norm, dtype)
def __test(res_type):
    y_native, sr = librosa.load(librosa.util.example_audio_file(),
                                sr=None, offset=offset, duration=duration,
                                res_type=res_type)
    y2 = librosa.resample(y_native, sr, sr_target, res_type=res_type)
    y, _ = librosa.load(librosa.util.example_audio_file(),
                        sr=sr_target, offset=offset, duration=duration,
                        res_type=res_type)
    assert np.allclose(y2, y)
def save_background(input_path, output_path, sample_rate, length=0,
                    number_of_repeating_segments=0):
    '''
    Stitch together wave files to a specified length.

    Parameters
    ----------
    input_path : str
        the path of the input file
    output_path : str
        the path of the output file
    sample_rate : int
        sample rate used to load the input file
    length : int
        length in seconds of the output file
    number_of_repeating_segments : int
        alternatively, the number of times to repeat the input
    '''
    print 'loading bg file...'
    bg, sample_rate = librosa.load(input_path, sr=sample_rate)
    print 'stitching bg file...'
    if length > 0:
        bg_length = bg.shape[0] / sample_rate
        number_of_segments = int(np.ceil(length / bg_length))
    elif number_of_repeating_segments > 0:
        number_of_segments = number_of_repeating_segments
    else:
        print 'a length or number of repeating segments must be specified'
        return
    result = bg
    for i in range(0, number_of_segments):
        result = np.concatenate((bg, result))
    print 'writing bg...'
    wavwrite(output_path, result, sample_rate)
def __test_consistency(frame_length, hop_length, center):
    y, sr = librosa.load(__EXAMPLE_FILE, sr=None)

    # Ensure audio is divisible into frame size.
    y = librosa.util.fix_length(y, y.size - y.size % frame_length)
    assert y.size % frame_length == 0

    # STFT magnitudes with a constant windowing function and no centering.
    S = librosa.magphase(librosa.stft(y,
                                      n_fft=frame_length,
                                      hop_length=hop_length,
                                      window=np.ones,
                                      center=center))[0]

    # Try both RMS methods.
    rms1 = librosa.feature.rms(S=S, frame_length=frame_length, hop_length=hop_length)
    rms2 = librosa.feature.rms(y=y, frame_length=frame_length,
                               hop_length=hop_length, center=center)

    assert rms1.shape == rms2.shape

    # Normalize envelopes.
    rms1 /= rms1.max()
    rms2 /= rms2.max()

    # Ensure results are similar.
    np.testing.assert_allclose(rms1, rms2, rtol=5e-2)
def main():
    args = get_arguments()
    log.basicConfig(format='[%(asctime)s] [%(levelname)s] %(message)s', level=LOG_LEVEL)
    log.info("Start of '{}'.".format(__file__))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
        log.info("Created output directory '{}'.".format(args.output_dir))
    for f in find_files(args.input_dir, '*.mid', path=False):
        midi_old = pretty_midi.PrettyMIDI(os.path.join(args.input_dir, f))
        midi_new = trim_midi_file(args.sec_from, args.sec_to, midi_old)
        midi_new.write(os.path.join(args.output_dir, f))
        log.info("Processed file '{}'".format(f))
    for f in find_files(args.input_dir, '*.wav', path=False):
        audio_old, sr = librosa.load(os.path.join(args.input_dir, f))
        audio_new = trim_audio_file(args.sec_from, args.sec_to, audio_old, sr)
        librosa.output.write_wav(os.path.join(args.output_dir, f), audio_new, sr)
        log.info("Processed file '{}'".format(f))
    log.info("End of '{}'.".format(__file__))
def analyze(self, songname, songpath, setName):
    # Step one. Read in an audio file, extract all features for the song,
    # extract phrase boundaries, and store features into small Seg objects.
    y, sr = librosa.load(songpath)
    # Separate harmonics and percussives into two waveforms
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    # Beat track on the percussive signal
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
    # Step two. Get phrase boundaries.
    # Extract phrase info  TO-DO: automatically
    phraseLabName = '../dataset/Annotations/myphraselab/%s/%s.txt' % (setName, songname)
    phraseBound = [float(l) for l in open(phraseLabName).read().splitlines()]
    for v in range(0, len(phraseBound) - 1):
        # start time and end time converted from seconds to samples
        st = int(phraseBound[v] * 1. * sr)
        ed = int(phraseBound[v + 1] * 1. * sr)
        currSeg = Seg(songname, st, ed, y[st:ed], y_harmonic[st:ed],
                      y_percussive[st:ed], sr)
        currSeg.get_seg_feature()
        currSeg.idx = self.countSeg
        self.countSeg += 1
        self.segments.append(currSeg)
        self.mfcc_size = currSeg.mfcc_size
        self.chroma_size = currSeg.chroma_size
        self.rms_size = 1
def __init__(self, folder, transform=None, classes=CLASSES,
             silence_percentage=0.1, sample_rate=16000):
    all_classes = [d for d in os.listdir(folder)
                   if os.path.isdir(os.path.join(folder, d)) and not d.startswith('_')]
    for c in classes[2:]:
        assert c in all_classes
    class_to_idx = {classes[i]: i for i in range(len(classes))}
    for c in all_classes:
        if c not in class_to_idx:
            print("Class ", c, "assigned as unknown")
            class_to_idx[c] = 0
    data = []
    for c in all_classes:
        d = os.path.join(folder, c)
        target = class_to_idx[c]
        for f in os.listdir(d):
            path = os.path.join(d, f)
            samples, sample_rate = librosa.load(path, sr=sample_rate)
            audio = {'samples': samples, 'sample_rate': sample_rate}
            data.append((audio, target))
    # add silence
    target = class_to_idx['silence']
    samples = np.zeros(sample_rate, dtype=np.float32)
    silence = {'samples': samples, 'sample_rate': sample_rate}
    data += [(silence, target)] * int(len(data) * silence_percentage)
    self.classes = classes
    self.data = data
    self.transform = transform
def test_piptrack_properties():

    def __test(S, n_fft, hop_length, fmin, fmax, threshold):
        pitches, mags = librosa.core.piptrack(S=S,
                                              n_fft=n_fft,
                                              hop_length=hop_length,
                                              fmin=fmin,
                                              fmax=fmax,
                                              threshold=threshold)
        # Shape tests
        eq_(S.shape, pitches.shape)
        eq_(S.shape, mags.shape)
        # Make sure all magnitudes are positive
        assert np.all(mags >= 0)
        # Check the frequency estimates for bins with non-zero magnitude
        idx = (mags > 0)
        assert np.all(pitches[idx] >= fmin)
        assert np.all(pitches[idx] <= fmax)
        # And everywhere else, pitch should be 0
        assert np.all(pitches[~idx] == 0)

    y, sr = librosa.load('data/test1_22050.wav')

    for n_fft in [2048, 4096]:
        for hop_length in [None, n_fft // 4, n_fft // 2]:
            S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
            for fmin in [0, 100]:
                for fmax in [4000, 8000, sr // 2]:
                    for threshold in [0.1, 0.2, 0.5]:
                        yield __test, S, n_fft, hop_length, fmin, fmax, threshold
def build_output(times, quantized_times, labels, kit, file_path, inputLength, quantized=False):
    # check for empty arrays
    if not times or not labels:
        return False
    labels = [label[0] for label in labels]
    # replace beatbox with drums
    drums = []
    label_to_kit = {}
    for label in labels:
        if label in label_to_kit:
            drum = label_to_kit[label]
        else:
            drum, ssr = librosa.load('../kits/' + kit + '/' + label + '.wav', sr=None)
            label_to_kit[label] = drum
        drums.append(drum)
    # reconstruct signal from replaced sounds
    if quantized:
        result = reconstructor.replace(quantized_times, drums, ssr, inputLength)
    else:
        result = reconstructor.replace(times, drums, ssr, inputLength)
    # write output signal to .wav
    librosa.output.write_wav(file_path[:-4] + '-out.wav', result, ssr)
    return result, ssr
def process_one_file(mp3_filename, skip=True):
    '''
    Load in an mp3, get the features, and write the features out

    :parameters:
        - mp3_filename : str
            Path to an mp3 file
        - skip : bool
            Whether to skip files when the h5 already exists
    '''
    # h5 files go in the 'h5' dir instead of 'mp3'
    output_filename = mp3_filename.replace('mp3', 'h5')
    # Skip files already created
    if skip and os.path.exists(output_filename):
        return
    try:
        # Load audio and compute CQT
        audio_data, _ = librosa.load(mp3_filename, sr=feature_extraction.AUDIO_FS)
        cqt = feature_extraction.audio_cqt(audio_data)
        # Create subdirectories if they don't exist
        if not os.path.exists(os.path.split(output_filename)[0]):
            os.makedirs(os.path.split(output_filename)[0])
        # Save CQT
        deepdish.io.save(output_filename, {'gram': cqt})
    except Exception as e:
        print "Error processing {}: {}".format(mp3_filename, traceback.format_exc(e))
def compute_STFT_data_from_file_list(wavfile_list, fs=16000, wlen_sec=0.032,
                                     hop_percent=0.5, zp_percent=0, trim=False,
                                     top_db=60, out_file=None):
    """
    Compute short-term Fourier transform (STFT) power and phase spectrograms
    from a list of wav files, and save them to a pickle file.

    Parameters
    ----------
    wavfile_list        List of wav files
    fs                  Sampling rate
    wlen_sec            STFT window length in seconds
    hop_percent         Hop size as a percentage of the window length
    zp_percent          Zero-padding size as a percentage of the window length
    trim                Boolean indicating if leading and trailing silences should be trimmed
    top_db              The threshold (in decibels) below reference to consider as silence (see librosa doc)
    out_file            Path to the pickle file for saving the data

    Returns
    -------
    data                A list of dictionaries, the length of the list is the same as 'wavfile_list'.
                        Each dictionary has the following fields:
                            'file': The wav file name
                            'power_spectrogram': The power spectrogram
                            'phase_spectrogram': The phase spectrogram

    Examples
    --------
    fs = 16e3  # Sampling rate
    wlen_sec = 64e-3  # STFT window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    trim = False
    data_folder = '/local_scratch/sileglai/datasets/clean_speech/TIMIT/TEST'
    test_file_list = librosa.util.find_files(data_folder, ext='wav')
    data = compute_data(test_file_list, fs=fs, wlen_sec=wlen_sec, hop_percent=hop_percent,
                        trim=trim, zp_percent=0, out_file='test_compute_data.pckl')
    """

    # STFT parameters
    wlen = wlen_sec * fs  # window length of 64 ms
    wlen = np.int(np.power(2, np.ceil(np.log2(wlen))))  # next power of 2
    hop = np.int(hop_percent * wlen)  # hop size
    nfft = wlen + zp_percent * wlen  # number of points of the discrete Fourier transform
    win = np.sin(np.arange(.5, wlen - .5 + 1) / wlen * np.pi)  # sine analysis window

    fs_orig = librosa.load(wavfile_list[0], sr=None)[1]  # Get sampling rate

    data = [None] * len(wavfile_list)  # Create an empty list that will contain dictionaries

    for n, wavfile in enumerate(wavfile_list):
        path, file_name = os.path.split(wavfile)
        if fs == fs_orig:
            x = librosa.load(wavfile, sr=None)[0]  # Load wav file without resampling
        else:
            print('resampling while loading with librosa')
            x = librosa.load(wavfile, sr=fs)[0]  # Load wav file with resampling
        if trim:
            x = librosa.effects.trim(x, top_db=top_db)[0]  # Trim leading and trailing silences
        T_orig = len(x)
        x_pad = librosa.util.fix_length(x, T_orig + wlen // 2)  # Padding for perfect reconstruction (see librosa doc)
        X = librosa.stft(x_pad, n_fft=nfft, hop_length=hop, win_length=wlen, window=win)  # STFT
        X_abs_2 = np.abs(X) ** 2  # Power spectrogram
        X_angle = np.angle(X)
        data[n] = {'file': file_name,
                   'power_spectrogram': X_abs_2,
                   'phase_spectrogram': X_angle}

    f = open(out_file, 'wb')
    pickle.dump([data, fs, wlen_sec, hop_percent, trim], f)
    f.close()

    return data
def compute_STFT_data_from_file_list_TIMIT(wavfile_list, fs=16000, wlen_sec=0.032,
                                           hop_percent=0.5, zp_percent=0, trim=False,
                                           verbose=False, out_file=None):
    """
    Same as 'compute_STFT_data_from_file_list' function except that specific
    fields related to TIMIT are added to the returned and saved dictionaries.
    """

    # STFT parameters
    wlen = wlen_sec * fs  # window length of 64 ms
    wlen = np.int(np.power(2, np.ceil(np.log2(wlen))))  # next power of 2
    hop = np.int(hop_percent * wlen)  # hop size
    nfft = wlen + zp_percent * wlen  # number of points of the discrete Fourier transform
    win = np.sin(np.arange(.5, wlen - .5 + 1) / wlen * np.pi)  # sine analysis window

    fs_orig = librosa.load(wavfile_list[0], sr=None)[1]  # Get sampling rate

    data = [None] * len(wavfile_list)  # Create an empty list that will contain dictionaries

    for n, wavfile in enumerate(wavfile_list):
        path, file_name = os.path.split(wavfile)
        path, speaker = os.path.split(path)
        path, dialect = os.path.split(path)
        path, set_type = os.path.split(path)
        if verbose:
            print('processing %s/%s/%s/%s\n' % (set_type, dialect, speaker, file_name))
        if fs == fs_orig:
            x = librosa.load(wavfile, sr=None)[0]  # Load wav file without resampling
        else:
            print('resampling while loading with librosa')
            x = librosa.load(wavfile, sr=fs)[0]  # Load wav file with resampling
        if trim:
            with open(os.path.join(path, set_type, dialect, speaker,
                                   file_name[:-4] + '.PHN'), 'r') as f:
                first_line = f.readline()  # Read the first line
                for last_line in f:  # Loop through the whole file reading it all
                    pass
            if not ('#' in first_line) or not ('#' in last_line):
                raise NameError('The first or last lines of the .phn file should contain #')
            ind_beg = int(first_line.split(' ')[1])
            ind_end = int(last_line.split(' ')[0])
            x = x[ind_beg:ind_end]
        T_orig = len(x)
        x_pad = librosa.util.fix_length(x, T_orig + wlen // 2)  # Padding for perfect reconstruction (see librosa doc)
        X = librosa.stft(x_pad, n_fft=nfft, hop_length=hop, win_length=wlen, window=win)  # STFT
        X_abs_2 = np.abs(X) ** 2  # Power spectrogram
        X_angle = np.angle(X)
        data[n] = {'set': set_type,
                   'dialect': dialect,
                   'speaker': speaker,
                   'file': file_name,
                   'power_spectrogram': X_abs_2,
                   'phase_spectrogram': X_angle}

    f = open(out_file, 'wb')
    pickle.dump([data, fs, wlen_sec, hop_percent, trim], f)
    f.close()

    return data
# In[12]:

# Download the video from YouTube
from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=vPaBI_IQoJk&ab_channel=FuerzaPopular')
stream = yt.streams.get_by_itag('251')
stream.download()

# In[10]:

import librosa

data, fs = librosa.load('pasos/keiko.wav')
print(data)
print("####################################################")
print(fs)

# In[2]:

# Segment the audio
from pydub import AudioSegment

t1 = 3 * 1000  # works in milliseconds
t2 = 6 * 1000
newAudio = AudioSegment.from_wav("pasos/keiko.wav")
newAudio = newAudio[t1:t2]
newAudio.export('pasos/newSong.wav', format="wav")  # exports to a wav file in the current path
def read_audio_spectrum(filename):
    signal, fs = librosa.load(filename)
    S = librosa.stft(signal, N_FFT)
    final = np.log1p(np.abs(S[:, :430]))
    return final, fs
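# A minimal inversion sketch (not from the original file; assumes librosa >= 0.7
# for griffinlim, and the same N_FFT as above): recover audio from the
# log-magnitude spectrum returned above, estimating the discarded phase iteratively.
spec, fs = read_audio_spectrum('input.wav')  # hypothetical file
mag = np.expm1(spec)                         # undo log1p
y_hat = librosa.griffinlim(mag)              # n_fft is inferred from mag.shape[0]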
def load_wav(filename, sample_rate):
    audio, sr = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.flatten()
    return audio
conn = sqlite3.connect(database)
c = conn.cursor()

print("Database will be saved as: {}".format(database))
print("Noisegroup of collected MFCCs: {}".format(noisegroup))
print("Noise wavefile: {}".format(environment_noise))
print("Number of MFCCs to be extracted: {}".format(num_mfcc))

check_variables = input("\nIMPORTANT!!!!\nAre the items listed above correct? (Y or N): ")
if 'y' in check_variables.lower():
    # load environment noise to be added to training data
    if environment_noise:
        try:
            env_noise = librosa.load(environment_noise)[0]
        except FileNotFoundError as fnf:
            print("\nCannot find {} in cwd.\n".format(environment_noise))
            raise fnf
    else:
        env_noise = None

    columns = list((range(0, num_mfcc)))
    column_type = []
    for i in columns:
        column_type.append('"' + str(i) + '" REAL')

    c.execute('''CREATE TABLE IF NOT EXISTS mfcc_40(%s, filename TEXT,
        noisegroup TEXT, noiselevel REAL, dataset INT, speaker INT,
        intensity INT, statement INT, repetition INT, speaker_sex INT,
        label INT)''' % ", ".join(column_type))
    conn.commit()
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 19 08:49:04 2021

@author: CS
"""
# Change the sampling rate: standardize everything to 44.1 kHz
import librosa
import numpy as np
import soundfile as sf
import os

file_path = 'D:/Project/DCASE_test/Data/Data_ShipsEar/'
sr_output = 44100
out_path = 'D:/Project/DCASE_test/Data/test/'
file_list = os.listdir(file_path)
for file in file_list:
    wav_path = file_path + file
    data, sr = librosa.load(wav_path, None)
    data_output = librosa.resample(data.astype(np.float32), sr, sr_output)
    out_name = out_path + file
    sf.write(out_name, data_output, sr_output)
https://colab.research.google.com/drive/1LZ9BX53fhzO6o-zbbmju_Mi6HV1yu7GK
"""

import librosa
import matplotlib.pyplot as plt
import librosa.display

# part 1 - preprocessing the signal
# ---------------------------------------------------------------------------
# An audio clip (music, jazz) is played for 30 secs and recorded.
# 15 lakh plus samples are collected and the sampling frequency is 48000.
# Therefore 48000 samples are collected per second, which means the total
# length of the signal in the time domain is approximately 32 secs. For
# analysis we need only about 50,000 samples, so we need to downsample the
# signal by 1516541/50,000 == 30.33, or approximately 31.
x, sr = librosa.load('/content/jazz-mp3.mp3')
print(sr)
print(len(x))
# The length comes out to be 696663, far greater than 50,000. We see that sr
# is 22050, meaning that 22050 samples are collected per second; this shows
# that the music ran for 696663/22050 secs == 31.6 secs, which is true.
# To reduce the number of samples we have to change the sampling rate.
# To find the new sampling rate:
audio_time = len(x) // sr
new_sr = 50000 // audio_time
print(new_sr)
# new_sr comes out to 1612 with integer division (about 1582.5 with exact division)
# Load the signal again, but this time with the new sr
x, sr = librosa.load('/content/jazz-mp3.mp3', sr=new_sr)
print(len(x))
# the number of samples is now close to 50,000
# plotting the graph
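# A minimal alternative sketch (not from the original notebook): rather than
# reloading the file with a computed sr, resample the already-loaded signal
# directly. Assumes x and sr from the cell above; target_len is the desired
# sample count.
target_len = 50000
target_sr = sr * target_len // len(x)         # rate that yields ~target_len samples
x_small = librosa.resample(x, sr, target_sr)  # positional orig_sr/target_sr, as used elsewhere here
print(len(x_small))                           # close to 50,000, up to rounding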
def load_wav(path):
    return librosa.load(path, sr=hp.sample_rate)[0]
from wavenet.wavenet import Wavenet
from auxilaries import mel_extractor

os.environ['CUDA_VISIBLE_DEVICES'] = ''

with open('../config_jsons/wavenet_mol.json', 'rt') as F:
    te_configs = json.load(F)
te_hparams = Namespace(**te_configs)
teacher_wavenet = Wavenet(te_hparams)

with open('../config_jsons/parallel_wavenet.json', 'rt') as F:
    configs = json.load(F)
hparams = Namespace(**configs)
parallel_wavenet = ParallelWavenet(hparams, teacher=teacher_wavenet)

seq_len = 7680
wav_val, _ = librosa.load('test_data/test.wav', sr=16000)
batch_size = 4
wav_val = wav_val[:batch_size * seq_len].reshape([batch_size, seq_len])
wav_shape = [batch_size, seq_len]
mel_shape = [batch_size, 39, 80]
mel_val = np.zeros(mel_shape)
for i in range(batch_size):
    mel_val[i] = mel_extractor.melspectrogram(wav_val[i])

mel_ph = tf.placeholder(tf.float32, mel_shape, name='mel_ph')
wav_ph = tf.placeholder(tf.float32, wav_shape, name='wav_ph')
inputs = {'mel': mel_ph, 'wav': wav_ph}
tf.set_random_seed(12345)
pff_dict = parallel_wavenet.feed_forward(inputs)
""" import glob import os import librosa import numpy as np import matplotlib.pyplot as plt from matplotlib.pyplot import specgram from librosa.display import waveplot plt.close('all') sound_file_paths = "a039_10_20_forest_path.wav" #"a038_30_40_home.wav" parent_dir = 'small_data/' X, sr = librosa.load(os.path.join(parent_dir, sound_file_paths)) S = librosa.feature.melspectrogram(X, sr=sr, n_mels=128) # Convert to log scale (dB). We'll use the peak power as reference. log_S = librosa.logamplitude(S, ref_power=np.max) # Make a new figure plt.figure() librosa.display.waveplot( np.array(X), sr=22050, ) plt.figure() librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
import tensorflow
import sklearn
from keras.layers import Dense, Conv1D, MaxPool1D, \
    Input, BatchNormalization, Activation
from keras.models import Sequential, Model

# Load data
# y1, sr1 = librosa.load(
#     'c:/nmb/nmb_data/F1_high.wav'
# )  # female speaker
y2, sr2 = librosa.load('c:/nmb/nmb_data/M2_low.wav')  # male speaker

# Helper functions (noise generation, normalization)
def noising(data, noise_factor):
    noise = np.random.randn(len(data))
    augmented_data = data + noise_factor * noise
    augmented_data = augmented_data.astype(type(data[0]))
    return augmented_data

def normalize(data, axis=0):
    return sklearn.preprocessing.minmax_scale(data, axis=axis)

# Original voice, noise synthesis
def gen_chroma_stft(file_path):
    y, sr = librosa.load(get_binary_data_from_gcs(file_path), sr=None)
    return librosa.feature.chroma_stft(y, sr)
import librosa

filename = "C:\\Users\\mwang\\sheet_music_project\\music\\fur_elise\\fur_elise.mp3"
# y, sr = librosa.load(filename)
# filename = librosa.util.example_audio_file()
print(filename)
y, sr = librosa.load(filename)
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
print('Estimated tempo: {:.2f} beats per minute'.format(tempo))
for o, a in myOpts:
    if o == '-i':
        json_file = a

with open(json_file) as file:
    cp = json.load(file)

try:
    x_train = np.load("x_train.npy")
except:
    # Build training set
    audio_files = get_audiofiles(cp["audio_folder"])
    y_train = np.array([])
    x_train = np.array([])
    for file in audio_files:
        audio_samples, sample_rate = librosa.load(file)
        audio_samples = librosa.resample(audio_samples, sample_rate, cp["sample_rate"])
        window_size = int(cp["short_term"] * cp["sample_rate"])
        step_size = int(cp["step_size"] * cp["sample_rate"])
        print("Window_size: ", window_size, "Step_size: ", step_size)
        no_of_samples = int((audio_samples.shape[0] - window_size) / step_size) - 1
        # dt = time between each feature vector (step_size is in samples at the
        # resampled rate, so divide by the target rate)
        dt = step_size / cp["sample_rate"]
        print("Extracting features from ", file, "# samples: ", audio_samples.shape,
              " sr: ", cp["sample_rate"], " dt: ", dt, "# features: ", no_of_samples)
        for i in range(no_of_samples):
            y = audio_samples[(i * step_size):(i * step_size + window_size)]
def loadAudio(audioPath, offset=0.0, duration=None):
    y, sr = librosa.load(audioPath, offset=offset, duration=duration)
    return y, sr
""" Abstract : Librosa is a package which serves to study audio files with the help of this package we'll load a function that will read in the path to an audio file, using two functions call .amplitude and .stft with the help of matplotlib package to mash up frequency and amplitude into a resulting array ploted graph (NOTE : The default sampling rate used by Librosa is 22050, but I used 44100 due to the sample's lenght) """ import librosa import numpy as np import matplotlib.pyplot as plt # Wav file path fichier = (r"C:\Users\Adam\Desktop\audio.wav") y, sr = librosa.load(fichier, sr=44100) # Size of the fft n_fft = 2048 S = librosa.stft(y, n_fft=n_fft, hop_length=n_fft // 2) """ hop_length : The number of samples between successive frames n_fft and hop length determine frequency in function of time resolution converted to dB """ D = librosa.amplitude_to_db(np.abs(S), ref=np.max) # Calculate average over file D_AVG = np.mean(D, axis=1) plt.bar(np.arange(D_AVG.shape[0]), D_AVG) """
def get_mag(url):
    y, _ = librosa.load(url, sr=sample_rate)
    complex_spec = librosa.core.stft(y, n_fft=n_fft, win_length=n_fft, hop_length=hop_length)
    mag_spec = np.abs(complex_spec)
    if use_log:
        mag_spec = db_func(mag_spec)
    return mag_spec
def extract_features(dataset='train'):
    f = open(data_path + dataset + '_list.txt', 'r')
    i = 0
    for file_name in f:
        # load audio file
        file_name = file_name.rstrip('\n')
        file_path = data_path + file_name
        y, sr = librosa.load(file_path, sr=22050)
        S = librosa.core.stft(y, n_fft=1024, hop_length=512, win_length=1024)
        D_harmonic, D_percussive = librosa.decompose.hpss(S)
        # Hmag = librosa.amplitude_to_db(D_harmonic)
        # Pmag = librosa.amplitude_to_db(D_percussive)
        D_H = np.abs(D_harmonic) ** 2
        D_P = np.abs(D_percussive) ** 2
        # mel spectrogram (512 --> 40)
        mel_basis = librosa.filters.mel(sr, 1024, n_mels=40)
        mel_H = np.dot(mel_basis, D_H)
        mel_P = np.dot(mel_basis, D_P)
        # log compression
        log_mel_H = librosa.power_to_db(mel_H)
        log_mel_P = librosa.power_to_db(mel_P)
        # mfcc (DCT)
        mfcc_H = librosa.feature.mfcc(S=log_mel_H, n_mfcc=13)
        mfcc_H_delta = librosa.feature.delta(mfcc_H)
        mfcc_H_delta2 = librosa.feature.delta(mfcc_H, order=2)
        mfcc_H = np.concatenate((mfcc_H, mfcc_H_delta, mfcc_H_delta2), axis=0)
        mfcc_H = mfcc_H.astype(np.float32)
        mfcc_P = librosa.feature.mfcc(S=log_mel_P, n_mfcc=13)
        mfcc_P_delta = librosa.feature.delta(mfcc_P)
        mfcc_P_delta2 = librosa.feature.delta(mfcc_P, order=2)
        mfcc_P = np.concatenate((mfcc_P, mfcc_P_delta, mfcc_P_delta2), axis=0)
        mfcc_P = mfcc_P.astype(np.float32)  # to save the memory (64 to 32 bits)
        file_name = file_name.replace('.wav', '.npy')
        save_file_H = mfcc_path + 'Harmonic/' + file_name
        save_file_P = mfcc_path + 'Percussive/' + file_name
        if not os.path.exists(os.path.dirname(save_file_H)):
            os.makedirs(os.path.dirname(save_file_H))
        if not os.path.exists(os.path.dirname(save_file_P)):
            os.makedirs(os.path.dirname(save_file_P))
        np.save(save_file_H, mfcc_H)
        np.save(save_file_P, mfcc_P)
        rmse = librosa.feature.rmse(S=S)[0]
        save_file_r = rmse_path + file_name
        if not os.path.exists(os.path.dirname(save_file_r)):
            os.makedirs(os.path.dirname(save_file_r))
        np.save(save_file_r, rmse)
        # progress check
        i = i + 1
        if not (i % 10):
            print i
    f.close()
def load_file(input_filename, mono=True, sr=22050):
    # If mono is True, returns samples of shape (n,);
    # otherwise returns samples of shape (2, n) for stereo input.
    # sr is the number of samples per second: sr=None keeps the file's
    # native rate; the librosa default is 22050.
    x, sr = librosa.load(input_filename, mono=mono, sr=sr)
    return x, sr
words, _ = zip(*count_pairs)
words_size = len(words)
print('Vocabulary size:', words_size)

word_num_map = dict(zip(words, range(len(words))))
to_num = lambda word: word_num_map.get(word, len(words))
labels_vector = [list(map(to_num, label)) for label in labels]
# print(wavs_file[0], labels_vector[0])
# wav/train/A11/A11_0.WAV -> [479, 0, 7, 0, 138, 268, 0, 222, 0, 714, 0, 23, 261, 0, 28, 1191, 0, 1, 0, 442, 199, 0, 72, 38, 0, 1, 0, 463, 0, 1184, 0, 269, 7, 0, 479, 0, 70, 0, 816, 254, 0, 675, 1707, 0, 1255, 136, 0, 2020, 91]
# print(words[479])  # 绿
label_max_len = np.max([len(label) for label in labels_vector])
print('Longest sentence (characters):', label_max_len)

wav_max_len = 0  # 673
for wav in wav_files:
    wav, sr = librosa.load(wav, mono=True)
    mfcc = np.transpose(librosa.feature.mfcc(wav, sr), [1, 0])
    if len(mfcc) > wav_max_len:
        wav_max_len = len(mfcc)
print("Longest audio (frames):", wav_max_len)

batch_size = 16
n_batch = len(wav_files) // batch_size

# get one batch
pointer = 0
def get_next_batches(batch_size):
    global pointer
    batches_wavs = []
    batches_labels = []
    for i in range(batch_size):
def getBeats(audiosrc):
    y, sr = librosa.load(audiosrc)
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    ts = librosa.frames_to_time(beats, sr=sr)
    return ts
def make_spectrum(filename=None, y=None, is_slice=False, feature_type='logmag',
                  mode=None, FRAMELENGTH=None, SHIFT=None, _max=None, _min=None):
    '''
    Return:
        Sxx = [F, T] (is_slice==False) or [T//FRAMELENGTH, F, FRAMELENGTH] (is_slice==True)
        phase = [F, T] (is_slice==False) or [T//FRAMELENGTH, F, FRAMELENGTH] (is_slice==True)
        y = y
    '''
    if y is not None:
        y = y
    else:
        y, sr = librosa.load(filename, sr=16000)
        if sr != 16000:
            raise ValueError('Sampling rate is expected to be 16kHz!')
        if y.dtype == 'int16':
            y = np.float32(y / 32767.)
        elif y.dtype != 'float32':
            y = np.float32(y)

    ### Normalize waveform
    # y = y / np.max(abs(y)) / 2.

    D = librosa.stft(y, center=False, n_fft=hp.n_fft, hop_length=hp.hop_length,
                     win_length=hp.n_fft, window=scipy.signal.hamming)
    utt_len = D.shape[-1]
    phase = np.exp(1j * np.angle(D))
    D = np.abs(D)

    ### Feature type
    if feature_type == 'logmag':
        Sxx = np.log1p(D)
    elif feature_type == 'lps':
        Sxx = np.log10(D ** 2)
    else:
        Sxx = D

    if mode == 'mean_std':
        mean = np.mean(Sxx, axis=1).reshape(((hp.n_fft // 2) + 1, 1))
        std = np.std(Sxx, axis=1).reshape(((hp.n_fft // 2) + 1, 1)) + 1e-12
        Sxx = (Sxx - mean) / std
    elif mode == 'minmax':
        Sxx = 2 * (Sxx - _min) / (_max - _min) - 1

    # print("noisy_spec.shape before slice_pad:", Sxx.shape)
    if is_slice:
        Sxx = slice_pad(Sxx, SHIFT, seg_size=FRAMELENGTH, pad_value=0)
        # print("noisy_spec.shape after slice_pad:", Sxx.shape)

    return Sxx, phase, y
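# A minimal inversion sketch (not part of the original file): reconstruct a
# waveform from make_spectrum()'s 'logmag' output and the saved phase term.
# Assumes the same hp.n_fft / hp.hop_length hyperparameters used above;
# 'noisy.wav' is a hypothetical 16 kHz input file.
import numpy as np
import scipy.signal
import librosa

Sxx, phase, y = make_spectrum(filename='noisy.wav', feature_type='logmag')
mag = np.expm1(Sxx)   # undo log1p
D_hat = mag * phase   # reattach the unit-magnitude phase
y_hat = librosa.istft(D_hat, center=False, hop_length=hp.hop_length,
                      win_length=hp.n_fft, window=scipy.signal.hamming)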
# coding: utf-8
from __future__ import unicode_literals

import numpy as np
import librosa
import os

audio_file_name = "./piano/piano_1.mp3"

# Read the audio file
y, sr = librosa.load(audio_file_name)

# Extract MFCCs: roughly 20 * length(sec) * 43 values (about 43 frames per second)
mfcc = librosa.feature.mfcc(y=y, sr=sr, n_fft=2048, n_mfcc=20)

# Check the size
print(mfcc.shape)

# Extract MFCC deltas
mfcc_delta = librosa.feature.delta(mfcc)
print(mfcc_delta.shape)

# Concatenate
mfcc_and_delta = np.concatenate((mfcc, mfcc_delta), axis=0)
print(mfcc_and_delta.shape)

# Transpose the axes
mfcc_and_delta = mfcc_and_delta.T
print(mfcc_and_delta.shape)
# The final shape should be (43 * length, 40)

overall_length = mfcc_and_delta.shape[0]  # 43 * seconds

# Cut into 10-second chunks (10 * 43 = 430)
current_time = 0
window_length = 10 * 43
X = []
def load_sound_files(file_paths):
    raw_sounds = []
    for fp in file_paths:
        X, sr = librosa.load(fp)
        raw_sounds.append(X)
    return raw_sounds
num_of_test_samples = 300
batch_size = 64
no_epochs = 50
input_shape = 64
data_dir = 'data/'
dataset = []

### PREPARING THE DATASET ###
for i in range(10):
    directory = data_dir + str(i) + "/" + str(i)
    for j in range(1, num_of_samples + 1):
        y, sr = librosa.load(directory + " (" + str(j) + ").wav")
        no_samples = len(y)
        # spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=math.floor(no_samples/128.))
        spectrogram = librosa.feature.melspectrogram(
            y=y, sr=sr, n_mels=input_shape,
            hop_length=math.floor(no_samples / input_shape))
        dataset.append((spectrogram, i))

random.shuffle(dataset)
data_train = dataset[:(num_of_samples - num_of_test_samples) * 10]
data_test = dataset[(num_of_samples - num_of_test_samples) * 10:]
X_train, y_train = zip(*data_train)
def read_wav(path, sr, duration=None, mono=True):
    wav, sr = librosa.load(path=path, sr=sr, mono=mono, duration=duration)
    return wav
import os
import sys
import numpy as np
# import matplotlib as mpl
# mpl.use('Agg')
import librosa as l
# from matplotlib import pyplot

# specify path to audio files, directory to contain numpy arrays
audio_path = sys.argv[1]
out_path = sys.argv[2]

for audiofile in os.listdir(audio_path):
    if audiofile.endswith(".wav"):
        # Set 'y' to audio time series, 'sr' to sample rate
        y, sr = l.load(audio_path + audiofile)
        # Initialize the chromagram
        C = l.feature.chromagram(y=y, sr=sr, n_fft=4096, hop_length=64)
        # Save the chromagram to file
        np.save(out_path + "/" + audiofile[:-4], C)
        # The following can be uncommented to save a visual figure of the chromagram
        # # Make a new figure
        # pyplot.figure(figsize=(12, 4))
        # # Display the chromagram: the energy in each chromatic pitch class as a function of time
        # # To make sure that the colors span the full range of chroma values, set vmin and vmax
        # l.display.specshow(C, sr=sr, hop_length=64, x_axis='time', y_axis='chroma', vmin=0, vmax=1)
tb = TensorBoard(log_dir='C:/nmb/nmb_data/graph', histogram_freq=0,
                 write_graph=True, write_images=True)
history = model.fit(x_train, y_train, epochs=300, batch_size=16,
                    validation_split=0.2, callbacks=[es, tb, lr, mc])

# Evaluate, predict
model.load_weights('C:/nmb/nmb_data/h5/model_DNN_mels.h5')
result = model.evaluate(x_test, y_test, batch_size=16)
print("loss : ", result[0])
print("acc : ", result[1])
print("f1_score ", result[2])

pred_pathAudio = 'C:/nmb/nmb_data/pred_voice/'
files = librosa.util.find_files(pred_pathAudio, ext=['wav'])
files = np.asarray(files)
for file in files:
    y, sr = librosa.load(file, sr=22050)
    mels = librosa.feature.melspectrogram(y, sr=sr, hop_length=128, n_fft=512)
    pred_mels = librosa.amplitude_to_db(mels, ref=np.max)
    pred_mels = pred_mels.reshape(1, pred_mels.shape[0], pred_mels.shape[1])
    y_pred = model.predict(pred_mels)
    # print(y_pred)
    y_pred_label = np.argmax(y_pred)
    # print(y_pred_label)
    if y_pred_label == 0:
        print(file, (y_pred[0][0]) * 100, '% probability of being female.')
    else:
        print(file, (y_pred[0][1]) * 100, '% probability of being male.')

end_now = datetime.datetime.now()
time = end_now - start_now
print("time >> ", time)  # time >> 0:00:33.975135
def get_features(audio_paths, track_id=None, param=param_default,
                 source_sr=None, pass_random=True, offset=0.0):
    feature_list = []

    # If input type is already an audio array, just pass it on
    if type(audio_paths) == np.ndarray:
        y = audio_paths
        # Potentially resample:
        if source_sr is not None and param['SAMPLING_RATE'] != source_sr:
            y = librosa.resample(y, orig_sr=source_sr, target_sr=param['SAMPLING_RATE'])
            if param['SAMPLING_RATE'] > source_sr:
                print('Warning: Tried to increase sampling rate.')
    # Otherwise load audio
    else:
        # Set correct path to file
        if type(audio_paths) == str:
            audio_path = audio_paths
        elif type(audio_paths) == dict and track_id is not None:
            audio_path = audio_paths[track_id]
        else:
            raise Exception('Incompatible parameters given to get_features.')
        # Potentially load offset and duration
        duration = None
        if param['single_slice_audio']:
            duration = param['sample_sec']
        # if 'offset' in param.keys():
        #     offset = param['offset']
        # Load track
        try:
            y, _ = librosa.load(audio_path, sr=param['SAMPLING_RATE'],
                                duration=duration, offset=offset)
        except:
            # TODO: This is another quick hack and not ideal for many reasons
            # (e.g. fixed feature size)...
            if pass_random:
                print(f'Cannot load audio file {audio_path}. Passing random features instead')
                return np.random.random((128, 126))
            else:
                print(f'Cannot load audio file {audio_path}.')
                raise

    # Calculate spectrum
    y_stft_full = get_spectrum(y, param['N_FFT'], param['HOP_LENGTH'])

    # If HPSS, split spectrum here and perform feature extraction on both parts
    if 'USE_HPSS' in param.keys() and param['USE_HPSS']:
        y_h, y_p = librosa.decompose.hpss(y_stft_full)
        y_list = [y_h, y_p]
    else:
        y_list = [y_stft_full]

    for y_stft in y_list:
        # Get melspectrogram of track, as well as deltas
        melspec = get_melspec(y_stft, n_mels=param['MELSPEC_BANDS'])
        if param['USE_DELTA']:
            delta = librosa.feature.delta(melspec)
        if param['USE_DELTADELTA']:
            delta_delta = librosa.feature.delta(melspec, order=2)

        # Get MFCC
        if param['USE_MFCC'] or param['USE_MFCC_DELTA'] or param['USE_MFCC_DELTADELTA']:
            mfcc = get_mfcc(melspec, n_mfcc=param['N_MFCC'])
            if param['USE_MFCC_DELTA']:
                mfcc_delta = librosa.feature.delta(mfcc)
            if param['USE_MFCC_DELTADELTA']:
                mfcc_delta_delta = librosa.feature.delta(mfcc, order=2)

        # Get Fluctogram
        if param['USE_FLUCT']:
            fluct, spec_contrac, spec_flat = get_fluctogram(
                y_stft, sr=param['SAMPLING_RATE'], n_fft=param['N_FFT'])
            if 'MASK_FLUCT' in param.keys() and param['MASK_FLUCT']:
                fluct = mask_fluctogram(fluct, spec_contrac, spec_flat, param)

        # Concatenate
        if param['USE_SPEC']:
            # Convert melspec from -80 to 0 dB to range [0, 1]
            spec = (melspec + 80.0) / 80.0
            feature_list.append(spec)
        if param['USE_DELTA']:
            feature_list.append(delta)
        if param['USE_DELTADELTA']:
            feature_list.append(delta_delta)
        if param['USE_MFCC']:
            feature_list.append(mfcc)
        if param['USE_MFCC_DELTA']:
            feature_list.append(mfcc_delta)
        if param['USE_MFCC_DELTADELTA']:
            feature_list.append(mfcc_delta_delta)
        if param['USE_FLUCT']:
            feature_list.append(fluct)
        if 'USE_SC' in param.keys() and param['USE_SC']:
            feature_list.append(spec_contrac)
        if 'USE_SF' in param.keys() and param['USE_SF']:
            feature_list.append(spec_flat)

    features = np.concatenate(feature_list)
    return features
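# A hedged usage sketch (param keys inferred from get_features() above;
# param_default and the get_* helpers are defined elsewhere in this project;
# 'some_track.mp3' is a hypothetical file):
param = dict(param_default)  # copy the project's defaults
param.update({'SAMPLING_RATE': 22050, 'single_slice_audio': True, 'sample_sec': 10.0})
feats = get_features('some_track.mp3', param=param, pass_random=False)
print(feats.shape)  # stacked feature rows x frames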