def load_audio_samples(audio_cache, sample_sources):
    # Create samples from audio_cache
    _AudioSamples = []
    for audio_source in sample_sources:
        filename = audio_source['filename']
        if filename not in audio_cache.keys():
            print("Not cached audio file: {}. Skipping...".format(filename))
            continue
        audio_data, audio_rate = audio_cache[filename]
        time_start = float(audio_source['start'])
        time_end = float(audio_source['end'])
        label = audio_source['label']
        start_sample = librosa.time_to_samples(time_start, sr=audio_rate)
        end_sample = librosa.time_to_samples(time_end, sr=audio_rate)
        # todo trim noise
        try:
            cutout_data = audio_data[start_sample:end_sample]
        except IndexError:
            print("Error in getting part of audio file: Out of Range error")
            continue
        sample = create_sample(label, audio_rate, cutout_data)
        _AudioSamples.append(sample)
    return _AudioSamples
def build_song_segments(target_song, segments_df, songs_df):
    # segments_df = all of the Taylor Swift scenes, songs_df = a list of all of Taylor's songs in the db
    matches = []
    wave_segments = []
    target_remaining_samples = target_song
    while len(target_remaining_samples) > 10000:
        # TODO: only necessary to calculate onsets up to the max length of a segment,
        # not the entire remaining song -- fix if too slow
        target_onset_envelope = librosa.onset.onset_strength(target_remaining_samples)
        # dot product - correlate segments, penalize by sqrt of length,
        # which makes longer segments better
        onset_silly_match = segments_df.onset_envelope.map(
            silly_segment_matcher(target_onset_envelope))
        best_match_index = onset_silly_match.idxmax()
        best_match = segments_df.loc[best_match_index]
        matching_song = songs_df[songs_df['name'] == best_match['name']]
        wave = matching_song.song_wave.values[0]
        start_times = matching_song.scene_start_times_sec.values[0]
        start_time = start_times[best_match.index_in_song]
        # use the end of the song if index_in_song is the last one
        end_time = start_times[best_match.index_in_song + 1] \
            if best_match.index_in_song + 1 < len(start_times) \
            else librosa.samples_to_time(len(wave))
        start_sample = librosa.time_to_samples(start_time)
        end_sample = librosa.time_to_samples(end_time)
        matches.append((best_match['name'], start_time, end_time))
        consumed_samples = end_sample - start_sample
        target_remaining_samples = target_remaining_samples[consumed_samples:]
        # don't reuse segments
        segments_df = segments_df.drop(best_match_index)
        wave_segments.append(wave[start_sample:end_sample])
    return matches, np.concatenate(wave_segments)
def play(self, dur):
    times = sm.waves.tspan(dur)
    waves = []
    for start, wdur, lf, hf, amp, amp_freq, amp_phase in zip(
            self.starts, self.durs, self.low_freqs, self.high_freqs,
            self.amps, self.amp_freqs, self.amp_phases):
        ss = lr.time_to_samples(start, sm.sound.SAMPLE_RATE)
        es = lr.time_to_samples(start + wdur, sm.sound.SAMPLE_RATE)
        es = min(es, self.so.samples.size - 1)
        samps = self.so.samples[ss:es]
        if samps.size <= 0:
            continue
        if hf < lf:
            hf, lf = lf, hf
        if es - ss > 0.1 * sm.sound.SAMPLE_RATE:
            samps = sm.effects.band_pass(SoundObject(samps), lf, hf).samples
        peak = np.max(np.abs(samps))
        if peak > 0.0:
            samps /= peak
        samp_dur = lr.samples_to_time(samps.size, sm.sound.SAMPLE_RATE)
        wave = np.interp(times,
                         np.linspace(0, samp_dur, samps.size),
                         samps,
                         period=samp_dur) * amp
        mod = sm.waves.sin(times, amp_freq, 1.0, amp_phase)
        wave *= mod
        waves.append(SoundObject(wave))
    return sm.sound.join(waves)
def generate_features(albums_dict, album_label_dict):
    features = []
    counter = 0
    for album in albums_dict:
        album_title = path_to_album(album)
        for song in albums_dict[album]:
            counter += 1
            song_path = os.path.join(album, song["filename"])
            song_title = filename_to_title(song["filename"])
            print(str(counter) + "th song: " + song_title)
            data, sr = librosa.load(song_path)
            if album_title in album_label_dict:
                if song_title in album_label_dict[album_title]:
                    for intervals in album_label_dict[album_title][song_title]:
                        start, end, chord = intervals[0], intervals[1], intervals[2]
                        if end > start:
                            start_index = librosa.time_to_samples(start)
                            end_index = librosa.time_to_samples(end)
                            audio_slice = data[int(start_index):int(end_index)]
                            if len(audio_slice) == 0:
                                continue
                            # sr is the sample rate returned by librosa.load above
                            mfccs = librosa.feature.mfcc(y=audio_slice, sr=sr, n_mfcc=40)
                            mfccs_processed = np.mean(mfccs.T, axis=0)
                            features.append([mfccs_processed, chord])
    return features
def parse_grid(grid_path):
    '''
    note : parse a TextGrid and then split the wav that matches the TextGrid
    arg  : grid_path : path of the TextGrid to parse
    '''
    dir_path = os.path.dirname(grid_path)
    grid_filename = os.path.basename(grid_path)
    name = os.path.splitext(grid_filename)[0]
    wav_path = os.path.join(dir_path, name + ".wav")
    target_tier = "comma"

    # get the objects (textgrid tier & wav) that match the grid path
    tg_obj = tgt.read_textgrid(grid_path)
    tier_obj = tg_obj.get_tier_by_name(target_tier)
    wav_obj, sr = librosa.load(wav_path, sr=None)

    for idx in range(len(tier_obj)):
        part = tier_obj[idx]
        time_s = librosa.time_to_samples(part.start_time, sr)
        time_e = librosa.time_to_samples(part.end_time, sr)
        librosa.output.write_wav('{}_{}.wav'.format(name, idx),
                                 wav_obj[time_s:time_e], sr)
        with open("{}_{}.txt".format(name, idx), "w") as f:
            f.write(part.text)
def chromaplot(y, rate, start_t=0, stop_t=None, play=True, harmonic_input=False):
    start = librosa.time_to_samples(start_t)
    stop = None
    if stop_t is not None:
        stop = librosa.time_to_samples(stop_t)
    if harmonic_input is False:
        h, p = librosa.effects.hpss(y[start:stop])
    else:
        h = y[start:stop]
    C = librosa.feature.chroma_cqt(y=h, sr=rate)
    plt.figure(figsize=(12, 4))
    librosa.display.specshow(C, sr=rate, x_axis='time', y_axis='chroma', vmin=0, vmax=1)
    plt.title('Chromagram')
    plt.colorbar()
    plt.tight_layout()
    plt.show()
    if play:
        return play_button(y, rate, start_t, stop_t)
def synthesize(inputs):
    """
    Generate new Audio objects for output or further remixing.

    Parameters
    ----------
    inputs : generator, list, or tuple
        See _format_inputs for details on parsing inputs.

    Returns
    -------
    An Audio object
    """
    # First we organize our inputs.
    inputs = _format_inputs(inputs)
    max_time = 0.0
    sample_rate = 44100
    array_length = 20 * 60  # 20 minutes!
    array_shape = (2, sample_rate * array_length)
    sparse_array = lil_matrix(array_shape)
    initial_offset = 0

    for i, (time_slice, start_time) in enumerate(inputs):
        # if we have a mono file, we return stereo here.
        resampled_audio, left_offset, right_offset = time_slice.get_samples()

        # set the initial offset, so we don't miss the start of the array
        if i == 0:
            initial_offset = max(left_offset * -1, right_offset * -1)

        # get the target start and duration
        start_time = start_time.delta * 1e-9
        duration = time_slice.duration.delta * 1e-9

        # find the max time
        if start_time + duration > max_time:
            max_time = start_time + duration

        # error if we'd go too far
        if start_time + duration > array_length:
            raise SynthesizeError("Amen can only synthesize up to 20 minutes of audio.")

        # get the target start and end samples
        starting_sample, _ = librosa.time_to_samples([start_time, start_time + duration],
                                                     sr=time_slice.audio.sample_rate)

        # figure out the actual starting and ending samples for each channel
        left_start = starting_sample + left_offset + initial_offset
        right_start = starting_sample + right_offset + initial_offset

        # add the data from each channel to the array
        sparse_array[0, left_start:left_start + len(resampled_audio[0])] += resampled_audio[0]
        sparse_array[1, right_start:right_start + len(resampled_audio[1])] += resampled_audio[1]

    max_samples = librosa.time_to_samples([max_time], sr=sample_rate)
    truncated_array = sparse_array[:, 0:max_samples].toarray()

    return Audio(raw_samples=truncated_array, sample_rate=sample_rate)
def pipeline(path, frame_ms=64, hop_ms=64):
    sig, rate = speech.read_soundfile(path)
    fsize = librosa.time_to_samples(float(frame_ms) / 1000, rate)[0]
    hop = librosa.time_to_samples(float(hop_ms) / 1000, rate)[0]
    frames = librosa.util.frame(sig, fsize, hop)
    rms = np.apply_along_axis(speech.rms, 0, frames)
    H, p = spectral_entropy(frames, rate, fsize)
    return sig, rate, frames, fsize, rms, H, p
def maptask_to_tacotron(output_path, timed_units_path, mono_path, pause_time=1, sr=20000):
    '''
    Extract the text and audio parts of the utterances separated by $pause_time
    from $person in the MapTask dataset. The audio snippets are cut out and
    stored as wav files named like q1ec1-0001, q1ec1-0002, ...
    Each line in the produced txt file has the form:

        name_of_audio|utterance (string)
        name_of_audio|utterance (string)
    '''
    mono_file_names = os.listdir(mono_path)
    mono_file_names.sort()  # nice to process in order when debugging

    file_txt = join(output_path, 'maptask')
    file_f = open(file_txt + '.f.txt', "w")
    file_g = open(file_txt + '.g.txt', "w")

    wavs_path = join(output_path, 'wavs')
    if not exists(wavs_path):
        pathlib.Path(wavs_path).mkdir(parents=True, exist_ok=True)

    # Iterate through all (mono) audio files and chop the audio into utterances
    for mono_wav in tqdm(mono_file_names):
        if '.wav' in mono_wav:  # failsafe
            # mono_wav: q1ec1.f.wav, q1ec1.g.wav, ...
            fpath = join(mono_path, mono_wav)  # full path to file

            # Load audio file
            sr, y = read(fpath)

            # get time and words from timed-units
            tu_data = get_time_filename_utterence(mono_wav, timed_units_path)

            for d in tu_data:
                start, end = d['time']  # time
                start = librosa.time_to_samples(start, sr=sr)
                end = librosa.time_to_samples(end, sr=sr)
                y_tmp = y[start:end]

                # write chopped audio to disk
                tmp_path = join(wavs_path, 'wavs', d['name'] + '.wav')
                write(filename=tmp_path, rate=sr, data=y_tmp)

                # write corresponding row in txt
                s = d['name'] + '|' + d['words'] + '\n'
                if '.f.' in mono_wav:
                    file_f.write(s)
                else:
                    file_g.write(s)

    file_f.close()
    file_g.close()
def silence_fillers(file_path, file_name, start_time, end_time, sil_start_time, sil_end_time):
    org_track = AudioSegment.from_file(os.path.join(file_path, file_name))
    org_track = org_track.set_frame_rate(22050)
    sample_rate = org_track.frame_rate
    sample_width = org_track.frame_width
    samples = np.array(org_track.get_array_of_samples())

    for idx in range(len(start_time)):
        pydub_start_t = start_time[idx] * 1000  # pydub works in ms
        pydub_end_t = end_time[idx] * 1000
        start_sample_id = librosa.time_to_samples(start_time[idx], sr=sample_rate)[0]
        end_sample_id = librosa.time_to_samples(end_time[idx], sr=sample_rate)[0]
        temp_track = AudioSegment.silent(duration=(pydub_end_t - pydub_start_t),
                                         frame_rate=sample_rate)
        temp_array = np.array(temp_track.get_array_of_samples())
        samples[start_sample_id:start_sample_id + temp_array.shape[0]] = temp_array
        # do a median filtering on samples
        samples[start_sample_id - 7:start_sample_id + temp_array.shape[0] + 7] = scipy.signal.medfilt(
            samples[start_sample_id - 7:start_sample_id + temp_array.shape[0] + 7], kernel_size=3)

    # to deal with the noise in the silences
    for idx in range(len(sil_start_time)):
        pydub_start_t = float(sil_start_time[idx]) * 1000  # pydub works in ms
        pydub_end_t = float(sil_end_time[idx]) * 1000
        start_sample_id = librosa.time_to_samples(float(sil_start_time[idx]), sr=sample_rate)[0]
        end_sample_id = librosa.time_to_samples(float(sil_end_time[idx]), sr=sample_rate)[0]
        temp_track = AudioSegment.silent(duration=(pydub_end_t - pydub_start_t),
                                         frame_rate=sample_rate)
        temp_array = np.array(temp_track.get_array_of_samples())
        samples[start_sample_id:start_sample_id + temp_array.shape[0]] = temp_array
        # do a median filtering on samples
        samples[start_sample_id - 7:start_sample_id + temp_array.shape[0] + 7] = scipy.signal.medfilt(
            samples[start_sample_id - 7:start_sample_id + temp_array.shape[0] + 7], kernel_size=3)

    sf.write(os.path.join(file_path, 'new_' + file_name), samples.astype('int16'), sample_rate)
def get_label_data(annotation, audio, label, sr):
    start_samples_indxs = librosa.time_to_samples(
        annotation[annotation.label == label].start.values, sr)
    finish_samples_indxs = librosa.time_to_samples(
        annotation[annotation.label == label].finish.values, sr)
    data = []
    for s, f in zip(start_samples_indxs, finish_samples_indxs):
        data.append(audio[np.arange(s, f)])
    return data
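# Usage sketch (an assumption, not from the original source): `annotation` is taken to be
# a pandas DataFrame with 'label', 'start' and 'finish' columns holding times in seconds,
# and sr is passed positionally to time_to_samples, as in older librosa versions.
import numpy as np
import pandas as pd

sr = 22050
audio = np.random.randn(10 * sr)
annotation = pd.DataFrame({'label': ['speech', 'noise', 'speech'],
                           'start': [0.5, 3.0, 6.2],
                           'finish': [2.0, 4.5, 8.0]})
speech_clips = get_label_data(annotation, audio, 'speech', sr)  # two arrays of samples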
def play_button(y, rate, start_t=0, stop_t=None):
    '''Insert a play button that clips the audio between start and stop times.
    By default, play the entire audio file.'''
    start = librosa.time_to_samples(start_t)
    stop = None
    if stop_t is not None:
        stop = librosa.time_to_samples(stop_t)
    return IPython.display.display(
        IPython.display.Audio(data=y[start:stop], rate=rate))
def sound_data(self):
    (sound_data, sr) = self.recording.data
    if self.start_t is None:
        self.start_t = 0
    if self.end_t is None:
        self.end_t = librosa.samples_to_time(sound_data.size, sr)
    start_i = librosa.time_to_samples(self.start_t, sr)
    end_i = librosa.time_to_samples(self.end_t, sr)
    return sound_data[start_i:end_i]
def loss(drone: Drone, so: SoundObject):
    out = 0
    for _ in range(TRIALS):
        drone_wave = drone.play(SAMPLE_DURATION)
        st = random.uniform(0, so.duration - SAMPLE_DURATION)
        et = st + SAMPLE_DURATION
        ss = lr.time_to_samples(st, sm.sound.SAMPLE_RATE)
        es = lr.time_to_samples(et, sm.sound.SAMPLE_RATE)
        stft1 = lr.stft(so.samples[ss:es])
        stft2 = lr.stft(drone_wave.samples)
        out += np.mean(np.abs(stft1 - stft2))
    return out / TRIALS
def get_unlabel_data(annotation, audio, labels, sr):
    start_samples_indxs = librosa.time_to_samples(
        annotation[annotation.label.isin(labels)].start.values, sr)
    finish_samples_indxs = librosa.time_to_samples(
        annotation[annotation.label.isin(labels)].finish.values, sr)
    # unlabelled data lies between labelled segments: from the start of the audio to
    # the first labelled onset, between segments, and from the last labelled offset
    # to the end of the audio
    finish_samples_indxs = np.hstack([[0], finish_samples_indxs])
    start_samples_indxs = np.hstack([start_samples_indxs, [len(audio) - 1]])
    data = []
    for s, f in zip(finish_samples_indxs, start_samples_indxs):
        data.append(audio[np.arange(s, f)])
    return data
def process_recording(self, recording):
    import librosa
    import soundfile

    original_file = self.project.get_recording_path(
        recording["recording_filename"], self.input_profile
    )

    destination_file = os.path.join(
        self.output_directory(),
        os.path.splitext(recording["recording_filename"])[0] + ".wav",
    )

    os.makedirs(name=os.path.dirname(destination_file), exist_ok=True)

    vettoed_segments = self.segments[
        self.segments["recording_filename"] == recording["recording_filename"]
    ]

    signal, sr = librosa.load(original_file, sr=None, mono=False)

    onsets = librosa.time_to_samples(
        times=vettoed_segments["segment_onset"].values / 1000, sr=sr
    )
    offsets = librosa.time_to_samples(
        times=vettoed_segments["segment_offset"].values / 1000, sr=sr
    )

    if signal.ndim == 1:
        for i in range(len(onsets)):
            signal[onsets[i]:offsets[i]] = 0
        soundfile.write(destination_file, signal, samplerate=sr)
    else:
        for i in range(len(onsets)):
            signal[:, onsets[i]:offsets[i]] = 0
        soundfile.write(destination_file, np.transpose(signal), samplerate=sr)

    return pd.DataFrame(
        [
            {
                "original_filename": recording["recording_filename"],
                "converted_filename": os.path.splitext(
                    recording["recording_filename"]
                )[0] + ".wav",
                "success": True,
            }
        ]
    )
def get_chords_in_interval(self, audio, chord_intervals, interval):
    start_index = librosa.time_to_samples(interval[0])
    end_index = librosa.time_to_samples(interval[1])
    audio_slice = audio[int(start_index):int(end_index)]
    ref_start, ref_end = interval[0], interval[1]
    chords = []
    curr_interval = chord_intervals[0]
    index = 0
    while curr_interval[0] < ref_end and index < len(chord_intervals):
        curr_interval = chord_intervals[index]
        if curr_interval[1] > ref_start:
            chords.append(curr_interval[2])
        index += 1
    return audio_slice, chords
def _trim(raw_audio, sr=read_audio.DEFAULT_SR):
    '''
    Finds the first onset of the sound, and returns a good start time and end time
    that isolate the sound.
    :param raw_audio: np array of audio data, from librosa.load
    :param sr: sample rate
    :return: dict with 'start' and 'end', in seconds
    '''
    start = 0.0
    end = None

    # Add an empty second so that the beginning onset is recognized
    silence_to_add = 1.0
    raw_audio = np.append(np.zeros(int(silence_to_add * sr)), raw_audio)

    # Spectral flux
    hop_length = int(librosa.time_to_samples(1. / 200, sr=sr))
    onsets = librosa.onset.onset_detect(y=raw_audio, sr=sr,
                                        hop_length=hop_length, units='time')

    if len(onsets) == 0:
        return {'start': start, 'end': end}
    elif len(onsets) > 1:
        # If there are multiple onsets, cut it off just before the second one
        end = onsets[1] - (silence_to_add + 0.01)
    start = max(onsets[0] - (silence_to_add + 0.01), 0.0)
    return {'start': start, 'end': end}
def default_onset(y, fs, window_length=51, polyorder=3):
    # These parameters are taken directly from the paper
    n_fft = 1024
    hop_length = int(librosa.time_to_samples(1. / 200, sr=fs))
    n_mels = 138
    fmin = 27.5
    fmax = 16000.

    # The paper uses a log-frequency representation,
    # but for simplicity, we'll use a Mel spectrogram instead.
    S = librosa.feature.melspectrogram(y, sr=fs, n_fft=n_fft,
                                       hop_length=hop_length,
                                       fmin=fmin, fmax=fmax,
                                       n_mels=n_mels)

    # compute the onset strength envelope using the librosa defaults,
    # then smooth it
    env_default = librosa.onset.onset_strength(y=y, sr=fs, hop_length=hop_length)
    env_default = smoothing.smooth(env_default, window_length,
                                   polyorder)  # window size 51, polynomial order 3

    # detect onset events from the smoothed envelope
    onset_def = librosa.onset.onset_detect(onset_envelope=env_default, sr=fs,
                                           hop_length=hop_length, units='time')
    return onset_def
def makeAudio(events, iteration, stimdir, spatial_flag=False):
    eventsinsamples = librosa.time_to_samples(events, sr=int(sr_audio))

    # audio buffers for spatial and mono audio
    audiobuffer_L = np.zeros(max(eventsinsamples) + largestsampnum)
    audiobuffer_R = np.zeros(max(eventsinsamples) + largestsampnum)
    y_mono = y

    for startpos in eventsinsamples:
        random_deg = np.random.randint(N)
        y_l = samples[random_deg][0]
        y_r = samples[random_deg][1]
        if spatial_flag:
            audiobuffer_L[startpos:(startpos + len(y_l))] = audiobuffer_L[startpos:(startpos + len(y_l))] + y_l
            audiobuffer_R[startpos:(startpos + len(y_r))] = audiobuffer_R[startpos:(startpos + len(y_r))] + y_r
        else:
            audiobuffer_L[startpos:(startpos + len(y_mono))] = audiobuffer_L[startpos:(startpos + len(y_mono))] + y_mono
            audiobuffer_R[startpos:(startpos + len(y_mono))] = audiobuffer_R[startpos:(startpos + len(y_mono))] + y_mono

    audio_l = 0.8 * audiobuffer_L / max(audiobuffer_L)
    audio_r = 0.8 * audiobuffer_R / max(audiobuffer_R)
    audio = np.array([audio_l, audio_r])

    audiofi = os.path.join(stimdir,
                           config.dist_type + '_' + config.strs['quantize'] + '-' + str(config.qsteps) + '_' +
                           config.strs['binaural'][0] + '_' + str(config.N) + '_' + iteration + '.wav')
    sf.write(audiofi, audio.T, samplerate=int(sr_audio))
    print('creating', audiofi)
    return audio
def makeAudio(events, iteration, stimdir, spatial_flag=False):
    eventsinsamples = librosa.time_to_samples(events, sr=sr_audio)

    # audio buffers for spatial and mono audio
    audiobuffer_L = np.zeros(max(eventsinsamples) + largestsampnum)
    audiobuffer_R = np.zeros(max(eventsinsamples) + largestsampnum)
    y_mono = y

    for startpos in eventsinsamples:
        random_deg = np.random.randint(100)
        y_l = samples[random_deg][0]
        y_r = samples[random_deg][1]
        if spatial_flag:
            audiobuffer_L[startpos:(startpos + len(y_l))] = audiobuffer_L[startpos:(startpos + len(y_l))] + y_l
            audiobuffer_R[startpos:(startpos + len(y_r))] = audiobuffer_R[startpos:(startpos + len(y_r))] + y_r
        else:
            audiobuffer_L[startpos:(startpos + len(y_mono))] = audiobuffer_L[startpos:(startpos + len(y_mono))] + y_mono
            audiobuffer_R[startpos:(startpos + len(y_mono))] = audiobuffer_R[startpos:(startpos + len(y_mono))] + y_mono

    #audio_l = np.sum(audiobuffer_L, axis=0)
    #audio_r = np.sum(audiobuffer_R, axis=0)
    audio_l = 0.8 * audiobuffer_L / max(audiobuffer_L)
    audio_r = 0.8 * audiobuffer_R / max(audiobuffer_R)
    audio = np.array([audio_l, audio_r])

    audiofi = os.path.join(stimdir,
                           dist_type[0] + '_' + binaural_str[0] + '_' + str(N) + '_' +
                           str(np.round(iteration, 2)) + '.wav')
    sf.write(audiofi, audio.T, samplerate=sr_audio)
    print('creating', audiofi)
    return audio
def split_audio(audio, beats, sr):
    beats_sample = librosa.time_to_samples(beats, sr=sr)
    audio_split = [
        audio[beats_sample[i]:beats_sample[i + 1]]
        for i in range(len(beats_sample) - 1)
    ]
    return audio_split
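# Usage sketch (assumed workflow, not from the original source): beat-track a signal,
# convert beat frames to times, then slice the waveform on those beat boundaries.
# 'some_track.wav' is a hypothetical input file.
import librosa

y, sr = librosa.load('some_track.wav')
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
per_beat_audio = split_audio(y, beat_times, sr)  # list of per-beat sample arrays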
def run_algorithm(audio_file, n_templates=[0, 0, 0], output_savename="extracted_loop"):
    """Complete pipeline of the algorithm.

    Parameters
    ----------
    audio_file : string
        Path to audio file to be loaded and analysed.
    n_templates : list of length 3
        The number of sound, rhythm and loop templates. The default value
        (0,0,0) causes the script to estimate reasonable values.
    output_savename : string
        Base string for saved output filenames.

    Returns
    -------
    A set of files containing the extracted loops.

    Examples
    --------
    >>> run_algorithm("example_song.mp3", [40,20,7], "extracted_loop")

    See also
    --------
    tensorly.decomposition.non_negative_tucker
    """
    assert os.path.exists(audio_file)
    assert len(n_templates) == 3
    assert type(n_templates) is list
    # Load mono audio:
    signal_mono, fs = librosa.load(audio_file, sr=None, mono=True)
    # Use madmom to estimate the downbeat times:
    downbeat_times = get_downbeats(signal_mono)
    # Convert times to samples so we can segment the signal:
    downbeat_frames = librosa.time_to_samples(downbeat_times, sr=fs)
    # Create spectral cube out of signal:
    spectral_cube = make_spectral_cube(signal_mono, downbeat_frames)
    # Validate the input n_templates (inventing new ones if any is wrong):
    n_sounds, n_rhythms, n_loops = validate_template_sizes(spectral_cube, n_templates)
    # Use TensorLy to do the non-negative Tucker decomposition:
    core, factors = tld.non_negative_tucker(np.abs(spectral_cube),
                                            [n_sounds, n_rhythms, n_loops],
                                            n_iter_max=500, verbose=True)
    # Reconstruct each loop:
    for ith_loop in range(n_loops):
        # Multiply templates together to get the loop spectrum:
        loop_spectrum = create_loop_spectrum(factors[0], factors[1],
                                             core[:, :, ith_loop])
        # Choose best bar to reconstruct from (we will use its phase):
        bar_ind = choose_bar_to_reconstruct(factors[2], ith_loop)
        # Reconstruct loop signal by masking original spectrum:
        ith_loop_signal = get_loop_signal(loop_spectrum, spectral_cube[:, :, bar_ind])
        # Write signal to disk:
        librosa.output.write_wav("{0}_{1}.wav".format(output_savename, ith_loop),
                                 ith_loop_signal, fs)
def get_start_sample(self) -> int:
    if Beat.INDEX_VALUE == 'samples':
        return self.index
    elif Beat.INDEX_VALUE == 'time':
        return librosa.time_to_samples(self.index, sr=util.SAMPLE_RATE)
    else:
        raise NotImplementedError("Only samples and time are supported")
def test_tempo():
    def __test(tempo, sr, hop_length, ac_size, aggregate, y):
        tempo_est = librosa.beat.tempo(y=y, sr=sr, hop_length=hop_length,
                                       ac_size=ac_size, aggregate=aggregate)

        # Being within 5% for the stable frames is close enough
        if aggregate is None:
            win_size = int(ac_size * sr // hop_length)
            assert np.all(np.abs(tempo_est[win_size:-win_size] - tempo) <= 0.05 * tempo)
        else:
            assert np.abs(tempo_est - tempo) <= 0.05 * tempo, (tempo, tempo_est)

    for sr in [22050, 44100]:
        for tempo in [40, 60, 80, 110, 160]:
            # Make a pulse train at the target tempo
            y = np.zeros(20 * sr)
            delay = np.asscalar(librosa.time_to_samples(60. / tempo, sr=sr))
            y[::delay] = 1

            for hop_length in [512, 1024]:
                for ac_size in [4, 8]:
                    for aggregate in [None, np.mean]:
                        yield __test, tempo, sr, hop_length, ac_size, aggregate, y
def main(args):
    outname = args.outdir
    y, sr = librosa.load(args.wavpath, sr=None)

    adjusted_onset_jam = jams.load(args.onsetjams)
    ann = adjusted_onset_jam.search(namespace='onset')[0]
    adjusted_onset_times = ann.to_event_values()[0]
    adj_on_samps = librosa.time_to_samples(adjusted_onset_times, sr=sr)

    y_chopt = chop_sig(y, adj_on_samps)

    print("about to clear csv files")
    with open(outname + '_pt.csv', 'w') as csvfile:
        pass
    with open(outname + '_onoff.csv', 'w') as csvfile:
        pass

    k = 0
    for seg, seg_start_time in zip(y_chopt, adjusted_onset_times):
        if k % 20 == 0:
            print(k, len(y_chopt))
        k += 1
        offset_time, pitch_track, t_step = segment_offset(seg, sr, seg_start_time)
        with open(outname + '_pt.csv', 'a') as pt:
            writer = csv.writer(pt, delimiter=',')
            for i, f in enumerate(pitch_track):
                if f > 0:
                    writer.writerow([seg_start_time + i * float(t_step), f])
        with open(outname + '_onoff.csv', 'a') as onoff:
            writer = csv.writer(onoff, delimiter=',')
            writer.writerow([seg_start_time, offset_time])
    return 0
def find_bounds_of_chord(song_analysis, chord_regex, rate):
    #bounds_chord = [librosa.time_to_samples(t, song[1]) for t in timestamps_chord]
    pairs = list(zip(song_analysis[:-1], song_analysis[1:]))
    time_bounds = [(float(chord['timestamp']), float(next_chord['timestamp']))
                   for chord, next_chord in pairs
                   if re.match(chord_regex, chord['label'])]
    bounds = [librosa.time_to_samples(t, rate) for t in time_bounds]
    return bounds
def __init__(self, dataset, sr=22050, frameSize=2048, hopSize=512, transform=None, cacheSize=4):
    self.dataset = dataset
    self.sr = sr
    self.frameSize = frameSize
    self.hopSize = hopSize
    self.transform = transform
    self.cacheSize = cacheSize
    self.frameDt = float(frameSize) / sr

    # count frames in dataset
    nFramesList = []
    for pathPair in dataset.pathPairs:
        wavPath = pathPair.wav
        duration = librosa.get_duration(filename=wavPath)
        nSamples = librosa.time_to_samples(duration, sr=self.sr)
        nFrames = 1 + int((nSamples - self.frameSize) / float(self.hopSize))
        nFramesList.append(nFrames)

        # check validity of the frame count
        sStart = librosa.frames_to_samples(nFrames - 1, hop_length=self.hopSize)
        sEnd = sStart + self.frameSize
        assert (nSamples > 0) and (sEnd <= nSamples), f'{nFrames}:{sStart}_{sEnd}, {nSamples}'

    self.frameCumsum = np.cumsum(nFramesList)

    # FIFO cache
    self._sampleCache = deque(maxlen=cacheSize)
    self._sampleIdxCache = deque(maxlen=cacheSize)
def _slice_audio_by_interval(y: np.ndarray, sr: float, hop_length: int = 512,
                             segmentation_interval_s: float = 1.0,
                             **_kwargs) -> Tuple[np.ndarray, np.ndarray]:
    interval_samples: int = librosa.time_to_samples(segmentation_interval_s, sr=sr)
    total_samples: int = y.size  # y is monophonic
    num_segments: int = np.ceil(total_samples / interval_samples)
    onset_samples: np.ndarray = interval_samples * np.arange(num_segments)
    onset_frames: np.ndarray = librosa.samples_to_frames(onset_samples, hop_length=hop_length)
    duration_samples: np.ndarray = interval_samples * np.ones_like(onset_frames)

    # adjust duration of last fragment to end of file
    remainder = total_samples % interval_samples
    if remainder == 0:
        # `total_samples` is divisible by `interval_samples`: the ceil above was not needed
        pass
    else:
        # `total_samples` is not divisible by `interval_samples`: last slice is shorter
        duration_samples[-1] = remainder

    duration_frames: np.ndarray = librosa.samples_to_frames(duration_samples, hop_length=hop_length)
    return onset_frames, duration_frames
def get_onset(wav_path):
    y, sr = librosa.core.load(wav_path, sr=None)
    sos = signal.butter(25, 100, btype='highpass', fs=sr, output='sos')
    wav_data = signal.sosfilt(sos, y)
    wav_data = normalize(wav_data)

    sodf = SpectralOnsetProcessor(onset_method='complex_flux', fps=50,
                                  filterbank=LogarithmicFilterbank,
                                  fmin=100, num_bands=24, norm=True)
    from madmom.audio.signal import Signal
    onset_strength = sodf(Signal(data=wav_data, sample_rate=sr))
    onset_strength = librosa.util.normalize(onset_strength)

    h_length = int(librosa.time_to_samples(1. / 50, sr=sr))
    onset_times = librosa.onset.onset_detect(onset_envelope=onset_strength,
                                             sr=sr, hop_length=h_length, units='time',
                                             pre_max=5, post_max=5, pre_avg=5, post_avg=5)

    with open(onset_path, 'w') as f:
        for x in onset_times:
            f.write(f"{x}\n")
    return onset_times
def test(filename=None):
    import random, os
    import matplotlib.pyplot as plt
    from sys import argv

    #signal, params = read_signal(sound, WINSIZE)
    scenario = None
    if filename != None:
        scene = os.path.basename(filename)[0]
    else:
        filename = random.choice([x for x in os.listdir("tmp/")
                                  if os.path.splitext(x)[1] == ".flac"])
        scene = filename[0]
        filename = "tmp/" + filename
    print(filename)

    truths = vad.load_truths()
    signal, rate = speech.read_soundfile(filename)
    seconds = float(len(signal)) / rate
    winsize = librosa.time_to_samples(float(WINMS) / 1000, rate)[0]
    window = sp.hanning(winsize)
    ltsd = LTSD(winsize, window, 5)
    res, threshold, nstart, nend = ltsd.compute(signal)
    segments = ltsd.segments(res, threshold)
    #print(float(len(signal))/rate, librosa.core.frames_to_time(len(res), 8000, winsize/2))
    segments = librosa.core.frames_to_time(segments, rate, winsize / 2)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    #ax.plot((signal/np.max(signal))*np.mean(res)+np.mean(res))
    ax.plot(np.linspace(0, seconds, len(res)), res)
    ax.plot([0, seconds], [threshold, threshold])
    vad.plot_segments(truths[scene]['combined'], segments, ax)
    n1 = float(nstart) / rate
    n2 = float(nend) / rate
    ax.vlines([n1, n2], -20, 20)
    plt.show()
def pipeline(path, frame_ms=30, hop_ms=15):
    print("load")
    #sig, rate = librosa.load(path)
    #sig2, rate2 = ad.read_file(path)
    sig, rate = soundfile.read(path)
    sig = signal.wiener(sig)
    print("rate", rate)

    fsize = librosa.time_to_samples(float(frame_ms) / 1000, rate)[0]
    hop = librosa.time_to_samples(float(hop_ms) / 1000, rate)[0]
    print("frame size", fsize, "hop", hop)

    frames = librosa.util.frame(sig, fsize, hop)
    w = signal.hann(fsize)

    print("windowing function")
    frames_w = np.apply_along_axis(lambda x, w: x * w, 0, frames, w)
    frames = frames_w

    print("window suppression")
    frames = np.apply_along_axis(lambda x, w: x / (w + 1e-15), 0, frames, w)

    print("normalized autocorrelation")
    naccs = np.apply_along_axis(nacc, 0, frames)

    print("trimming")
    naccs = np.apply_along_axis(trim_frame, 0, naccs)
    print(naccs.shape)

    minacs = np.zeros_like(naccs)
    for i in range(len(naccs.T)):
        minacs[:, i] = min_ac(naccs.T, i)
    print(minacs.shape)

    print("variances")
    acvars = np.apply_along_axis(acvar, 0, minacs)

    print("ltacs")
    ltacs = np.zeros_like(acvars)
    for i in range(len(acvars)):
        ltacs[i] = ltac(acvars, i)

    return sig, rate, frames, fsize, minacs, acvars, ltacs
def __test():
    beat = audio.timings['beats'][0]
    start = beat.time.delta * 1e-9
    duration = beat.duration.delta * 1e-9
    starting_sample, ending_sample = librosa.time_to_samples([start, start + duration],
                                                             beat.audio.sample_rate)

    samples, left_offset, right_offset = beat.get_samples()
    left_offsets, right_offsets = beat._get_offsets(starting_sample, ending_sample,
                                                    beat.audio.num_channels)

    duration = beat.duration.delta * 1e-9
    starting_sample, ending_sample = librosa.time_to_samples([0, duration], audio.sample_rate)
    initial_length = ending_sample - starting_sample

    left_offset_length = initial_length - left_offsets[0] + left_offsets[1]
    right_offset_length = initial_length - right_offsets[0] + right_offsets[1]

    assert(len(samples[0]) == left_offset_length)
    assert(len(samples[1]) == right_offset_length)
def __test():
    beat = audio.timings['beats'][0]
    samples, left_offset, right_offset = beat.get_samples()

    start = beat.time.delta * 1e-9
    duration = beat.duration.delta * 1e-9
    starting_sample, ending_sample = librosa.time_to_samples([start, start + duration],
                                                             beat.audio.sample_rate)
    left_offsets, right_offsets = beat._get_offsets(starting_sample, ending_sample,
                                                    beat.audio.num_channels)

    start_sample = left_offsets[0] * -1
    end_sample = len(samples[0]) - left_offsets[1]
    reset_samples = samples[0][start_sample:end_sample]
    original_samples = audio.raw_samples[0, starting_sample:ending_sample]

    assert(np.array_equiv(reset_samples, original_samples))
def get_samples(self):
    """
    Gets the samples corresponding to this TimeSlice from the parent audio object.
    """
    start = self.time.delta * 1e-9
    duration = self.duration.delta * 1e-9
    starting_sample, ending_sample = librosa.time_to_samples([start, start + duration],
                                                             self.audio.sample_rate)

    left_offsets, right_offsets = self._get_offsets(starting_sample, ending_sample,
                                                    self.audio.num_channels)

    samples = self._offset_samples(starting_sample, ending_sample,
                                   left_offsets, right_offsets,
                                   self.audio.num_channels)

    return samples, left_offsets[0], right_offsets[0]
def vad(soundfile, noisefile=None):
    signal, rate = speech.read_soundfile(soundfile)
    if noisefile != None:
        noise, nrate = speech.read_soundfile(noisefile)
        print("found noisefile: " + noisefile)
    else:
        noise = None
    seconds = float(len(signal)) / rate
    winsize = librosa.time_to_samples(float(WINMS) / 1000, rate)[0]
    window = sp.hanning(winsize)
    ltsd = LTSD(winsize, window, 5, init_noise=noise)
    res, threshold, nstart, nend = ltsd.compute(signal)
    segments = ltsd.segments(res, threshold)
    #print(float(len(signal))/rate, librosa.core.frames_to_time(len(res), 8000, winsize/2))
    segments = librosa.core.samples_to_time(segments, rate).tolist()
    indexes = []
    for s in segments:
        indexes += s
    indexes.append(seconds)
    return indexes
def __test(times, frames, sr, hop_length, click_freq, click_duration, click, length):
    y = librosa.clicks(times=times,
                       frames=frames,
                       sr=sr,
                       hop_length=hop_length,
                       click_freq=click_freq,
                       click_duration=click_duration,
                       click=click,
                       length=length)

    if times is not None:
        nmax = librosa.time_to_samples(times, sr=sr).max()
    else:
        nmax = librosa.frames_to_samples(frames, hop_length=hop_length).max()

    if length is not None:
        assert len(y) == length
    elif click is not None:
        assert len(y) == nmax + len(click)
def segment(audio_file, mode, db_delta_thresh=2.5, **kwargs):
    x, fs = claudio.read(audio_file, samplerate=22050, channels=1, bytedepth=2)
    if mode == 'hll':
        onset_times = hll_onsets(audio_file)
    else:
        onset_times = ONSETS.get(mode)(x, fs, **kwargs)

    onset_idx = librosa.time_to_samples(onset_times, fs)
    log_env_lpf = log_envelope(x, fs, 100)

    recs = []
    for time, idx in zip(onset_times, onset_idx):
        x_m = log_env_lpf[idx: idx + int(fs)]
        rec = dict(time=time,
                   env_max=x_m.max(),
                   env_mean=x_m.mean(),
                   env_std=x_m.std(),
                   env_delta=x_m.max() - log_env_lpf.mean())
        if rec['env_delta'] > db_delta_thresh:
            recs += [rec]

    return pd.DataFrame.from_records(recs)
def __test(sr):
    assert np.allclose(librosa.time_to_samples([0, 1, 2], sr=sr),
                       [0, sr, 2 * sr])
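# For reference (a sketch based on librosa's documented behaviour, not part of the test
# above): time_to_samples scales seconds by the sampling rate and casts to integer
# sample indices, so it round-trips with samples_to_time for exact multiples of 1/sr.
import numpy as np
import librosa

sr = 22050
times = np.array([0.0, 0.5, 1.0])
samples = librosa.time_to_samples(times, sr=sr)   # -> array([    0, 11025, 22050])
back = librosa.samples_to_time(samples, sr=sr)    # -> array([0. , 0.5, 1. ])
assert np.allclose(back, times)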
def pipeline(path, frame_ms=30, hop_ms=15, filt=True, noisy=True, shift=True, snr=30):
    #sig, rate = librosa.load(path)
    #sig2, rate2 = ad.read_file(path)
    sig, rate = speech.read_soundfile(path)
    sig = signal.wiener(sig)

    fsize = librosa.time_to_samples(float(frame_ms) / 1000, rate)[0]
    hop = librosa.time_to_samples(float(hop_ms) / 1000, rate)[0]

    if filt:
        sig = bp_filter(sig)
    if noisy:
        sig = speech.add_noise(sig, "noise8k/white.flac", snr)

    frames = librosa.util.frame(sig, fsize, hop)
    w = signal.hann(fsize)

    # window each frame, then divide the window back out
    frames_w = np.apply_along_axis(lambda x, w: x * w, 0, frames, w)
    frames = frames_w
    frames = np.apply_along_axis(lambda x, w: x / (w + 1e-15), 0, frames, w)

    # normalized autocorrelation
    naccs = np.apply_along_axis(nacc, 0, frames)
    # trimming
    naccs = np.apply_along_axis(trim_frame, 0, naccs)

    # per-frame lag estimation: roll each autocorrelation so its dominant peak is at lag 0
    lags = np.zeros(len(naccs.T))
    acf_n = np.zeros(len(naccs.T))
    for i in range(len(naccs.T)):
        frame = naccs.T[i]
        relmax = signal.argrelmax(frame)[0]
        if len(relmax) > 0:
            argmax2 = relmax[0] + np.argmax(frame[relmax[0]:])
        else:
            argmax2 = np.argmax(frame)
        lags[i] = argmax2
        acf_n[i] = len(relmax)
        naccs.T[i] = np.roll(frame, -1 * argmax2)

    meanacs = np.zeros_like(naccs)
    for i in range(len(naccs.T)):
        meanacs[:, i] = mean_ac(naccs.T, i)

    # variances
    acvars = np.apply_along_axis(acvar, 0, meanacs)

    # ltacs
    ltacs = np.zeros_like(acvars)
    for i in range(len(acvars)):
        ltacs[i] = ltac(acvars, i)

    print("done: " + path)
    return sig, rate, frames, fsize, meanacs, acvars, ltacs, (lags, acf_n)
######################################################
# We'll load in a five-second clip of a track that has
# noticeable vocal vibrato.
# The method works fine for longer signals, but the
# results are harder to visualize.
y, sr = librosa.load('audio/Karissa_Hobbs_-_09_-_Lets_Go_Fishin.mp3',
                     sr=44100,
                     duration=5,
                     offset=35)

####################################################
# These parameters are taken directly from the paper
n_fft = 1024
hop_length = int(librosa.time_to_samples(1. / 200, sr=sr))
lag = 2
n_mels = 138
fmin = 27.5
fmax = 16000.
max_size = 3

########################################################
# The paper uses a log-frequency representation, but for
# simplicity, we'll use a Mel spectrogram instead.
S = librosa.feature.melspectrogram(y, sr=sr, n_fft=n_fft,
                                   hop_length=hop_length,
                                   fmin=fmin,
                                   fmax=fmax,
                                   n_mels=n_mels)
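########################################################
# A possible continuation (a sketch, not part of the excerpt above): the `lag` and
# `max_size` parameters defined earlier are SuperFlux parameters of
# librosa.onset.onset_strength, so the onset envelope and onset times would
# typically be computed from S along these lines.
odf_sf = librosa.onset.onset_strength(S=librosa.power_to_db(S, ref=np.max),
                                      sr=sr,
                                      hop_length=hop_length,
                                      lag=lag,
                                      max_size=max_size)
onset_sf = librosa.onset.onset_detect(onset_envelope=odf_sf,
                                      sr=sr,
                                      hop_length=hop_length,
                                      units='time')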
def meshuggahme(input_file, features, improve_func, onset_dicts, onset_dir,
                metric='cosine', output_file='output.wav', original_w=9.5):
    """Converts the given input file into a Meshuggah track and saves it to disk
    as a wav file.

    Parameters
    ----------
    input_file : str
        Path to the input audio file to be converted.
    features : np.array
        Model of features to use (either MFCCs or CQT).
    improve_func : function
        One of the _improve_ functions (see above).
    onset_dicts : dictionary
        Onsets model (see generate_data script).
    onset_dir : str
        Path to directory with Meshuggah onset audio files.
    metric : str
        One of the scipy.spatial.distance functions.
    output_file : str
        Path to the output wav file.
    original_w : float
        Weight of the original file (the higher, the more original audio we'll get).
    """
    y, onset_times, mfcc_sync, cqt_sync, chroma_sync = compute_features(input_file)
    assert mfcc_sync.shape[1] == cqt_sync.shape[1] and \
        cqt_sync.shape[1] == chroma_sync.shape[1]

    # Select feature
    if features.shape[0] == CQT_BINS:
        feat_sync = cqt_sync
    elif features.shape[0] == N_MFCC:
        feat_sync = mfcc_sync
    elif features.shape[0] == N_CHROMA:
        feat_sync = chroma_sync

    # Improve features
    features = improve_func(features.T)

    # Construct
    for feat, (start, end) in zip(feat_sync.T, zip(onset_times[:-1], onset_times[1:])):
        # Get start and end times in samples
        start_end_samples = librosa.time_to_samples(np.array([start, end]), sr=SRATE)

        # Compute the distances from all the onsets in the model
        D = distance.cdist(features, improve_func(feat.reshape((1, -1))), metric=metric)
        argsorted = np.argsort(D.flatten())

        # Find an onset id with at least the same duration as the Meshuggah onset
        sort_idx = 0
        dur = 0
        while True:
            onset_id = argsorted[sort_idx]
            # Get dictionary
            onset_dict = onset_dicts[onset_id]
            # Increase index to go to the next closest Meshuggah onset
            sort_idx += 1
            # Try to concatenate
            x, sr = librosa.load(os.path.join(onset_dir, onset_dict["onset_file"]), sr=SRATE)
            if len(y[start_end_samples[0]:start_end_samples[1]]) <= len(x):
                break

        # Concatenate new audio
        w = np.min([D[onset_id][0] * original_w, 1])  # Rebalance weight
        y[start_end_samples[0]:start_end_samples[1]] = \
            y[start_end_samples[0]:start_end_samples[1]] * w + \
            x[:start_end_samples[1] - start_end_samples[0]] * (1 - w)

    # Write new audio file
    librosa.output.write_wav(output_file, y, sr=SRATE)