def analyze(cls, events: List[CorpusEvent], metadata: Metadata) -> List[CorpusEvent]:
    if not FeatureUtils.is_valid_audio(events, metadata):
        raise FeatureError(f"Feature '{cls.__name__}' does not support content of "
                           f"type {metadata.content_type.__class__.__name__}")
    metadata: AudioMetadata = typing.cast(AudioMetadata, metadata)
    # TODO: Pass rather than hard-code
    yin_frames: np.ndarray = librosa.yin(metadata.foreground_data, fmin=50, fmax=4186,
                                         sr=metadata.sr, frame_length=2048,
                                         hop_length=metadata.hop_length)
    # 8.175798915643707 Hz is the frequency of MIDI note 0, so this maps f0 estimates to MIDI pitch
    yin_midipitches: np.ndarray = np.round(12 * np.log2(yin_frames / 8.175798915643707))
    for event in events:
        onset_frame: int = librosa.time_to_frames(event.onset, sr=metadata.sr,
                                                  hop_length=metadata.hop_length)
        end_frame: int = librosa.time_to_frames(event.onset + event.duration, sr=metadata.sr,
                                                hop_length=metadata.hop_length)
        hist, _ = np.histogram(yin_midipitches[onset_frame:end_frame], bins=128, range=(0, 128))
        pitch: int = int(np.argmax(hist))
        event.set_feature(cls(value=pitch))
    return events
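# Note: since 8.175798915643707 Hz is MIDI note 0, the rounding expression above is just a
# Hz-to-MIDI conversion. A minimal standalone sketch (not part of the original class) showing
# that librosa's own helper gives the same result:
import numpy as np
import librosa

f0 = np.array([261.63, 440.0])                               # example f0 estimates in Hz
midi_manual = np.round(12 * np.log2(f0 / 8.175798915643707))
midi_librosa = np.round(librosa.hz_to_midi(f0))              # identical: [60., 69.]
assert np.array_equal(midi_manual, midi_librosa)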
def main():
    # Track beats using time series input
    song = 'song.mp3'
    y, sr = librosa.load(song)
    tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
    onset_env = librosa.onset.onset_strength(y, sr=sr, aggregate=np.median)
    tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
    onset_max = np.argmax(onset_env)
    starting_beat = find_nearest(beats, onset_max)
    print(starting_beat)
    range = (beats[starting_beat], beats[starting_beat + 8])
    # start_time = librosa.frames_to_time(range[0], sr=sr)
    # end_time = librosa.frames_to_time(range[1], sr=sr)
    # print(start_time, end_time)
    # mel_spectrogram(mp3=song, start_time=start_time, end_time=end_time)
    play_by_seconds(song, 108, 120)
    print(librosa.time_to_frames(108, sr=sr))
    print(librosa.time_to_frames(120, sr=sr))
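# `find_nearest` is not defined in the snippet above; a plausible sketch of such a helper
# (hypothetical, not from the original source) that returns the index of the beat frame
# closest to the strongest onset:
import numpy as np

def find_nearest(array, value):
    # Index of the element in `array` whose value is closest to `value`.
    array = np.asarray(array)
    return int(np.argmin(np.abs(array - value)))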
def extract_vocal(path):
    # print(path)
    y, sr = librosa.load(path)
    # print(y)
    S_full, phase = librosa.magphase(librosa.stft(y))
    # print(S_full)
    idx = slice(*librosa.time_to_frames([30, 35], sr=sr))
    S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric='cosine',
                                           width=int(librosa.time_to_frames(2, sr=sr)))
    S_filter = np.minimum(S_full, S_filter)
    margin_i, margin_v = 2, 10
    power = 2
    mask_i = librosa.util.softmask(S_filter, margin_i * (S_full - S_filter), power=power)
    mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
    S_foreground = mask_v * S_full
    S_background = mask_i * S_full
    D_foreground = S_foreground * phase
    y_foreground = librosa.istft(D_foreground)
    D_background = S_background * phase
    y_background = librosa.istft(D_background)
    # print(y_foreground)
    maxv = np.iinfo(np.int16).max
    scipy.io.wavfile.write("foreground.wav", sr, (y_foreground * maxv).astype(np.int16))
def vocal_removal(self, y, sr):
    """
    https://librosa.github.io/librosa_gallery/auto_examples/plot_vocal_separation.html
    """
    idx = slice(*librosa.time_to_frames([0, 10], sr=sr))
    S_full, phase = librosa.magphase(librosa.stft(y))
    S_filter = librosa.decompose.nn_filter(S_full, aggregate=np.median, metric='cosine',
                                           width=int(librosa.time_to_frames(2, sr=sr)))
    S_filter = np.minimum(S_full, S_filter)
    margin_i, margin_v = 2, 10
    power = 2
    mask_i = librosa.util.softmask(S_filter, margin_i * (S_full - S_filter), power=power)
    mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
    S_foreground = mask_v * S_full
    S_background = mask_i * S_full
    # Convert back to audio
    audio_minus_vocals = librosa.core.istft(S_background[:, idx])
    return audio_minus_vocals
def initialize_activations_with_onset_templates(signal, mid, pitches, tol_on, tol_off, max_time=None): _n_features, n_samples = signal.S.shape H_init = numpy.zeros((len(pitches) * 2, n_samples)) onset_components_offset = len(pitches) current_time = 0 on_since = {} for msg in mid: if msg.is_meta: continue current_time += msg.time if msg.type == 'note_on': on_since[msg.note] = current_time component = pitch_to_component(msg.note, pitches) start_frame, end_frame = librosa.time_to_frames([current_time - tol_on, current_time + tol_on], sr=signal.sr, hop_length=signal.fft_hop_length) start_frame = max(start_frame, 0) end_frame = min(end_frame, n_samples - 1) H_init[onset_components_offset + component, start_frame:end_frame] = 1 elif msg.type == 'note_off': note_on_since = on_since.pop(msg.note) component = pitch_to_component(msg.note, pitches) start_frame, end_frame = librosa.time_to_frames([note_on_since - tol_on, current_time + tol_off], sr=signal.sr, hop_length=signal.fft_hop_length) start_frame = max(start_frame, 0) end_frame = min(end_frame, n_samples - 1) H_init[component, start_frame:end_frame] = 1 if max_time is not None and current_time > max_time: break return H_init
def load_siamese_data (csvfilepath, num_train_artist): ''' Args: Return: ''' artist_tracks_segments = {} # dict of artist to tracks to vocal segments with open(csvfilepath, 'r') as csv_file : csv_reader = csv.DictReader(csv_file) line_count = 0 for row in csv_reader: if line_count == 0: line_count += 1 curr_artist = int(row['artist_index']) path_to_feat = config.id7d_to_path[config.idmsd_to_id7d[row['track_id']]].replace('.mp3','.npy') start_frames = librosa.time_to_frames(ast.literal_eval(row['vocal_segments']), sr=config.sr, hop_length=config.hop_length, n_fft=config.n_fft) if start_frames[0] < 0: start_frames[0] = 0 try : artist_tracks_segments[curr_artist][path_to_feat] = start_frames except : artist_tracks_segments[curr_artist] = {} artist_tracks_segments[curr_artist][path_to_feat] = start_frames track_list = [] y_list = [] with open(csvfilepath, 'r') as csv_file : singer_list = np.arange(num_train_artist) print ('num_singers:', len(singer_list)) csv_reader = csv.DictReader(csv_file) line_count = 0 for row in csv_reader: if line_count == 0 : line_count +=1 curr_artist_id = int(row['artist_index']) path_to_feat = config.id7d_to_path[config.idmsd_to_id7d[row['track_id']]].replace('.mp3', '.npy') start_frames = librosa.time_to_frames(ast.literal_eval(row['vocal_segments']), sr=config.sr, hop_length=config.hop_length, n_fft=config.n_fft) # train with all vocal segments for i in range(len(start_frames)): if start_frames[i] < 0: start_frames[i] = 0 track_list.append((path_to_feat, start_frames[i])) y_list.append(curr_artist_id) track_list = np.array(track_list) y_list = np.array(y_list) return track_list, y_list, artist_tracks_segments
def invert_test(file_name, input_length=3, input_overlap=False):
    sr = config.SR
    # convert seconds to frames
    n_frames = librosa.time_to_frames(input_length, sr=config.SR, n_fft=config.FFT_SIZE,
                                      hop_length=config.FFT_HOP) + 1
    if not input_overlap:
        overlap = n_frames
    else:
        overlap = librosa.time_to_frames(input_overlap, sr=config.SR, n_fft=config.FFT_SIZE,
                                         hop_length=config.FFT_HOP)
    # batching data
    print('Computing spectrogram (w/ librosa) and tags (w/ tensorflow)..', end=" ")
    batch, spectrogram = batch_data(file_name, n_frames, overlap)
    audio_rep = (np.power(spectrogram, 10.0) - 1.0) / 10000.0
    audio_rep = audio_rep.T
    audio_out = librosa.feature.inverse.mel_to_audio(M=audio_rep, sr=sr,
                                                     hop_length=config.FFT_HOP,
                                                     n_fft=config.FFT_SIZE)
    sd.play(audio_out, sr)
    # librosa.save(audio_file_out, sr=config.SR)
    # audio_rep = audio_rep.astype(np.float16)
    # audio_rep = np.log10(10000 * audio_rep + 1)
    return audio_out
def get_phrase_intervals(file_id, y,sr, r, w, w_p_ratio, period_threshold, peak_window, tempo): #param: # y = waveform # sr = sample rate # r = radius/resolution of diagonal cut # w = checkerboard window size in seconds, w<=r # w_p_ratio = ratio used for peak picking, unknown # period_threshold = threshold for filtering by period # fpb = frames per beat for phrase search #TEST CONSTANTS if (w>r): sys.exit('Window Resolution Mismatch') sample_length = librosa.samples_to_time([len(y)],sr)[0] #s w_f = librosa.time_to_frames([w],hop_length=256)[0] f = extract_features(y) #frames per beat fpb = librosa.time_to_frames([1/(tempo/60)],hop_length=256)[0] #LOAD OR CREATE S-MATRIX & NOVELTY VECTOR s_matrix = init_smatrix(file_id,f,r, sample_length) novelty = init_novelty_vector(file_id, w, w_f, sample_length, s_matrix) w_p = w_f/w_p_ratio peaks = librosa.util.peak_pick(novelty, w_p, w_p, w_p, w_p, peak_window, w_p) return filter_by_period(peaks, period_threshold, fpb)
def load_label(audio_spec, audio_label_file):
    '''
    Process and load label for the given audio
    Args:
        audio_spec : melgram of the audio. Shape=(n_bins, total_frames) ex.(80,14911)
        audio_label_file : path to the label file ex. './jamendo/jamendo_lab/02 - The Louis...lab'
    Return :
        lab : list of ground truth annotations per frame. Shape=(total_frames, )
    '''
    with open(audio_label_file, 'r') as f:
        total_frames = audio_spec.shape[1]
        label = np.zeros((total_frames, ))
        for line in f:
            l = line.strip('\n').split(' ')
            start = librosa.time_to_frames(float(l[0]), sr=SR, hop_length=HOP_LENGTH)
            end = librosa.time_to_frames(float(l[1]), sr=SR, hop_length=HOP_LENGTH)
            is_vocal = 1 if l[2] == 'sing' or l[2] == '1' else 0
            # label[start:end] = int(is_vocal)
            label[start[0]:end[0]] = int(is_vocal)  # gwm 23/1/2019
    return label
def changeTempo(current_tempo, onset_times, desired_tempo): hi_hat, _ = librosa.load('./Thrown/test_sounds/sfx/closed_hi_hat.wav') drum_hit, _ = librosa.load('./Thrown/test_sounds/sfx/drum_hit.wav') desired_tempo_timing = 60 / desired_tempo current_tempo_timing = 60 / current_tempo scale_factor = desired_tempo_timing / current_tempo_timing onset_frames1 = librosa.time_to_frames(onset_times, sr=sr) clicks1 = librosa.clicks(frames=onset_frames1, sr=sr, click_duration=.01, length=len(t[2][4]), click=hi_hat) sf.write('./Thrown/test_sounds/thrown_w_beat.wav', clicks1 + t[1][2], sr) scaled_onset_times = [] for i in range(0, len(onset_times)): scaled_onset_times.append(onset_times[i] * scale_factor) scaled_beat_frames = [] for i in range(0, len(t[1][8])): scaled_beat_frames.append(t[1][8][i] * scale_factor) scaled_beat_times = [] for i in range(0, len(t[1][6])): scaled_beat_times.append(t[1][6] * scale_factor) onset_frames2 = librosa.time_to_frames(scaled_onset_times, sr=sr) print(onset_frames2) scaled_sample = librosa.effects.time_stretch(t[1][2], 1 / scale_factor) clicks2 = librosa.clicks(frames=onset_frames2, sr=sr, click_duration=.01, length=len(scaled_sample), click=hi_hat) clicks3 = librosa.clicks(frames=scaled_beat_frames, sr=sr, click_duration=.01, length=len(scaled_sample), click=drum_hit) librosa.output.write_wav('./Thrown/test_sounds/thrown_altered_beat.wav', clicks2 + clicks3 + scaled_sample, sr) sf.write('./Thrown/test_sounds/altered_beat.wav', clicks2 + clicks3, sr) plt.figure(figsize=(14, 5)) plt.title("Removing Percussive Sample, Scalability of Tempo Events") plt.vlines(scaled_onset_times, -1, 1, color='c', linestyles='dashed') plt.vlines(scaled_beat_times, -1, 1, color='y', linestyles='dashed') plt.ylim(-1, 1) plt.xlim(0, 5)
def set_windowing(self, width, stride, *option):  # called in train.py
    """Setup windowing process (argument values in seconds)."""
    self.width = librosa.time_to_frames(width, self.samp_rate)
    model = str(option[0])
    # bulbul requires minimum resolution of 106
    if model == "bulbul = pepeiao.models:bulbul" and self.width < 106:
        self.width = 106
    self.stride = librosa.time_to_frames(stride, self.samp_rate)
    _LOGGER.info('Set width to %d columns', self.width)
    _LOGGER.info('Set stride to %d columns', self.stride)
def notes_matrix_to_annotation(notes, nframes):
    binary_annotation_matrix = np.zeros((48, nframes))
    full_annotation_matrix = np.zeros((48, nframes, 6))
    for note in notes:
        starting_frame = librosa.time_to_frames(note[0])
        duration_frames = librosa.time_to_frames(note[1])
        ending_frame = starting_frame + duration_frames
        note_value, string = int(note[2]) - 35, int(note[3])
        binary_annotation_matrix[note_value, starting_frame:ending_frame] = 1
        full_annotation_matrix[note_value, starting_frame:ending_frame, string] = 1
    return binary_annotation_matrix, full_annotation_matrix
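# The time_to_frames calls above pass no sr or hop_length, so librosa's defaults apply.
# A minimal sketch making those assumed defaults explicit (sr=22050 and hop_length=512 are
# librosa's documented defaults, not values stated in the snippet):
import librosa

frame_default = librosa.time_to_frames(1.5)
frame_explicit = librosa.time_to_frames(1.5, sr=22050, hop_length=512)
assert frame_default == frame_explicit  # both ~64 frames for 1.5 s at these settings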
def patch_label(start, end, time_windows, annotate, binary=False, threshold=None): """Labeling a patch given annotation Args: start(float): start time of a patch (in second) end(float): end time of a patch (in second) time_windows(float): time windows for average (in milliseconds) annotation(DataFrame): annotation dataframe for a specific song songname(string): song_name(string) Returns: label(pd.DataFrame): column: instrument index: label eg: S01 S02 label 1 0.93 This code has been largely borrowed from https://github.com/glennq/instrument-recognition/blob/master/data/patch_label.py """ #Transfer time to frame annotation = copy.copy(annotate) start_frame = librosa.time_to_frames(start, sr=1 / 0.0464, hop_length=1) + 1 end_frame = librosa.time_to_frames(end, sr=1 / 0.0464, hop_length=1) - 1 moving_frame = librosa.time_to_frames( time_windows / 1000, sr=1 / 0.0464, hop_length=1) - librosa.time_to_frames(0, sr=1 / 0.0464, hop_length=1) #Pick annotation annotation = annotation.reset_index(drop=True) annotation.index += 1 time_annot = annotation.loc[start_frame:end_frame].drop('time', 1) #Using maximum value of average in moving windows as label music_ins = time_annot.columns label = pd.DataFrame(index=[ 'label', ], columns=music_ins) for j in range(len(time_annot.columns)): label_temp = max([ sum(list(time_annot.ix[:, j])[i:i + moving_frame + 1]) / float(moving_frame + 1) for i in range(len(time_annot.ix[:, j]) - moving_frame) ]) #binary output if binary and threshold: if label_temp >= threshold: label_temp = float(1) else: label_temp = float(0) label.ix[:, j] = label_temp return label
def __init__(self, model='MTT_musicnn', input_length=3, input_overlap=False): # select model if 'MTT' in model: self.labels = config.MTT_LABELS elif 'MSD' in model: self.labels = config.MSD_LABELS else: raise RuntimeError("Bad model name") self.num_classes = len(self.labels) if 'vgg' in model and input_length != 3: raise ValueError( 'Set input_length=3, the VGG models cannot handle different input lengths.' ) # convert seconds to frames self.n_frames = librosa.time_to_frames(input_length, sr=config.SR, n_fft=config.FFT_SIZE, hop_length=config.FFT_HOP) + 1 if not input_overlap: self.overlap = self.n_frames else: self.overlap = librosa.time_to_frames(input_overlap, sr=config.SR, n_fft=config.FFT_SIZE, hop_length=config.FFT_HOP) # tensorflow: define the model tf.compat.v1.reset_default_graph() with tf.name_scope('model'): self.x = tf.compat.v1.placeholder( tf.float32, [None, self.n_frames, config.N_MELS]) self.train = tf.compat.v1.placeholder(tf.bool) if 'vgg' in model: y, _, _, _, _, _ = models.define_model(self.x, self.train, model, self.num_classes) else: y, _, _, _, _, _, _, _, _ = models.define_model( self.x, self.train, model, self.num_classes) self.pred = tf.nn.sigmoid(y) config_tf = tf.compat.v1.ConfigProto() config_tf.gpu_options.allow_growth = True self.sess = tf.compat.v1.Session(config=config_tf) self.sess.run(tf.compat.v1.global_variables_initializer()) saver = tf.compat.v1.train.Saver() saver.restore(self.sess, os.path.dirname(__file__) + '/' + model + '/')
def generate_annotation_matrices(annotation, frames):
    '''
    This function will return a one hot encoded matrix of notes being played
    The annotation matrix will start w/ note 25 at index 0 and go up to note 100
    The highest and lowest values that I saw in the annotations seemed to be around 29-96,
    so give a little leeway
    :return:
    '''
    annotation_matrix = np.zeros((84, frames))
    for note in annotation:
        starting_frame = time_to_frames(note[1])
        duration_frames = time_to_frames(note[2] - note[1])
        note_value = note[0]
        annotation_matrix[note_value - 25][starting_frame:starting_frame + duration_frames] = 1
    return annotation_matrix.T
def read_ann_beats(self): """Reads the annotated beats if available. Returns ------- times: np.array Times of annotated beats in seconds. frames: np.array Frame indeces of annotated beats. """ times, frames = (None, None) # Read annotations if they exist in correct folder if os.path.isfile(self.file_struct.ref_file): try: jam = jams.load(self.file_struct.ref_file) except TypeError: logging.warning( "Can't read JAMS file %s. Maybe it's not " "compatible with current JAMS version?" % self.file_struct.ref_file) return times, frames beat_annot = jam.search(namespace="beat.*") # If beat annotations exist, get times and frames if len(beat_annot) > 0: beats_inters, _ = beat_annot[0].to_interval_values() times = beats_inters[:, 0] frames = librosa.time_to_frames(times, sr=self.sr, hop_length=self.hop_length) return times, frames
def getIntervalFromJAMS(path):
    j = jams.load(path)
    res = []
    for i in zip(list(j.annotations[0].data.time),
                 list(j.annotations[0].data.time + j.annotations[0].data.duration),
                 j.annotations[0].data.value):
        v = [[librosa.time_to_frames([i[0].total_seconds(), i[1].total_seconds()]),
              i[2].encode("ascii")]]
        res += v
    return res
def split_vocal(self, y):
    S_full, phase = librosa.magphase(librosa.stft(y))
    # To avoid being biased by local continuity, we constrain similar frames to be
    # separated by at least 1.2 seconds.
    S_filter = librosa.decompose.nn_filter(
        S_full,
        aggregate=np.median,
        metric='cosine',
        width=int(librosa.time_to_frames(self._constrained, sr=self._sr)))
    S_filter = np.minimum(S_full, S_filter)
    margin_v = 10
    power = 2
    mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
    S_foreground = mask_v * S_full
    foreground = griffinlim(S_foreground)
    return foreground
def get_frame(self) -> int:
    if Beat.INDEX_VALUE == 'samples':
        return librosa.samples_to_frames(self.index, hop_length=util.HOP_LENGTH)
    elif Beat.INDEX_VALUE == 'time':
        return librosa.time_to_frames(self.index, sr=util.SAMPLE_RATE, hop_length=util.HOP_LENGTH)
    else:
        raise NotImplementedError("Only samples and time are supported")
def __call__(self, sample): y, sr = sample['wav'] y, sr = librosa.resample(y, sr, 22050), 22050 ref_times, ref_freqs = sample['gt'] fft_f = librosa.fft_frequencies(sr, self.n_fft) f_interp = interp1d(librosa.time_to_frames(ref_times, sr, self.hop_length, self.n_fft), ref_freqs, fill_value=0.0, bounds_error=False) fft = librosa.stft(y, self.n_fft, self.hop_length) n_fft = np.zeros(fft.shape, dtype=fft.dtype) for frame in range(fft.shape[1]): freq = f_interp(frame) for i in range(self.n_harmonics): idx = np.argmin(np.abs(fft_f - freq * (i + 1))) if np.abs(fft_f[idx] - freq * (i + 1)) < fft_f[idx] * (2**(1 / 24) - 1): n_fft[idx, frame] = fft[idx, frame] else: fft[:, frame] = 0 y = librosa.istft(n_fft, self.hop_length) y = y / max(y) if self.new_key is None: sample['wav'] = y, sr else: sample[self.new_key] = y, sr return sample
def read_ann_beats(self): """Reads the annotated beats if available. Returns ------- times: np.array Times of annotated beats in seconds. frames: np.array Frame indeces of annotated beats. """ times, frames = (None, None) # Read annotations if they exist in correct folder if os.path.isfile(self.file_struct.ref_file): jam = jams.load(self.file_struct.ref_file) beat_annot = jam.search(namespace="beat.*") # If beat annotations exist, get times and frames if len(beat_annot) > 0: beats_inters, _ = beat_annot[0].data.to_interval_values() times = beats_inters[:, 0] frames = librosa.time_to_frames(times, sr=self.sr, hop_length=self.hop_length) return times, frames
def make_sampler(max_samples, duration, pump, seed):
    n_frames = librosa.time_to_frames(duration,
                                      sr=pump['mel'].sr,
                                      hop_length=pump['mel'].hop_length)[0]
    return pump.sampler(max_samples, n_frames, random_state=seed)
def process_config(self, config): ''' preprocess config ''' data_conf = config['data'] class_vocab = data_conf['task']['classes']['vocab'] assert len(class_vocab) == data_conf['task']['classes']['num'] # add revere_vocab, positive_id reverse_vocab = {val: key for key, val in class_vocab.items()} data_conf['task']['classes']['reverse_vocab'] = reverse_vocab # binary class pos_id = config['solver']['metrics']['pos_label'] data_conf['task']['classes']['positive_id'] = pos_id data_conf['task']['classes']['positive'] = reverse_vocab[pos_id] # add feature shape, withoud batch_size if data_conf['task']['suffix'] == '.npy': input_channels = 3 if data_conf['task']['audio'][ 'add_delta_deltas'] else 1 nframe = librosa.time_to_frames( data_conf['task']['audio']['clip_size'], sr=data_conf['task']['audio']['sr'], hop_length=data_conf['task']['audio']['winstep'] * data_conf['task']['audio']['sr']) feature_shape = [ nframe, data_conf['task']['audio']['feature_size'], input_channels ] else: feature_shape = [ data_conf['task']['audio']['sr'] * data_conf['task']['audio']['clip_size'] ] data_conf['task']['audio']['feature_shape'] = feature_shape return config
def read_ann_beats(self): """Reads the annotated beats if available. Returns ------- times: np.array Times of annotated beats in seconds. frames: np.array Frame indeces of annotated beats. """ times, frames = (None, None) # Read annotations if they exist in correct folder if os.path.isfile(self.file_struct.ref_file): try: jam = jams.load(self.file_struct.ref_file) except TypeError: logging.warning("Can't read JAMS file %s. Maybe it's not " "compatible with current JAMS version?" % self.file_struct.ref_file) return times, frames beat_annot = jam.search(namespace="beat.*") # If beat annotations exist, get times and frames if len(beat_annot) > 0: beats_inters, _ = beat_annot[0].data.to_interval_values() times = beats_inters[:, 0] frames = librosa.time_to_frames(times, sr=self.sr, hop_length=self.hop_length) return times, frames
def process_one_file(midi_filename, skip=True): ''' Load in midi data, compute features, and write out file :parameters: - midi_filename : str Full path to midi file - skip : bool Whether to skip creating the file when the npz already exists ''' # npz files go in the 'npz' dir instead of 'mid' output_filename = mid_to_npz_path(midi_filename) # Skip files already created if skip and os.path.exists(output_filename): return try: m = pretty_midi.PrettyMIDI(midi_filename) midi_audio = alignment_utils.fast_fluidsynth(m, MIDI_FS) midi_gram = librosa.cqt( midi_audio, sr=MIDI_FS, hop_length=MIDI_HOP, fmin=librosa.midi_to_hz(NOTE_START), n_bins=N_NOTES) midi_beats, midi_tempo = alignment_utils.midi_beat_track(m) midi_sync_gram = alignment_utils.post_process_cqt( midi_gram, librosa.time_to_frames( midi_beats, sr=MIDI_FS, hop_length=MIDI_HOP)) np.savez_compressed( output_filename, sync_gram=midi_sync_gram, beats=midi_beats, bpm=midi_tempo) except Exception as e: print "Error processing {}: {}".format(midi_filename, e)
def get_vocal(self):
    # And compute the spectrogram magnitude and phase
    S_full, phase = librosa.magphase(librosa.stft(self.wave))
    S_filter = librosa.decompose.nn_filter(
        S_full,
        aggregate=numpy.median,
        metric='cosine',
        width=int(librosa.time_to_frames(2, sr=self.sample_rate)))
    S_filter = numpy.minimum(S_full, S_filter)
    margin_i, margin_v = 2, 10
    power = 2
    mask_i = librosa.util.softmask(S_filter, margin_i * (S_full - S_filter), power=power)
    mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
    S_foreground = mask_v * S_full
    comps, acts = librosa.decompose.decompose(S_foreground, n_components=16, sort=True)  # decomposition
    if numpy.count_nonzero(comps) < 10000:
        return 0  # no vocal
    else:
        return 1  # vocal
def split_vocal_to_wav(filename, fp_foreground, fp_background=None):
    y, sr = librosa.load(filename, sr=16000)
    S_full, phase = librosa.magphase(librosa.stft(y))
    S_filter = librosa.decompose.nn_filter(S_full,
                                           aggregate=np.median,
                                           metric='cosine',
                                           width=int(librosa.time_to_frames(2, sr=sr)))
    S_filter = np.minimum(S_full, S_filter)
    margin_i, margin_v = 2, 10
    power = 2
    mask_i = librosa.util.softmask(S_filter, margin_i * (S_full - S_filter), power=power)
    mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
    S_foreground = mask_v * S_full
    S_background = mask_i * S_full
    foreground = griffinlim(S_foreground)
    fp_foreground += filename.split('/')[-1]
    sf.write(fp_foreground, foreground, sr, 'PCM_16')
    if fp_background is not None:
        background = griffinlim(S_background)
        sf.write(fp_background, background, sr, 'PCM_16')
def compute_all_features(file_struct, sonify_beats=False, overwrite=False, out_beats="out_beats.wav"): """Computes all the features for a specific audio file and its respective human annotations. It creates an audio file with the sonified estimated beats if needed. Parameters ---------- file_struct: FileStruct Object containing all the set of file paths of the input file. sonify_beats: bool Whether to sonify the beats. overwrite: bool Whether to overwrite previous features JSON file. out_beats: str Path to the new file containing the sonified beats. """ # Output file out_file = file_struct.features_file if os.path.isfile(out_file) and not overwrite: return # Do nothing, file already exist and we are not overwriting it # Compute the features for the given audio file features = compute_features_for_audio_file(file_struct.audio_file) # Save output as audio file if sonify_beats: logging.info("Sonifying beats...") fs = 44100 audio, sr = librosa.load(file_struct.audio_file, sr=fs) msaf.utils.sonify_clicks(audio, features["beats"], out_beats, fs, offset=0.0) # Read annotations if they exist in path/references_dir/file.jams if os.path.isfile(file_struct.ref_file): jam = jams.load(file_struct.ref_file) beat_annot = jam.search(namespace="beat.*") # If beat annotations exist, compute also annotated beatsync features if len(beat_annot) > 0: logging.info("Reading beat annotations from JAMS") annot_beats_inters, _ = beat_annot[0].data.to_interval_values() annot_beats_times = annot_beats_inters[:, 0] annot_beats_idx = librosa.time_to_frames( annot_beats_times, sr=msaf.Anal.sample_rate, hop_length=msaf.Anal.hop_size) features["ann_mfcc"], features["ann_hpcp"], \ features["ann_tonnetz"], features["ann_cqt"] = \ compute_beat_sync_features(features, annot_beats_idx) # Save output as json file save_features(out_file, features)
def _analyze_audio(cls, events: List[AudioCorpusEvent], metadata: AudioMetadata):
    # shape: (12, k) where k is measured in frames
    chroma = librosa.feature.chroma_stft(y=metadata.background_data, sr=metadata.sr,
                                         hop_length=metadata.hop_length, n_chroma=12,
                                         n_fft=8192)  # TODO: Pass as parameters
    for event in events:
        onset_frame: int = librosa.time_to_frames(event.onset, sr=metadata.sr,
                                                  hop_length=metadata.hop_length)
        event.set_feature(cls(chroma[:, onset_frame]))
def __init__(self, audiofilepath): self.y, self.sr = librosa.load(audiofilepath, sr=global_sr) self.yshape = self.y.shape self.original = self.y.copy() self.newest_y = self.y.copy() self.duration = librosa.get_duration(y=self.y, sr=global_sr) self.timestamps = np.linspace(0, self.duration, int(global_sr * self.duration)) self.frames_index = librosa.time_to_frames(self.timestamps, sr=global_sr, hop_length=global_hop_len) self.frames_num = max(self.frames_index) self.action_box = None self.audio_frames = [] self.mels = [] self.best_reward = -1 # self.step = 0 self.mark = 1 self.locked = [] self.best_mismatches = None self.shft, self.stch = 0, 1 self.output_list = [] self.log_for_sox =[] self.frame_log = [] self.action_history = [-1 for _ in range(self.frames_num)] self.epoch_history = {} self.shifted = 0 self.stretched = 1 self.pitched = 0 self.vlines_widths = 0.75 self.vlines_colors = "Blue" self.backward_limit = self.duration * 0.1 self.forward_limit = self.duration * 0.1
def test(): base_dir = "../data/train/" y, sr = librosa.load(base_dir + "00ad36516.flac") chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr) # For display purposes, let's zoom in on a 15-second chunk from the middle of the song idx = tuple([slice(None), slice(*list(librosa.time_to_frames([45, 60])))]) # And for comparison, we'll show the CQT matrix as well. C = np.abs( librosa.cqt(y=y, sr=sr, bins_per_octave=12 * 3, n_bins=7 * 12 * 3)) fig, ax = plt.subplots(nrows=2, sharex=True) img1 = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max)[idx], y_axis='cqt_note', x_axis='time', bins_per_octave=12 * 3, ax=ax[0]) fig.colorbar(img1, ax=[ax[0]], format="%+2.f dB") ax[0].label_outer() img2 = librosa.display.specshow(chroma_orig[idx], y_axis='chroma', x_axis='time', ax=ax[1]) fig.colorbar(img2, ax=[ax[1]]) ax[1].set(ylabel='Default chroma') plt.show()
def process_config(self, config): data_conf = config['data'] feature_shape = data_conf['task']['audio'].get('feature_shape', None) if not feature_shape: # add feature shape, withoud batch_size if data_conf['task']['suffix'] == '.npy': input_channels = 3 if data_conf['task']['audio'][ 'add_delta_deltas'] else 1 nframe = librosa.time_to_frames( data_conf['task']['audio']['clip_size'], sr=data_conf['task']['audio']['sr'], hop_length=data_conf['task']['audio']['winstep'] * data_conf['task']['audio']['sr']) feature_shape = [ nframe, data_conf['task']['audio']['feature_size'], input_channels ] else: feature_shape = [ data_conf['task']['audio']['sr'] * data_conf['task']['audio']['clip_size'] ] data_conf['task']['audio']['feature_shape'] = feature_shape logging.info(f"FEATURE SHAPE: {feature_shape}") return config
def updateBackground (self, currenttime): framenumber = librosa.time_to_frames([currenttime/1000]) currentmaximumvolume = max(self.transposed [framenumber[0]]) - self.minimumVol if currentmaximumvolume < self.averageVol: percentmaxvolume = int(50*(currentmaximumvolume/self.averageVol)) else: percentmaxvolume = int(50 + (50*((currentmaximumvolume-self.averageVol)/(self.maximumVol-self.averageVol)))) print("Vol :", currentmaximumvolume) print("Ave Vol:", self.averageVol) print("Per Vol:", percentmaxvolume, "%") if percentmaxvolume <= 50: rgb = int(255*percentmaxvolume/50) rgbtuple = (rgb, 255, 0) #self._canvas.configure (background = '#%02x%02x%02x' % (rgb, 255, 0)) #print("RGB :", "(" + str(rgb) + ", " + str(255) + ", " + "0)") else: rgb = int(255*(percentmaxvolume-50)/50) rgbtuple = (255, 255-rgb, 0) # self._canvas.configure (background = '#%02x%02x%02x' % (255, 255-rgb, 0)) #print("RGB :", "(" + str(255) + ", " + str(255-rgb) + ", " + "0)") closebeat = self.getClosestBeats (currenttime) percentage = self.scaleBetween (self.getPercentCloseToBeat(currenttime, closebeat), 0.6, 1) #rgbtuple = (255,0,0) rgbtuple = (int (rgbtuple[0]*percentage), int (rgbtuple [1] * percentage), int (rgbtuple[2]*percentage)) self._canvas.configure (background = '#%02x%02x%02x' % (rgbtuple))
def load_data_segment(picklefile, artist_list): train_data = [] artist_names = [] f = pickle.load(open(picklefile, 'rb')) artist_to_id = {} for u in range(len(artist_list)): artist_to_id[artist_list[u]] = u for artist_id, tracks in f.items(): for track_id, svd in tracks.items(): center_segs = svd[len(svd) // 2 - 10:len(svd) // 2 + 10] # center_segs = svd[len(svd)//2 - 5 : len(svd)//2 + 5] start_frames = librosa.time_to_frames(center_segs, sr=22050, hop_length=512, n_fft=1024) for i in range(len(start_frames)): start_frame = start_frames[i] if start_frame < 0: start_frame = 0 # train_data.append((artist_to_id[artist_id], track_id + '.npy', start_frame)) ### augmentation train_data.append( (artist_to_id[artist_id], track_id + '.npy', start_frame)) # train_data.append((artist_to_id[artist_id], track_id + '.npy', start_frame, 1 )) artist_names.append(artist_id) artist_names.append(artist_id) return train_data, artist_names
def annotation_to_mat(ref_intervals, ref_labels, beats, sr=SAMPLE_RATE, hop_length=HOP_SIZE): truth_dict = {} for label, interval in zip(ref_labels, ref_intervals): frames = librosa.time_to_frames(interval, sr=sr, hop_length=hop_length) if label in truth_dict: truth_dict[label]['times'].append(interval) truth_dict[label]['frames'].append(frames) else: truth_dict[label] = {'times': [interval], 'frames': [frames]} for k in truth_dict: truth_dict[k]['beats'] = [] for interval in truth_dict[k]['frames']: beat_interval = np.argmin(np.abs([interval[0]-beats, interval[1]-beats]), axis=1) truth_dict[k]['beats'].append(beat_interval) truth_mat_beat = np.zeros((len(beats)-1, len(beats)-1)) for k in truth_dict: for i_b in truth_dict[k]['beats']: for j_b in truth_dict[k]['beats']: if np.array_equal(i_b, j_b): truth_mat_beat[i_b[0]:i_b[1], j_b[0]:j_b[1]] = 1 else: truth_mat_beat[i_b[0]:i_b[1], j_b[0]:j_b[1]] = 0.9 return truth_mat_beat
def compute_all_features(file_struct, sonify_beats=False, overwrite=False, out_beats="out_beats.wav"): """Computes all the features for a specific audio file and its respective human annotations. It creates an audio file with the sonified estimated beats if needed. Parameters ---------- file_struct: FileStruct Object containing all the set of file paths of the input file. sonify_beats: bool Whether to sonify the beats. overwrite: bool Whether to overwrite previous features JSON file. out_beats: str Path to the new file containing the sonified beats. """ # Output file out_file = file_struct.features_file if os.path.isfile(out_file) and not overwrite: return # Do nothing, file already exist and we are not overwriting it # Compute the features for the given audio file features = compute_features_for_audio_file(file_struct.audio_file) # Save output as audio file if sonify_beats: logging.info("Sonifying beats...") fs = 44100 audio, sr = librosa.load(file_struct.audio_file, sr=fs) msaf.utils.sonify_clicks(audio, features["beats"], out_beats, fs, offset=0.0) # Read annotations if they exist in path/references_dir/file.jams if os.path.isfile(file_struct.ref_file): jam = jams2.load(file_struct.ref_file) # If beat annotations exist, compute also annotated beatsync features if jam.beats != []: logging.info("Reading beat annotations from JAMS") annot = jam.beats[0] annot_beats = [] for data in annot.data: annot_beats.append(data.time.value) annot_beats = np.unique(annot_beats) annot_beats_idx = librosa.time_to_frames( annot_beats, sr=msaf.Anal.sample_rate, hop_length=msaf.Anal.hop_size) features["ann_mfcc"], features["ann_hpcp"], \ features["ann_tonnetz"], features["ann_cqt"],\ features["ann_gmt"] = \ compute_beat_sync_features(features, annot_beats_idx) # Save output as json file save_features(out_file, features)
def bpm(self): """Computes tempo of a signal in Beats Per Minute with its tempo onsets""" self.onsets_strength() n = len(self.envelope) win_length = np.asscalar(time_to_frames(8.0, self.fs, self.H)) ac_window = hann(win_length) self.envelope = np.pad(self.envelope, int(win_length // 2),mode='linear_ramp', end_values=[0, 0]) frames = 1 + int((len(self.envelope) - win_length) / 1) f = [] for i in range(win_length): f.append(self.envelope[i:i+frames]) f = np.array(f)[:,:n] self.windowed_x = f * ac_window[:, np.newaxis] self.autocorrelation() tempogram = np.mean(self.correlation, axis = 1, keepdims = True) bin_frequencies = np.zeros(int(tempogram.shape[0]), dtype=np.float) bin_frequencies[0] = np.inf bin_frequencies[1:] = 60.0 * self.fs / (self.H * np.arange(1.0, tempogram.shape[0])) prior = np.exp(-0.5 * ((np.log2(bin_frequencies) - np.log2(80)) / bin_frequencies[1:].std())**2) max_indexes = np.argmax(bin_frequencies < 208) min_indexes = np.argmax(bin_frequencies < 80) prior[:max_indexes] = 0 prior[min_indexes:] = 0 p = prior.nonzero() best_period = np.argmax(tempogram[p] * prior[p][:, np.newaxis] * -1, axis=0) self.tempo = bin_frequencies[p][best_period] period = round(60.0 * (self.fs/self.H) / self.tempo[0]) window = np.exp(-0.5 * (np.arange(-period, period+1)*32.0/period)**2) localscore = convolve(self.envelope/self.envelope.std(ddof=1), window, 'same') backlink, cumscore = dp(localscore, period, 100) self.ticks = [last_beat(cumscore)] while backlink[self.ticks[-1]] >= 0: self.ticks.append(backlink[self.ticks[-1]]) self.ticks = np.array(self.ticks[::-1], dtype=int) self.ticks = trim_beats(localscore, self.ticks, False) * self.H if not len(self.ticks) >= 2: raise ValueError(("Only found one single onset, can't make sure if the beat is correct")) interv_value = self.ticks[1] - self.ticks[0] #these are optimal beat locations interval = 0 self.ticks = [] for i in range(int(self.signal.size/interv_value)): self.ticks.append(interval + interv_value) interval += interv_value #compute tempo frames locations based on the beat location value self.ticks = np.array(self.ticks) / self.fs return self.tempo, self.ticks
def compute_features(audio_file, intervals, level): """Computes the subseg-sync cqt features from the given audio file, if they are not previously computed. Saves the results in the feat_dir folder. Parameters ---------- audio_file : str Path to the audio file. intervals : np.array Intervals containing the estimated boundaries. level : str Level in the hierarchy. Returns ------- cqgram : np.array Subseg-sync constant-Q power spectrogram. intframes : np.array The frame indeces. """ # Check if features have already been computed if level == "small_scale": features_file = os.path.join(features_dir, os.path.basename(audio_file).split('.')[0] + "_small_scale.mp3.pk") else: features_file = os.path.join(features_dir, os.path.basename(audio_file) + ".pk") if os.path.isfile(features_file): return read_features(features_file) y, sr = librosa.load(audio_file, sr=11025) # Default hopsize is 512 hopsize = 512 cqgram = librosa.logamplitude(librosa.cqt(y, sr=sr, hop_length=hopsize)**2, ref_power=np.max) # Track beats y_harmonic, y_percussive = librosa.effects.hpss(y) tempo, beats = librosa.beat.beat_track(y=y_percussive, sr=sr, hop_length=hopsize) # Synchronize cqgram = librosa.feature.sync(cqgram, beats, aggregate=np.median) intframes = None if intervals is not None: # convert intervals to frames intframes = librosa.time_to_frames(intervals, sr=sr, hop_length=hopsize) # Match intervals to subseg points intframes = librosa.util.match_events(intframes, beats) # Save the features save_features(cqgram, intframes, beats, features_file) return cqgram, intframes
def encode_intervals(self, duration, intervals, values, dtype=np.bool):
    frames = librosa.time_to_frames(intervals, sr=self.sr, hop_length=self.hop_length)
    n_total = int(librosa.time_to_frames(duration, sr=self.sr, hop_length=self.hop_length))
    values = values.astype(dtype)
    target = np.empty((n_total, values.shape[1]), dtype=dtype)
    target.fill(fill_value(dtype))
    for column, interval in zip(values, frames):
        target[interval[0]:interval[1]] += column
    return target
def patch_label(start, end, time_windows, annotate, binary=False, threshold=None): """Labeling a patch given annotation Args: start(float): start time of a patch (in second) end(float): end time of a patch (in second) time_windows(float): time windows for average (in milliseconds) annotation(DataFrame): annotation dataframe for a specific song songname(string): song_name(string) Returns: label(pd.DataFrame): column: instrument index: label eg: S01 S02 label 1 0.93 """ #Transfer time to frame annotation = copy.copy(annotate) start_frame = librosa.time_to_frames(start, sr=1/0.0464, hop_length=1)+1 end_frame = librosa.time_to_frames(end, sr=1/0.0464, hop_length=1)-1 moving_frame = librosa.time_to_frames(time_windows/1000, sr=1/0.0464, hop_length=1)-librosa.time_to_frames(0, sr=1/0.0464, hop_length=1) #Pick annotation annotation = annotation.reset_index(drop=True) annotation.index += 1 time_annot = annotation.loc[start_frame:end_frame].drop('time', 1) #Using maximum value of average in moving windows as label music_ins = time_annot.columns label = pd.DataFrame(index = ['label',], columns=music_ins) for j in range(len(time_annot.columns)): label_temp = max([sum(list(time_annot.ix[:, j])[i:i+moving_frame+1])/float(moving_frame+1) for i in range(len(time_annot.ix[:, j])-moving_frame)]) #binary output if binary and threshold: if label_temp >= threshold: label_temp = float(1) else: label_temp = float(0) label.ix[:, j] = label_temp return label
def encode_events(self, duration, events, values, dtype=np.bool):
    '''Encode labeled events as a time-series matrix.

    Parameters
    ----------
    duration : number
        The duration of the track
    events : ndarray, shape=(n,)
        Time index of the events
    values : ndarray, shape=(n, m)
        Values array. Must have the same first index as `events`.
    dtype : numpy data type

    Returns
    -------
    target : ndarray, shape=(n_frames, n_values)
    '''
    # FIXME: support sparse encoding
    frames = librosa.time_to_frames(events, sr=self.sr, hop_length=self.hop_length)
    n_total = int(librosa.time_to_frames(duration, sr=self.sr, hop_length=self.hop_length))
    target = np.empty((n_total, values.shape[1]), dtype=dtype)
    target.fill(fill_value(dtype))
    values = values.astype(dtype)
    for column, event in zip(values, frames):
        target[event] += column
    return target
def __test(sr, hop_length, n_fft):
    # Generate frames at times 0s, 1s, 2s
    times = np.arange(3)
    frames = librosa.time_to_frames(times, sr=sr, hop_length=hop_length, n_fft=n_fft)
    if n_fft:
        frames -= n_fft // (2 * hop_length)
    # we need to be within one frame: convert frames back to seconds and compare
    assert np.all(np.abs(times - frames * hop_length / float(sr)) * sr < hop_length)
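# A standalone sketch (not part of the test suite) of the round trip the test above checks:
# time -> frame -> time quantizes to frame boundaries, so the error stays below one hop.
import numpy as np
import librosa

sr, hop_length = 22050, 512
times = np.arange(3)                                         # 0 s, 1 s, 2 s
frames = librosa.time_to_frames(times, sr=sr, hop_length=hop_length)
recovered = librosa.frames_to_time(frames, sr=sr, hop_length=hop_length)
assert np.all(np.abs(times - recovered) < hop_length / sr)   # within one frame (~23 ms here)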
def _get_beats(self):
    """
    Gets beats using librosa's beat tracker.
    """
    _, beat_frames = librosa.beat.beat_track(y=self.analysis_samples,
                                             sr=self.analysis_sample_rate,
                                             trim=False)
    # pad beat times to full duration
    f_max = librosa.time_to_frames(self.duration, sr=self.analysis_sample_rate)
    beat_frames = librosa.util.fix_frames(beat_frames, x_min=0, x_max=f_max)
    # convert frames to times
    beat_times = librosa.frames_to_time(beat_frames, sr=self.analysis_sample_rate)
    # make the list of (start, duration) tuples that TimingList expects
    starts_durs = [(s, t - s) for (s, t) in zip(beat_times, beat_times[1:])]
    return starts_durs
def test_clicks(): def __test(times, frames, sr, hop_length, click_freq, click_duration, click, length): y = librosa.clicks(times=times, frames=frames, sr=sr, hop_length=hop_length, click_freq=click_freq, click_duration=click_duration, click=click, length=length) if times is not None: nmax = librosa.time_to_samples(times, sr=sr).max() else: nmax = librosa.frames_to_samples(frames, hop_length=hop_length).max() if length is not None: assert len(y) == length elif click is not None: assert len(y) == nmax + len(click) test_times = np.linspace(0, 10.0, num=5) # Bad cases yield raises(librosa.ParameterError)(__test), None, None, 22050, 512, 1000, 0.1, None, None yield raises(librosa.ParameterError)(__test), test_times, None, 22050, 512, 1000, 0.1, np.ones((2, 10)), None yield raises(librosa.ParameterError)(__test), test_times, None, 22050, 512, 1000, 0.1, None, 0 yield raises(librosa.ParameterError)(__test), test_times, None, 22050, 512, 0, 0.1, None, None yield raises(librosa.ParameterError)(__test), test_times, None, 22050, 512, 1000, 0, None, None for sr in [11025, 22050]: for hop_length in [512, 1024]: test_frames = librosa.time_to_frames(test_times, sr=sr, hop_length=hop_length) for click in [None, np.ones(sr // 10)]: for length in [None, 5 * sr, 15 * sr]: yield __test, test_times, None, sr, hop_length, 1000, 0.1, click, length yield __test, None, test_frames, sr, hop_length, 1000, 0.1, click, length
def evaluate_phrases(benchmark, detected, window):
    # return score based on how well the detected phrases match up to the benchmark
    # hits / maxHits - misses / maxMisses
    # convert to frames
    benchmark = librosa.time_to_frames(benchmark, hop_length=256)
    print(benchmark)
    print(detected)
    hits = 0
    for i in range(0, len(benchmark)):
        target = benchmark[i]
        h_i = hitIndex(target, detected, window)
        if (h_i != -1):
            hits += 1
    max_hits = len(benchmark)
    max_misses = len(benchmark) + len(detected)
    misses = max_misses - (hits * 2)
    return (hits / max_hits) - (misses / max_misses)
def transform(self, jam, query=None): anns = [] if query: results = jam.search(**query) else: results = jam.annotations # Find annotations that can be coerced to our target namespace for ann in results: try: anns.append(jams.nsconvert.convert(ann, self.namespace)) except jams.NamespaceError: pass duration = jam.file_metadata.duration # If none, make a fake one if not anns: anns = [self.empty(duration)] # Apply transformations results = [] for ann in anns: results.append(self.transform_annotation(ann, duration)) # If the annotation range is None, it spans the entire track if ann.time is None or ann.duration is None: valid = [0, duration] else: valid = [ann.time, ann.time + ann.duration] results[-1]['_valid'] = librosa.time_to_frames(valid, sr=self.sr, hop_length=self.hop_length) # Prefix and collect return self.merge(results)
def get_smatrix_diagonal(f, r): #param: f=feature array, r=radius/resolution of of matrix sampled from the diagonal in seconds #optimized based on the assumption that only information along the diagonal of the matrix is important print('Computing S-Matrix Diagonal') #convert r in seconds to r in frames r_f = librosa.time_to_frames(np.array([r]), sr=22050, hop_length=256)[0] dim = len(f[0]) if (r_f > dim): r_f = -dim matrix = np.zeros([dim, dim]) i_max = int(dim) for i in range(0,dim): sys.stdout.write("\r" + str(int((i+1)/dim*100)) + '%') sys.stdout.flush() for j in range(0,r_f*2): i_r = i j_r = max(min(j + i-r_f,dim-1),0) matrix[i_r][j_r] = feature_distance(f[:,i_r], f[:,j_r]) sys.stdout.write("\n") return matrix
start, end, labels = [], [], []
with open('dizquefuiporai8.lab') as infile:
    print("Start / End / Label")
    for line in infile:
        print(line)
        fields = line.split()
        start.append(float(fields[0]))
        end.append(float(fields[1]))
        labels.append(int(fields[2]))

start = np.array(start)
end = np.array(end)
labels = np.array(labels)

# %% Convert the time stamps into frame indices
start_frames = librosa.time_to_frames(start, sr=sr)
end_frames = librosa.time_to_frames(end, sr=sr)

# %% Overlay the section markers with the mel-frequency spectrogram
plt.figure(figsize=(12, 6))
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel', n_xticks=20)

# Overlay with the detected beats
colors = ['r', 'c', 'orange', 'b', 'k', 'g', 'm', 'y']
for i in range(len(labels)):
    lines = np.arange(start_frames[i], end_frames[i])
    plt.vlines(lines, 0, log_S.shape[0], colors=colors[labels[i] % len(colors)],
               linestyles='-', linewidth=2, alpha=0.01)
def compute_features(audio_file): """Computes the onsets and the MFCC and CQT onset-synchronous features from the given audio file path. Parameters ---------- audio_file : str Path to the audio file Returns ------- y : np.array Audio samples onset_times : np.array Onset times in seconds mfcc_sync : np.array MFCC synchronized to the onsets cqt_sync : np.array CQT features synchronized to the onsets chroma_sync : np.array Chroma features synchronized to the onsets """ # Read audio file y, sr = librosa.load(audio_file, sr=SRATE) # Detect onset onsets = librosa.onset.onset_detect(y, sr=SRATE, hop_length=HOP_SIZE) onset_times = librosa.frames_to_time(onsets, sr=SRATE, hop_length=HOP_SIZE) # Add first and last onsets (start and end of track) dur = librosa.core.get_duration(y=y, sr=SRATE, hop_length=HOP_SIZE) if onset_times[0] != 0: onset_times = np.concatenate(([0], onset_times)) if onset_times[-1] != dur: onset_times = np.concatenate((onset_times, [dur])) # Compute MFCC (timbre features) mfcc = librosa.feature.mfcc(y=y, sr=SRATE, hop_length=HOP_SIZE, n_mfcc=N_MFCC) # Compute Constant-Q Transform cqt = librosa.logamplitude(librosa.cqt(y, sr=SRATE, hop_length=HOP_SIZE, n_bins=CQT_BINS) ** 2, ref_power=np.max) # Compute chromagram y_harmonic, y_percussive = librosa.effects.hpss(y) chroma = librosa.feature.chroma_cqt(y=y_harmonic, sr=SRATE, hop_length=HOP_SIZE) # Synchronize features to onsets mfcc_sync = librosa.feature.sync( mfcc, librosa.time_to_frames(onset_times, sr=SRATE, hop_length=HOP_SIZE), pad=False) cqt_sync = librosa.feature.sync( cqt, librosa.time_to_frames(onset_times, sr=SRATE, hop_length=HOP_SIZE), pad=False) chroma_sync = librosa.feature.sync( chroma, librosa.time_to_frames(onset_times, sr=SRATE, hop_length=HOP_SIZE), pad=False) return y, onset_times, mfcc_sync, cqt_sync, chroma_sync
def features(filename): '''Feature-extraction for audio segmentation Arguments: filename -- str path to the input song Returns: - X -- ndarray beat-synchronous feature matrix: MFCC (mean-aggregated) Chroma (median-aggregated) Latent timbre repetition Latent chroma repetition Time index Beat index - beat_times -- array mapping of beat index => timestamp includes start and end markers (0, duration) ''' def compress_data(X, k): e_vals, e_vecs = scipy.linalg.eig(X.dot(X.T)) e_vals = np.maximum(0.0, np.real(e_vals)) e_vecs = np.real(e_vecs) idx = np.argsort(e_vals)[::-1] e_vals = e_vals[idx] e_vecs = e_vecs[:, idx] # Truncate to k dimensions if k < len(e_vals): e_vals = e_vals[:k] e_vecs = e_vecs[:, :k] # Normalize by the leading singular value of X Z = np.sqrt(e_vals.max()) if Z > 0: e_vecs = e_vecs / Z return e_vecs.T.dot(X) # Harmonic waveform def harmonify(y): D = librosa.stft(y) return librosa.istft(librosa.decompose.hpss(D)[0]) # HPSS waveforms def hpss_wav(y): H, P = librosa.decompose.hpss(librosa.stft(y)) return librosa.istft(H), librosa.istft(P) # Beats and tempo def get_beats(y): odf = librosa.onset.onset_strength(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_BEATS, n_mels=N_MELS, fmax=FMAX, aggregate=np.median) bpm, beats = librosa.beat.beat_track(onset_envelope=odf, sr=sr, hop_length=HOP_BEATS) return bpm, beats # MFCC features def get_mfcc(y): # Generate a mel-spectrogram S = librosa.feature.melspectrogram(y, sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS, fmax=FMAX).astype(np.float32) # Put on a log scale S = librosa.logamplitude(S, ref_power=S.max()) return librosa.feature.mfcc(S=S, n_mfcc=N_MFCC) # Chroma features def chroma(y): # Build the wrapper CQT = np.abs(librosa.cqt(y, sr=SR, resolution=NOTE_RES, hop_length=HOP_LENGTH, fmin=NOTE_MIN, n_bins=NOTE_NUM)) C_to_Chr = librosa.filters.cq_to_chroma(CQT.shape[0], n_chroma=N_CHROMA) return librosa.logamplitude(librosa.util.normalize(C_to_Chr.dot(CQT))) # Latent factor repetition features def repetition(X, metric='seuclidean'): R = librosa.segment.recurrence_matrix(X, k=2 * int(np.ceil(np.sqrt(X.shape[1]))), width=REP_WIDTH, metric=metric, sym=False).astype(np.float32) P = scipy.signal.medfilt2d(librosa.segment.structure_feature(R), [1, REP_FILTER]) # Discard empty rows. # This should give an equivalent SVD, but resolves some numerical instabilities. 
P = P[P.any(axis=1)] return compress_data(P, N_REP) print '\t[1/6] loading audio' # Load the waveform y, sr = librosa.load(filename, sr=SR) # Compute duration duration = float(len(y)) / sr print '\t[2/6] Separating harmonic and percussive signals' # Separate signals y_harm, y_perc = hpss_wav(y) print '\t[3/6] detecting beats' # Get the beats bpm, beats = get_beats(y_perc) # augment the beat boundaries with the starting point beats = np.unique(np.concatenate([ [0], beats])) B = librosa.frames_to_time(beats, sr=SR, hop_length=HOP_BEATS) beat_frames = np.unique(librosa.time_to_frames(B, sr=SR, hop_length=HOP_LENGTH)) # Stash beat times aligned to the longer hop lengths B = librosa.frames_to_time(beat_frames, sr=SR, hop_length=HOP_LENGTH) print '\t[4/6] generating MFCC' # Get the MFCCs M = get_mfcc(y) # Beat-synchronize the features M = librosa.feature.sync(M, beat_frames, aggregate=np.mean) print '\t[5/6] generating chroma' # Get the chroma from the harmonic component C = chroma(y_harm) # Beat-synchronize the features C = librosa.feature.sync(C, beat_frames, aggregate=np.median) # Time-stamp features N = np.arange(float(len(beat_frames))) # Beat-synchronous repetition features print '\t[6/6] generating structure features' R_timbre = repetition(librosa.feature.stack_memory(M)) R_chroma = repetition(librosa.feature.stack_memory(C)) # Stack it all up X = np.vstack([M, C, R_timbre, R_chroma, B, B / duration, N, N / len(beats)]) # Add on the end-of-track timestamp B = np.concatenate([B, [duration]]) return X, B
import librosa
import librosa.display

#############################################
# Load an example with vocals.
y, sr = librosa.load('audio/Cheese_N_Pot-C_-_16_-_The_Raps_Well_Clean_Album_Version.mp3',
                     duration=120)

# And compute the spectrogram magnitude and phase
S_full, phase = librosa.magphase(librosa.stft(y))

#######################################
# Plot a 5-second slice of the spectrum
idx = slice(*librosa.time_to_frames([30, 35], sr=sr))
plt.figure(figsize=(12, 4))
librosa.display.specshow(librosa.amplitude_to_db(S_full[:, idx], ref=np.max),
                         y_axis='log', x_axis='time', sr=sr)
plt.colorbar()
plt.tight_layout()

###########################################################
# The wiggly lines above are due to the vocal component.
# Our goal is to separate them from the accompanying
# instrumentation.
#
# We'll compare frames using cosine similarity, and aggregate similar frames
# by taking their (per-frequency) median value.
#
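# The excerpt above stops just before the separation itself. A sketch of the continuation,
# following the same soft-mask recipe used in the other excerpts in this collection
# (the 2-second nn_filter width and the 2/10 margins are taken from those excerpts,
# not from this fragment):
S_filter = librosa.decompose.nn_filter(S_full,
                                       aggregate=np.median,
                                       metric='cosine',
                                       width=int(librosa.time_to_frames(2, sr=sr)))
S_filter = np.minimum(S_full, S_filter)  # the filter output cannot exceed the input magnitude
margin_i, margin_v = 2, 10
power = 2
mask_i = librosa.util.softmask(S_filter, margin_i * (S_full - S_filter), power=power)
mask_v = librosa.util.softmask(S_full - S_filter, margin_v * S_filter, power=power)
S_background = mask_i * S_full  # accompaniment estimate
S_foreground = mask_v * S_full  # vocal estimate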
# Debug?
DEBUG_PLOT = False

# Set some params
FS = 44100  # Enforce 44.1 kHz sample rate
N_FFT = 2048
HOP_LENGTH = N_FFT/2  # 50% overlap
N_MFCC = 13
N_MEL = 128
DB_LOW = -250.0  # silence in dB
T_CONTEXT = 3  # seconds of context for our features
N_FRAME_CONTEXT = librosa.time_to_frames(
    T_CONTEXT, sr=FS, hop_length=HOP_LENGTH, n_fft=N_FFT
)[0] + 1  # 64 frames on either side, for context
BOUNDARY_KERNEL = signal.gaussian(N_FRAME_CONTEXT, std=32)  # For smoothing our y
# BOUNDARY_KERNEL = np.ones(N_FRAME_CONTEXT)
DTYPE = 'float32'

# FOR USE ON AMAZON EC2 AFTER COPYING FROM S3
# DATADIR = os.path.abspath(os.path.join('/mnt','audio'))
# SALAMIDIR = os.path.abspath(os.path.join('/mnt','salami', 'salami-data-public'))

# FOR USE ON WINDOWS MACHINE
DATADIR = os.path.abspath('F:\salami-audio')
SALAMIDIR = os.path.abspath('F:\salami-data-public')
def predict(self, filename=None, y=None, sr=None, outputs=None): '''Chord prediction Parameters ---------- filename : str Path to the audio file to analyze y, sr : np.ndarray, number>0 Audio signal in memory to analyze outputs : dict `{str: np.ndarray}` Pre-computed model outputs, as given by ``ChordModel.outputs``. .. note:: At least one of `filename`, `y, sr`, or `outputs` must be provided. Returns ------- jams.Annotation, namespace='chord' The chord estimate for the given signal. Examples -------- >>> import crema >>> import librosa >>> model = crema.models.chord.ChordModel() >>> chord_est = model.predict(filename=librosa.util.example_audio_file()) >>> chord_est <Annotation(namespace='chord', time=0, duration=61.4, annotation_metadata=<AnnotationMetadata(...)>, data=<45 observations>, sandbox=<Sandbox(...)>)> >>> chord_est.to_dataframe().head(5) time duration value confidence 0 0.000000 0.092880 E:maj 0.336977 1 0.092880 0.464399 E:7 0.324255 2 0.557279 1.021678 E:min 0.448759 3 1.578957 2.693515 E:maj 0.501462 4 4.272472 1.486077 E:min 0.287264 ''' if outputs is None: outputs = self.outputs(filename=filename, y=y, sr=sr) output_key = self.model.output_names[0] pump_op = self.pump[output_key] ann = super(ChordModel, self).predict(y=y, sr=sr, filename=filename, outputs=outputs) bass_pred = outputs['chord_bass'] # Handle inversion estimation for obs in ann.pop_data(): start, end = time_to_frames([obs.time, obs.time + obs.duration], sr=pump_op.sr, hop_length=pump_op.hop_length) value = obs.value if obs.value not in ('N', 'X'): mean_bass = gmean(bass_pred[start:end+1]) bass_pc = np.argmax(mean_bass) root_pc, pitches, _ = mir_eval.chord.encode(obs.value) bass_rel = 0 if bass_pc < 12: bass_rel = np.mod(bass_pc - root_pc, 12) if bass_rel and pitches[bass_rel]: value = '{}/{}'.format(value, SEMITONE_TO_SCALE_DEGREE[bass_rel]) ann.append(time=obs.time, duration=obs.duration, value=value, confidence=obs.confidence) return ann
def quantize_track(music, sr):
    quantization = np.arange(0, len(music) / float(sr), .1)
    quantization = [int(b) for b in librosa.time_to_frames(quantization)]
    return quantization
def serialize_song( sid, path, datadir=DATADIR, salamidir=SALAMIDIR, outputdir=OUTPUTDIR, prefix='data' ): """ serialize_data_chunk() Serializes a chunk of data on disk, given SIDs and corresponding paths. Arguments: sids : the SIDs (int list) paths : paths to sids audio files (string list) datadir : where the audio files are stored salamidir : i.e. the salami-data-public dir from a cloned SALAMI repo outputdir : for serialized data on disk prefix : prefix for serialized data file on disk Outputs: X_path : string paths to the serialized files X_shape : shape of data serialized in X_path y_path : string paths to the serialized files y_shape : shape of data serialized in y_path """ X, y = None, None X_path, X_shape, y_path, y_shape = None, None, None, None X_shape = [0, 1, N_MEL, N_FRAME_CONTEXT] y_shape = [0, 1] print "SID: {0},\tfile: {1}".format(sid, path) y_path = os.path.abspath( os.path.join(outputdir, prefix + str(sid) + '_y') ) X_path = os.path.abspath( os.path.join(outputdir, prefix + str(sid) + '_X') ) # Get the annotated segment times (sec) times = ev.id2segtimes( sid, ann_type="uppercase", salamipath=salamidir ) times_frames = librosa.time_to_frames( times, sr=FS, hop_length=HOP_LENGTH, n_fft=N_FFT ) # Get signal sig, fs = librosa.load( os.path.join(datadir, path), FS ) # Get feature frames sig_feat = librosa.feature.melspectrogram( y=sig, sr=fs, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MEL, fmax=1600 ) sig_feat = 20.0*np.log10(np.clip( sig_feat, a_min=1e-12, a_max=None)) # convert to dB sig_feat = sig_feat - np.max(sig_feat) # Normalize to 0dB sig_feat[sig_feat==-np.inf] = DB_LOW # screen out inf # Keep track of the number of frames for this song n_frames = sig_feat.shape[1] y_shape[0] = n_frames # increment the shape of our final output y data X_shape[0] = n_frames # increment the shape of our final output y data # Pad the frames, so we can have frames centered at the very start and # end of the song. sig_feat = np.hstack(( np.ones((N_MEL, N_FRAME_CONTEXT/2)) * DB_LOW, sig_feat, np.ones((N_MEL, N_FRAME_CONTEXT/2)) * DB_LOW )) # Generate the boundary indicator y = np.memmap( y_path, dtype=DTYPE, mode='w+', shape=tuple(y_shape) ) y[:] = np.zeros((n_frames,1))[:] # start with zeros y[np.minimum(times_frames,n_frames-1),0] = 1.0 if(DEBUG_PLOT): plt.figure(figsize=(10, 3)) plt.plot( y, label="Annotations" ) # Smooth y with the gaussian kernel y[:,0] = np.convolve( y[:,0], BOUNDARY_KERNEL, 'same') y[:,0] = np.minimum(y[:,0],1.0) # nothing above 1 if(DEBUG_PLOT): plt.plot( y, label="Smoothed" ) plt.xlabel("Frame number") plt.ylabel("Segment boundary strength") plt.legend() # plt.colorbar() plt.savefig('./seg.pdf', bbox_inches='tight') # plt.show() # Generate the training data X = np.memmap( X_path, dtype=DTYPE, mode='w+', shape=tuple(X_shape) ) for i_frame in xrange(n_frames): X[i_frame,0] = sig_feat[:,i_frame:i_frame+N_FRAME_CONTEXT] # debug plot if(DEBUG_PLOT): plt.figure() plt.subplot(211) plt.imshow(X[X.shape[0]/2,0]) plt.colorbar() plt.subplot(212) plt.plot(y) plt.show() # Flush our binary data to file X.flush() y.flush() return X_path, X_shape, y_path, y_shape
import numpy as np
import matplotlib.pyplot as plt

import librosa
import librosa.display

#######################################################################
# We'll use a track that has harmonic, melodic, and percussive elements
y, sr = librosa.load('audio/Karissa_Hobbs_-_09_-_Lets_Go_Fishin.mp3')

#######################################
# First, let's plot the original chroma
chroma_orig = librosa.feature.chroma_cqt(y=y, sr=sr)

# For display purposes, let's zoom in on a 15-second chunk from the middle of the song.
# Pass sr explicitly so the frame conversion matches the loaded signal.
idx = (slice(None), slice(*librosa.time_to_frames([45, 60], sr=sr)))

# And for comparison, we'll show the CQT matrix as well.
C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=12*3, n_bins=7*12*3))

plt.figure(figsize=(12, 4))
plt.subplot(2, 1, 1)
librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max)[idx],
                         y_axis='cqt_note', bins_per_octave=12*3)
plt.colorbar()
plt.subplot(2, 1, 2)
librosa.display.specshow(chroma_orig[idx], y_axis='chroma')
plt.colorbar()
plt.ylabel('Original')
plt.tight_layout()
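# A natural next step (sketched here, not part of the listing above) is to
# recompute chroma from the harmonic component only, which usually suppresses
# transient noise in the chromagram; a larger margin gives a more aggressive
# harmonic/percussive split.
y_harm = librosa.effects.harmonic(y=y, margin=8)
chroma_harm = librosa.feature.chroma_cqt(y=y_harm, sr=sr)

plt.figure(figsize=(12, 4))
plt.subplot(2, 1, 1)
librosa.display.specshow(chroma_orig[idx], y_axis='chroma')
plt.colorbar()
plt.ylabel('Original')
plt.subplot(2, 1, 2)
librosa.display.specshow(chroma_harm[idx], y_axis='chroma', x_axis='time')
plt.colorbar()
plt.ylabel('Harmonic')
plt.tight_layout()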
def analyze_features(input_file, features=None, analysis=None, PARAMETERS=None):
    '''Mid-level feature analysis'''

    # Binary mode so the pickled arrays load correctly
    with open(input_file, 'rb') as f:
        lowlevel = pickle.load(f)

    if analysis is None:
        analysis = {}

    if features is None:
        features = set(get_feature_names())

    # Beats might occur after the last hop
    # We'll clip anything that's too big
    beat_frames = librosa.time_to_frames(lowlevel['beat_times'],
                                         sr=lowlevel['PARAMETERS']['load']['sr'],
                                         hop_length=lowlevel['PARAMETERS']['stft']['hop_length'])
    beat_frames = np.clip(beat_frames, 0, lowlevel['mfcc'].shape[1] - 1)

    # Pad on a phantom 0 here
    beat_frames = np.unique(np.concatenate([[0], beat_frames]))

    analysis['beat_times'] = librosa.frames_to_time(beat_frames,
                                                    sr=lowlevel['PARAMETERS']['load']['sr'],
                                                    hop_length=lowlevel['PARAMETERS']['stft']['hop_length'])

    # Compute beat-sync features
    if 'beat_sync' in features:
        (analysis['beat_sync_mfcc'],
         analysis['beat_sync_mel_spectrogram'],
         analysis['beat_sync_cqt'],
         analysis['beat_sync_chroma']) = get_sync_features(lowlevel, beat_frames)

    onset_frames = librosa.time_to_frames(lowlevel['onsets'],
                                          sr=lowlevel['PARAMETERS']['load']['sr'],
                                          hop_length=lowlevel['PARAMETERS']['stft']['hop_length'])
    onset_frames = np.clip(onset_frames, 0, lowlevel['mfcc'].shape[1] - 1)
    onset_frames = np.unique(np.concatenate([[0], onset_frames]))

    analysis['onset_times'] = librosa.frames_to_time(onset_frames,
                                                     sr=lowlevel['PARAMETERS']['load']['sr'],
                                                     hop_length=lowlevel['PARAMETERS']['stft']['hop_length'])

    # Compute onset-sync features
    if 'onset_sync' in features:
        (analysis['onset_sync_mfcc'],
         analysis['onset_sync_mel_spectrogram'],
         analysis['onset_sync_cqt'],
         analysis['onset_sync_chroma']) = get_sync_features(lowlevel, onset_frames)

    if 'repetition_mfcc' in features:
        analysis['repetition_mfcc'] = get_repetition_features(analysis['beat_sync_mfcc'],
                                                              PARAMETERS['repetition']['mfcc']['n_history'],
                                                              PARAMETERS['repetition']['mfcc']['metric'],
                                                              PARAMETERS['repetition']['mfcc']['width'],
                                                              PARAMETERS['repetition']['mfcc']['kernel_size'],
                                                              PARAMETERS['repetition']['mfcc']['n_factors'])

    if 'repetition_chroma' in features:
        analysis['repetition_chroma'] = get_repetition_features(analysis['beat_sync_chroma'],
                                                                PARAMETERS['repetition']['chroma']['n_history'],
                                                                PARAMETERS['repetition']['chroma']['metric'],
                                                                PARAMETERS['repetition']['chroma']['width'],
                                                                PARAMETERS['repetition']['chroma']['kernel_size'],
                                                                PARAMETERS['repetition']['chroma']['n_factors'])

    if 'beat_neighbors' in features:
        analysis['mfcc_neighbors_beat'] = get_neighbors(analysis['beat_sync_mfcc'],
                                                        PARAMETERS['beat_neighbors']['k'],
                                                        PARAMETERS['repetition']['mfcc']['width'],
                                                        PARAMETERS['repetition']['mfcc']['metric'])

        analysis['chroma_neighbors_beat'] = get_neighbors(analysis['beat_sync_chroma'],
                                                          PARAMETERS['beat_neighbors']['k'],
                                                          PARAMETERS['repetition']['chroma']['width'],
                                                          PARAMETERS['repetition']['chroma']['metric'])

    if 'segments' in features:
        # Get the min and max number of segments
        k_min, k_max = get_segment_range(lowlevel['duration'],
                                         PARAMETERS['segments']['min_seg'],
                                         PARAMETERS['segments']['max_seg'])

        # Build the feature stack
        X_segment = get_segment_features(analysis, lowlevel,
                                         PARAMETERS['segments']['transformation'])

        # Get the segment boundaries for each k in the range
        segment_boundaries, analysis['segments_best'] = get_segments(X_segment, k_min, k_max)

        # Convert back to boundary times
        analysis['segment_time_tree'] = []
        analysis['segment_beat_tree'] = []

        # Pad the beat times so that we include all points of aggregation
        beat_times = np.unique(np.concatenate([analysis['beat_times'],
                                               [lowlevel['duration']]]))

        for level, bounds in enumerate(segment_boundaries):
            analysis['segment_beat_tree'].append(bounds)
            analysis['segment_time_tree'].append(beat_times[bounds])

        # Just to make it easy, copy over the best segmentation
        analysis['segment_times'] = analysis['segment_time_tree'][analysis['segments_best']]

    if 'vq' in features:
        # Load the transformer
        whitener, encoder, args = encoder_model(PARAMETERS['encoder']['transformation'],
                                                PARAMETERS['encoder']['n_quantizers'])

        lmdeltas = delta_features(lowlevel)

        analysis['frame_vq'] = encode_features(lmdeltas, whitener, encoder)
        analysis['vq_parameters'] = args

        # Construct a dense representation for summarization purposes
        dense_code = analysis['frame_vq'].toarray().astype(np.float32)

        analysis['onset_sync_vq'] = librosa.feature.sync(dense_code, onset_frames).astype(np.float32)
        analysis['beat_sync_vq'] = librosa.feature.sync(dense_code, beat_frames).astype(np.float32)
        analysis['track_vq'] = np.mean(dense_code, axis=1).astype(np.float32)

    PREV = analysis.get('PREVIOUS', {})

    if 'computed_features' in analysis:
        PREV['computed_features'] = analysis['computed_features']

    analysis['computed_features'] = features

    if 'PARAMETERS' in analysis:
        analysis['PREVIOUS'] = {'PARAMETERS': analysis['PARAMETERS'],
                                'ENVIRONMENT': analysis['ENVIRONMENT'],
                                'PREVIOUS': PREV}

    analysis['PARAMETERS'] = PARAMETERS
    analysis['ENVIRONMENT'] = ENVIRONMENT

    return analysis
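# get_sync_features is not shown here; as a rough sketch of the
# beat-synchronous aggregation it presumably performs, frame-level feature
# matrices can be reduced over beat intervals with librosa.util.sync
# (the function name and aggregation choice below are illustrative, not the
# project's actual code):
import numpy as np
import librosa

def sync_features_sketch(mfcc, chroma, beat_frames):
    """Median-aggregate frame-level features between consecutive beat frames."""
    beat_mfcc = librosa.util.sync(mfcc, beat_frames, aggregate=np.median)
    beat_chroma = librosa.util.sync(chroma, beat_frames, aggregate=np.median)
    return beat_mfcc, beat_chroma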
w_p_ratio = 4

# WHERE DO THESE THRESHOLD VALUES COME FROM?
beat_threshold = 15
period_threshold = 20
peak_window = 0.13

# BENCHMARK
with open('assets/phrase_intervals.json') as data_file:
    data = json.load(data_file)

bench = []
for p in data[file_id]:
    if p <= sample_duration:
        bench.append(p)
bench = librosa.time_to_frames(bench, hop_length=256)

# TEST CONSTANTS
if w > r:
    sys.exit('Window Resolution Mismatch')

# LOAD WAVEFORM
audio_path = 'assets/' + file_id + '.wav'
y, sr = librosa.load(audio_path, sr=None, duration=sample_duration)

w_f = librosa.time_to_frames([w], hop_length=256)[0]
f = extract_features(y)

# Get beats
y_harmonic, y_percussive = librosa.effects.hpss(y)
tempo, beats = librosa.beat.beat_track(y=y_percussive, sr=sr)
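# One way the benchmark frames might be used further down the script (a sketch
# only; the estimates below are a placeholder for the script's actual
# phrase-boundary detections): convert both the reference boundaries and the
# estimates back to seconds and score them with mir_eval's onset F-measure.
import mir_eval

est_times = librosa.frames_to_time(beats, sr=sr)            # beat_track uses hop_length=512 by default
ref_times = librosa.frames_to_time(bench, hop_length=256)   # mirrors the forward conversion above

# F-measure with a +/- 0.5 s tolerance around each reference boundary
f_score, precision, recall = mir_eval.onset.f_measure(ref_times, est_times, window=0.5)
print(f_score, precision, recall)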