def __test(y, top_db, ref, trim_duration):
    yt, idx = librosa.effects.trim(y, top_db=top_db, ref=ref)

    # Test for index position
    fidx = [slice(None)] * y.ndim
    fidx[-1] = slice(*idx.tolist())
    assert np.allclose(yt, y[tuple(fidx)])

    # Verify logamp
    rms = librosa.feature.rms(y=librosa.to_mono(yt), center=False)
    logamp = librosa.power_to_db(rms**2, ref=ref, top_db=None)
    assert np.all(logamp > -top_db)

    # Verify logamp
    rms_all = librosa.feature.rms(y=librosa.to_mono(y)).squeeze()
    logamp_all = librosa.power_to_db(rms_all**2, ref=ref, top_db=None)

    start = int(librosa.samples_to_frames(idx[0]))
    stop = int(librosa.samples_to_frames(idx[1]))
    assert np.all(logamp_all[:start] <= -top_db)
    assert np.all(logamp_all[stop:] <= -top_db)

    # Verify duration
    duration = librosa.get_duration(yt)
    assert np.allclose(duration, trim_duration, atol=1e-1), duration
def _slice_audio_by_interval(y: np.ndarray, sr: float, hop_length: int = 512,
                             segmentation_interval_s: float = 1.0,
                             **_kwargs) -> Tuple[np.ndarray, np.ndarray]:
    interval_samples: int = librosa.time_to_samples(segmentation_interval_s, sr=sr)
    total_samples: int = y.size  # y is monophonic
    num_segments: int = int(np.ceil(total_samples / interval_samples))
    onset_samples: np.ndarray = interval_samples * np.arange(num_segments)
    onset_frames: np.ndarray = librosa.samples_to_frames(onset_samples,
                                                         hop_length=hop_length)
    duration_samples: np.ndarray = interval_samples * np.ones_like(onset_frames)

    # adjust duration of last fragment to end of file
    remainder = total_samples % interval_samples
    if remainder == 0:
        # `total_samples` is divisible by `interval_samples`:
        # the ceil operation above was not needed
        pass
    else:
        # `total_samples` is not divisible by `interval_samples`: last slice is shorter
        duration_samples[-1] = remainder

    duration_frames: np.ndarray = librosa.samples_to_frames(duration_samples,
                                                            hop_length=hop_length)
    return onset_frames, duration_frames
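# Hedged usage sketch for the slicer above (not part of the original project);
# the file name and parameter values are illustrative assumptions.
import librosa

y, sr = librosa.load("example.wav", sr=None, mono=True)
onset_frames, duration_frames = _slice_audio_by_interval(
    y, sr, hop_length=512, segmentation_interval_s=1.0)
# onset_frames[i] / duration_frames[i] describe slice i in frame units;
# the last slice is shortened to end exactly at the end of the file.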
def gen_hihat(all_data, fs, fps, cand):
    # `hop_len`, `win_len` and `data` are module-level globals in the original script
    fps = librosa.samples_to_frames(fs, hop_length=hop_len, n_fft=win_len)
    fps = 100  # overrides the value computed above
    print(cand)
    proc = BeatTrackingProcessor(look_aside=0.2, fps=fps)
    act = RNNBeatProcessor()(all_data)
    beat_times = proc(act)
    song_len = librosa.samples_to_time(data.shape, sr=fs)[0]
    hihat = np.zeros(all_data.shape)
    idx = np.where(beat_times <= song_len)[0]
    new_beat_times = np.zeros(idx.shape)
    new_beat_times[idx] = beat_times[idx]
    beat_samples = librosa.time_to_samples(new_beat_times, sr=fs)
    start = librosa.frames_to_samples(cand[0], hop_length=hop_len, n_fft=win_len)
    end = librosa.frames_to_samples(cand[-1], hop_length=hop_len, n_fft=win_len)
    cand_len = end - start

    # mark every fourth beat (starting from the fourth) as a hi-hat position
    is_hihat = np.zeros(beat_samples.shape)
    i = 3
    while i < len(beat_samples):
        is_hihat[i] = 1
        i = i + 4

    for i, s in enumerate(beat_samples):
        if is_hihat[i] == 1:
            if s + cand_len > hihat.shape[0]:
                break
            hihat[s:s + cand_len] = data[start:end]
    return hihat, new_beat_times, beat_samples
def get_frame(self) -> int:
    if Beat.INDEX_VALUE == 'samples':
        return librosa.samples_to_frames(self.index, hop_length=util.HOP_LENGTH)
    elif Beat.INDEX_VALUE == 'time':
        return librosa.time_to_frames(self.index, sr=util.SAMPLE_RATE,
                                      hop_length=util.HOP_LENGTH)
    else:
        raise NotImplementedError("Only samples and time are supported")
def trackBeatsPer16thNote(x, bpm, sr=22050, hop_length=512, offset_16th_notes=0):
    """
    Beat-track so that, when rendered as clicks, a click can sound on every
    16th note. Each 16th note is assigned its own frame index.
    `offset_16th_notes` (the number of leading 16th notes) can be used to
    remove an anacrusis (pickup).
    """
    tempo, beat_samples = librosa.beat.beat_track(y=x, sr=sr, hop_length=hop_length,
                                                  start_bpm=bpm, units='samples')
    beat_frames_per_16th_note = []
    for i in range(len(beat_samples) - 1):
        # subdivide each beat interval into four 16th notes (five edge points)
        interval_per_16th_units = librosa.samples_to_frames(
            np.linspace(beat_samples[i], beat_samples[i + 1], 5),
            hop_length=hop_length)
        print(interval_per_16th_units[0:4])
        beat_frames_per_16th_note = np.hstack(
            (beat_frames_per_16th_note, interval_per_16th_units[0:4]))
    if offset_16th_notes > 0:
        beat_frames_per_16th_note = beat_frames_per_16th_note[offset_16th_notes:]
    return beat_frames_per_16th_note.astype(int)
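# Hedged usage sketch (illustrative only): render a click on every detected
# 16th note with librosa.clicks. The file name and BPM are assumptions.
import librosa
import soundfile as sf

x, sr = librosa.load("song.wav", sr=22050)
frames_16th = trackBeatsPer16thNote(x, bpm=120, sr=sr, hop_length=512)
clicks = librosa.clicks(frames=frames_16th, sr=sr, hop_length=512, length=len(x))
sf.write("song_with_clicks.wav", x + clicks, sr)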
def apply_trim_offset(self, frame):
    return (
        librosa.samples_to_frames(
            librosa.frames_to_samples(frame) + self.trim_offset
        )
        if self.trim_offset
        else frame
    )
def __test(x, y, hop_length, n_fft):
    y_test = librosa.samples_to_frames(x, hop_length=hop_length, n_fft=n_fft)
    assert np.allclose(y, y_test)
    y = np.asanyarray(y)
    assert y.shape == y_test.shape
    assert y.ndim == y_test.ndim
def convert_sample_to_nframes(y_sample_start, y_sample_end, **stft_args):
    n_fft = stft_args.get('n_fft', 2048)
    hop_length = stft_args.get('hop_length', n_fft // 4)
    return lr.samples_to_frames(np.array([y_sample_start, y_sample_end]),
                                hop_length=hop_length, n_fft=n_fft)
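# Hedged example call for the helper above (sample indices are made up):
# with the defaults n_fft=2048 and hop_length=n_fft//4, a sample index maps to
# frame (sample - n_fft // 2) // hop_length, matching librosa's centered frames.
start_frame, end_frame = convert_sample_to_nframes(0, 44100, n_fft=2048, hop_length=512)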
def get_downbeats(y, tempo, beat_frames, sr):
    measures = len(beat_frames) // BEATS
    beat_frames = librosa.samples_to_frames(beat_frames)
    onset_env = librosa.onset.onset_strength(y, sr=sr, aggregate=np.median)
    beat_strengths = onset_env[beat_frames]
    measure_beat_strengths = beat_strengths[:measures * BEATS].reshape(-1, BEATS)
    beat_pos_strength = np.sum(measure_beat_strengths, axis=0)
    downbeat_pos = np.argmax(beat_pos_strength)
    full_measure_beats = beat_frames[:measures * BEATS].reshape(-1, BEATS)
    downbeat_frames = full_measure_beats[:, downbeat_pos]
    return librosa.frames_to_samples(downbeat_frames)
def test_samples_to_frames(samples, hop_length, n_fft):
    frames = librosa.samples_to_frames(samples, hop_length=hop_length, n_fft=n_fft)
    samples = np.asanyarray(samples)
    assert frames.shape == samples.shape
    assert frames.ndim == samples.ndim
    if n_fft is None:
        assert np.allclose(samples, frames * hop_length)
    else:
        assert np.allclose((samples - n_fft // 2) // hop_length, frames)
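# Hedged illustration of the relationship asserted above (values are made up):
# frames are centered, so an explicit n_fft shifts samples left by n_fft // 2
# before the integer division by hop_length.
import numpy as np
import librosa

samples = np.array([0, 512, 2048, 22050])
print(librosa.samples_to_frames(samples, hop_length=512))              # samples // 512
print(librosa.samples_to_frames(samples, hop_length=512, n_fft=2048))  # (samples - 1024) // 512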
def read_data():
    os.chdir(DATA_PATH)
    os.chdir('Gitarre monophon/Samples/Distortion')
    train_data = []
    train_labels = []
    for file_name in os.listdir(os.getcwd()):
        if file_name.endswith(".wav"):
            print(file_name)

            # Labeling the sample with one hot encoding
            label_no = int(file_name[13])  # Effect setting is the label
            label = np.zeros([3])
            label[label_no - 1] = 1
            train_labels.append(label)

            # Loading the audio
            y, sr = librosa.load(file_name, sr=44100)

            # Onset Detection
            y = np.insert(y, 0, np.zeros(1023))
            y = librosa.util.normalize(y)
            onset_frame = librosa.onset.onset_detect(y=y, sr=sr, units='frames',
                                                     pre_max=20000, post_max=20000,
                                                     pre_avg=20000, post_avg=20000,
                                                     delta=0, wait=1000)
            offset_frame = librosa.samples_to_frames(samples=y.shape[0])
            onset_sample = librosa.core.frames_to_samples(onset_frame[0])
            offset_sample = librosa.core.frames_to_samples(offset_frame)
            y_cut = y[onset_sample:offset_sample]

            mfcc = librosa.feature.mfcc(y=y_cut, sr=sr, n_mfcc=2)
            mfcc_delta = librosa.feature.delta(mfcc)
            m_features = np.concatenate((mfcc, mfcc_delta))

            v_features = []
            for feat in m_features:
                lin_coeff, lin_residual, _, _, _ = np.polyfit(
                    np.arange(len(feat)), feat, 1, full=True)
                v_features.extend(lin_coeff)
                # v_features.append(lin_residual)
            train_data.append(np.hstack(v_features))

    train_data = np.array(train_data)
    train_labels = np.array(train_labels)
    return train_data, train_labels
def forward(self, signals, lengths):
    mel_features = self.mfcc(signals)
    if self.remove_zeroth_coef:
        mel_features = mel_features[:, 1:, :]
    device = lengths.device
    lengths_frames = librosa.samples_to_frames(lengths.cpu().numpy(),
                                               hop_length=self.hop_length,
                                               n_fft=self.n_fft)
    lengths_frames = torch.Tensor(lengths_frames).to(device).int()
    if self.use_deltas:
        delta = self.deltas(mel_features)
        delta2 = self.deltas(delta)
        mel_features = torch.cat((mel_features, delta, delta2), dim=-2)
    if self.normalize_features:
        mel_features = self.norm(mel_features)
    return mel_features, lengths_frames
def _compute_slice_durations(y: np.ndarray, sr: float, hop_length: float,
                             onsets: np.ndarray,
                             min_size_s: Optional[float] = None,
                             max_size_s: Optional[float] = None,
                             off_threshold_db: Optional[float] = None,
                             discard_by_mean: bool = True,
                             **_kwargs) -> Tuple[np.ndarray, np.ndarray]:
    """
    y: mono signal [shape: (n,)]
    onsets: onset frames
    """
    rms_frames_db = 20 * np.log10(
        np.abs(librosa.feature.rms(y=y, hop_length=hop_length))
        + librosa.util.tiny(y)).reshape(-1)
    eof = librosa.samples_to_frames(y.size, hop_length=hop_length)
    durations = np.diff(np.block([onsets, eof]))

    if max_size_s is not None:
        max_size_frames = librosa.time_to_frames(max_size_s, sr=sr,
                                                 hop_length=hop_length)
        durations[durations > max_size_frames] = max_size_frames

    if off_threshold_db is not None:
        for i in range(onsets.size):
            segment_rms = rms_frames_db[onsets[i]:onsets[i] + durations[i]]
            first_silent_frame = np.argmax(segment_rms < off_threshold_db)
            if discard_by_mean:
                # Only discard part of segment if mean of entire part to be
                # discarded is below threshold
                if np.mean(segment_rms[first_silent_frame:]) < off_threshold_db:
                    durations[i] = first_silent_frame
            else:
                # Discard part of segment starting from frame below threshold.
                # `np.argmax(a < v)` will by default return 0 if it doesn't find
                # any matches: therefore the check that the condition indeed is
                # fulfilled.
                durations[i] = first_silent_frame \
                    if segment_rms[first_silent_frame] < off_threshold_db \
                    else durations[i]

    if min_size_s is not None:
        valid_frames_mask = durations > min_size_s
        onsets = onsets[valid_frames_mask]
        durations = durations[valid_frames_mask]

    return onsets, durations
def retrieve_components(self, selection_order=None): if selection_order is None: return self.spectrogram if len(selection_order) > 0: max_val = max(selection_order) if max_val >= self.get_number_components(): raise ValueError("{} out of bounds for {} components", max_val, self.get_number_components()) mask = torch.zeros_like(self.spectrogram) unmask = torch.ones_like(self.spectrogram) # following the order of segments in [Mishra 2017] Figure 4 temp_length = mask.shape[1] // len(self.temporal_segments) freq_length = mask.shape[0] // self.n_frequency_segments left_over = mask.shape[1] - temp_length * len(self.temporal_segments) if left_over > 0: warnings.warn( "Adding last {} frames to last segment".format(left_over)) def compute_f_start(f): return f * freq_length def compute_f_end(f): return compute_f_start(f) + freq_length for so in selection_order: t = so // self.n_frequency_segments # index of temporal_segment # print("t", t) f = so % self.n_frequency_segments [t_start, t_end] = librosa.samples_to_frames(self.temporal_segments[t], hop_length=self.hop_length) if t == len(self.temporal_segments) - 1: t_end = mask.shape[1] # print("t_start {}, t_end{}".format(t_start, t_end)) f_start = compute_f_start(f) f_end = compute_f_end(f) mask[f_start:f_end, t_start:t_end] = 1. unmask[f_start:f_end, t_start:t_end] = 0. return self.spectrogram * mask + self.baseline * unmask
def length_convert(length: float, sr: int, units_def: LengthUnit,
                   units_target: LengthUnit, hop_length: int = 512) -> float:
    """Convert a length from one unit to another.

    Parameters
    ----------
    length : float
        Length expressed in `units_def`
    sr : int
    units_def : LengthUnit
        Units that are passed
    units_target : LengthUnit
        Units that are expected
    hop_length : int, optional
        512 by default, mandatory for frames conversion

    Returns
    -------
    float
    """
    if units_def == LengthUnit.samples:
        if units_target == LengthUnit.frames:
            return lr.samples_to_frames(length, hop_length)  # type:ignore
        if units_target == LengthUnit.ms:
            return lr.samples_to_time(length, sr)  # type:ignore
        return length
    if units_def == LengthUnit.ms:
        if units_target == LengthUnit.samples:
            return lr.time_to_samples(length, sr)  # type:ignore
        if units_target == LengthUnit.frames:
            return lr.time_to_frames(length, sr, hop_length)  # type:ignore
        return length
    if units_def == LengthUnit.frames:
        if units_target == LengthUnit.samples:
            return lr.frames_to_samples(length, hop_length)  # type:ignore
        if units_target == LengthUnit.ms:
            return lr.frames_to_time(length, sr, hop_length)  # type:ignore
        return length
    raise TypeError(f'not a LengthUnit: {units_def, units_target}')
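# Hedged usage sketch for the converter above; the LengthUnit members come from
# the signature and the numbers are illustrative. Note that the positional
# hop_length/sr arguments inside length_convert appear to target an older
# librosa API in which these conversion helpers were not keyword-only.
n_frames = length_convert(44100, sr=22050, units_def=LengthUnit.samples,
                          units_target=LengthUnit.frames, hop_length=512)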
def process_damp_data(artist_tracks_file):
    sys.path.append('../')
    import damp_config
    damp_data_dir = damp_config.vocal_audio_dir
    musdb_data_dir = damp_config.bg_audio_dir
    # musdb_data_dir = 'background_tracks'

    if not os.path.exists('damp_mashup_output'):
        os.makedirs('damp_mashup_output')

    train_dict = pickle.load(open(artist_tracks_file, 'rb'))
    vocal_paths = []
    for artist_id, track_list in train_dict.items():
        for track_id in track_list:
            vocal_track_path = os.path.join(damp_data_dir, track_id + '.m4a')
            mashability_result = find_mashup_pairs(vocal_track_path, musdb_data_dir)
            for start_sample, (bg_track, (bg_start_sample, bg_key, bg_ismajor)) in mashability_result.items():
                print(start_sample, bg_track, bg_start_sample)
                mixed_output = mash(vocal_track_path, start_sample, bg_track,
                                    bg_start_sample, 3.0)
                start_frame = librosa.samples_to_frames(
                    start_sample, hop_length=damp_config.hop_length,
                    n_fft=damp_config.n_fft)
                # librosa.output.write_wav(os.path.join(config.mix_audio_dir, Path(vocal_path).stem + '_' + str(start_frame) +'.wav'), mixed_output, sr=AUDIO_PARAMS['sr'])
                print(track_id, start_sample / 44100, start_frame)
                soundfile.write(
                    os.path.join(config.mix_audio_dir,
                                 track_id + '_' + str(start_frame) + '.wav'),
                    mixed_output, AUDIO_PARAMS['sr'], format='WAV')
def main_autoedit(args, **kwargs): """main_autoedit Complete autoedit flow ..todo:: - loop over chunks of input, batch is large single chunk - handle returned chunk data, integrate over time - chunk parallel processing (entire graph) vs. chunk serial processing (entire graph) - nodes with memory and nodes without - graph class - populate 'func' in graph w/ cached/non-cached funcs - step file / stream input: deal with chunking and collecting, refuse to work on files > maxlen (configurable) - openl3 """ # convert args to dict # kwargs = args_to_dict(args) # convert arguments to locals, TODO: config file for autoedit param dict # sr_comp = kwargs['sr_comp'] # 22050 # numsegs = kwargs['numsegs'] # 10 # duration = kwargs['duration'] # 10 # verbose = kwargs['verbose'] # seglen_min = time_to_frames(kwargs['seglen_min']) # seglen_max = time_to_frames(kwargs['seglen_max']) args = autoedit_args_check(args) seglen_min = time_to_frames(args.seglen_min) seglen_max = time_to_frames(args.seglen_max) timebase = "frames" spacer = '\n ' # caching # compute_music_extractor_essentia_cached = memory.cache(compute_music_extractor_essentia) # computation graph g g = OrderedDict() # populate graph with functions g['func'] = {} for func in [ compute_beats_librosa, compute_chroma_librosa, compute_onsets_librosa, compute_segments_essentia, compute_segments_librosa, data_load_essentia, data_load_librosa, ]: g['func'][func] = memory.cache(func) for func in [ compute_event_merge_combined, track_assemble_from_segments, track_assemble_from_segments_sequential_scale, ]: g['func'][func] = func # layer 1: file data g['l1_files'] = OrderedDict() for filename in args.filenames: # replace with basename filename_short = filename.split('/')[-1] if args.verbose: print(('main_autoedit{1}filename_short: {0}'.format( filename_short, spacer))) # files[filename_short] = compute_tempo_beats(filename) # load data # y, sr = data_load_essentia_cached(filename) # compute beatiness on data g['l1_files'][filename_short] = {} tmp_ = g['func'][data_load_essentia](filename, sr=args.sr_comp) # tmp_ = g['func'][data_load_librosa](filename, sr=args.sr_comp) g['l1_files'][filename_short]['data'] = tmp_[0] g['l1_files'][filename_short]['numsamples'] = len(tmp_[0]) g['l1_files'][filename_short]['numframes'] = samples_to_frames( len(tmp_[0])) g['l1_files'][filename_short]['sr'] = tmp_[1] if args.verbose: print( 'main_autoedit{5}loaded {0} with shape {1}, numsamples {2}, numframes {3}, sr {4}' .format(filename_short, g['l1_files'][filename_short]['data'].shape, g['l1_files'][filename_short]['numsamples'], g['l1_files'][filename_short]['numframes'], g['l1_files'][filename_short]['sr'], spacer)) # layer 2: compute chromagram g['l2_chromagram'] = {} for file_ in g['l1_files']: # file_key = '{0}-{1}'.format(file_, 'chromagram') g['l2_chromagram'][file_] = {} g['l2_chromagram'][file_]['data'] = g['func'][compute_chroma_librosa]( g['l1_files'][file_]['data'], args.sr_comp)['chromagram'] # layer 3: compute segments based on chromagram g['l3_segments'] = OrderedDict() for file_ in g['l2_chromagram']: # file_key = '{0}-{1}'.format(file_, 'segments') bounds_frames = g['func'][compute_segments_essentia]( g['l2_chromagram'][file_]['data'], args.sr_comp, args.numsegs)['bounds_frames'] # print((' file_: {0}, bounds_frames {1}, {2}'.format(file_, len(bounds_frames), pformat(bounds_frames)))) g['l3_segments'][file_] = {} g['l3_segments'][file_]['seg_sbic'] = np.clip(bounds_frames, 0, [ g['l1_files'][filename_short]['numframes'] for filename_short in g['l1_files'] ][0] 
- 1) bounds_frames = g['func'][compute_segments_librosa]( g['l2_chromagram'][file_]['data'], args.sr_comp, args.numsegs)['bounds_frames'] # print((' file_: {0}, bounds_frames {1}, {2}'.format(file_, len(bounds_frames), pformat(bounds_frames)))) g['l3_segments'][file_]['seg_clust_1'] = bounds_frames bounds_frames = g['func'][compute_segments_librosa]( g['l2_chromagram'][file_]['data'], args.sr_comp, args.numsegs + 5)['bounds_frames'] # print((' file_: {0}, bounds_frames {1}, {2}'.format(file_, len(bounds_frames), pformat(bounds_frames)))) g['l3_segments'][file_]['seg_clust_2'] = bounds_frames # layer 4: compute onsets g['l4_onsets'] = OrderedDict() for file_ in g['l1_files']: onsets = g['func'][compute_onsets_librosa]( g['l1_files'][file_]['data'], args.sr_comp) g['l4_onsets'][file_] = onsets # layer 5: compute beats based on onsets g['l5_beats'] = OrderedDict() for file_ in g['l4_onsets']: g['l5_beats'][file_] = {} for start_bpm in [60, 90, 120]: beats = g['func'][compute_beats_librosa]( g['l4_onsets'][file_]['onsets_env'], g['l4_onsets'][file_]['onsets_frames'], start_bpm, args.sr_comp) # print('beats type = {0}'.format(type(beats['beats']))) # beats['beats'] = beats['beats'][np.logical_not(np.isnan(beats['beats']))] # beats = beats[~np.isnan(beats)] # print(' file_: {0}, bounds_frames {1}, {2}'.format(file_, len(bounds_frames), pformat(bounds_frames))) g['l5_beats'][file_]['beats_{0}'.format( start_bpm)] = beats['beats'] g['l5_beats'][file_]['beats_{0}_16'.format( start_bpm)] = beats['beats'][::16] # layer 6: compute final segments from merging segments with beats g['l6_merge'] = OrderedDict() g['l6_merge']['files'] = [] for file_ in g['l1_files']: # get basedir from filename dirname = os.path.dirname(filename) # return realpath absolute path # dirname = os.path.dirname(os.path.realpath(filename)) if dirname == '': dirname = '.' 
if args.verbose: print(f'main_autoedit dirname {dirname}') print( f'main_autoedit{spacer}l6_merge file_ {file_}, dirname {dirname}, filename {filename}' ) beats_keys = ['beats_60', 'beats_90', 'beats_120' ] + ['beats_60_16', 'beats_90_16', 'beats_120_16'] # beats = [g['l5_beats'][file_][beat_type] for beat_type in beats_keys for file_ in g['l1_files']] beats = [g['l5_beats'][file_][beat_type] for beat_type in beats_keys] # segs = [g['l3_segments'][file_][seg_type_] for seg_type_ in ['seg_sbic', 'seg_clust_1', 'seg_clust_2'] for file_ in g['l1_files']] segs = [ g['l3_segments'][file_][seg_type_] for seg_type_ in ['seg_sbic', 'seg_clust_1', 'seg_clust_2'] ] numframes = g['l1_files'][file_]['numframes'] # compute if args.verbose: print( f'main_autoedit{spacer}l6_merge dirname {dirname}, filename {filename}' ) files = g['func'][compute_event_merge_combined]( filename_48=dirname + '/' + file_, beats=beats, segs=segs, numframes=numframes, numsegs=args.numsegs, verbose=args.verbose, sr_comp=args.sr_comp, rootdir=args.rootdir, ) g['l6_merge']['files'].extend(files['files']) if args.verbose: print('main_autoedit{2}l6_merge {0}, {1}'.format( file_, g['l6_merge']['files'], spacer)) # layer 7: compute assembled song from segments and duration g['l7_assemble'] = OrderedDict() # compute duration g['l6_merge']['duration'] = args.duration # output filename g['l6_merge']['filename_export'] = args.filename_export # crossfade argument g['l6_merge']['assemble_crossfade'] = args.assemble_crossfade # rootdir argument g['l6_merge']['rootdir'] = args.rootdir g['l6_merge']['verbose'] = args.verbose if args.assemble_mode == 'random': g['l7_assemble']['outfile'] = g['func'][track_assemble_from_segments]( **(g['l6_merge'])) elif args.assemble_mode == 'sequential': g['l7_assemble']['outfile'] = g['func'][ track_assemble_from_segments_sequential_scale](**(g['l6_merge'])) filename_export_wav = g['l7_assemble']['outfile']['filename_export_wav'] filename_export_txt = g['l7_assemble']['outfile']['filename_export_txt'] export_duration = g['l7_assemble']['outfile']['final_duration'] export_segs = g['l7_assemble']['outfile']['seg_s'] export_numsegs = len(g['l7_assemble']['outfile']['seg_s']) if 'pkl' in args.outputs: filename_export_graph = os.path.join( args.rootdir, f'{args.filename_export}_graph.pkl') if args.verbose: print( f'main_autoedit{spacer}exporting graph to {filename_export_graph}' ) joblib.dump(g, filename_export_graph) # # plot dictionary g as graph # autoedit_graph_from_dict(g=g, plot=False) ret = { 'data': { 'output_files': [ { 'format': 'wav', 'filename': os.path.basename(filename_export_wav) }, { 'format': 'txt', 'filename': os.path.basename(filename_export_txt) }, ], 'output_length': export_duration, 'output_numsegs': export_numsegs, } } if 'pkl' in args.outputs: ret['data']['output_files'].append({ 'format': 'pkl', 'filename': filename_export_graph }) # # yeah nice, should be obsolete # ret.update(g['l7_assemble']['outfile']) filename_result = os.path.join( args.rootdir, os.path.basename(args.filename_export) + ".json") # this saves the array in .json format json.dump( ret, codecs.open(filename_result, 'w', encoding='utf-8'), # separators=(',', ':'), # sort_keys=True, # indent=4, # cls=NumpyEncoder, ) if 'task' in kwargs: kwargs['task'].set_done( result_location=os.path.basename(args.filename_export) + ".json") return ret
def find_loop_pairs(self): runtime_start = time.time() S = librosa.core.stft(y=self.audio) S_power = np.abs(S) ** 2 S_weighed = librosa.core.perceptual_weighting( S=S_power, frequencies=librosa.fft_frequencies(sr=self.rate) ) mel_spectrogram = librosa.feature.melspectrogram(S=S_weighed, sr=self.rate, n_mels=128, fmax=8000) chroma = librosa.feature.chroma_stft(S=S_power) power_db = librosa.power_to_db(S_weighed, ref=np.median) onset_env = librosa.onset.onset_strength(S=mel_spectrogram) pulse = librosa.beat.plp(onset_envelope=onset_env) beats_plp = np.flatnonzero(librosa.util.localmax(pulse)) bpm, beats = librosa.beat.beat_track(onset_envelope=onset_env) beats = np.union1d(beats, beats_plp) beats = np.sort(beats) logging.info("Detected {} beats at {:.0f} bpm".format(beats.size, bpm)) min_duration = int(chroma.shape[-1] * self.min_duration_multiplier) runtime_end = time.time() prep_time = runtime_end - runtime_start logging.info("Finished initial audio processing in {:.3}s".format(prep_time)) candidate_pairs = [] deviation = np.linalg.norm(chroma[..., beats] * 0.085, axis=0) for idx, loop_end in enumerate(beats): for loop_start in beats: if loop_end - loop_start < min_duration: break dist = np.linalg.norm(chroma[..., loop_end] - chroma[..., loop_start]) if dist <= deviation[idx]: db_diff = self.db_diff( power_db[..., loop_end], power_db[..., loop_start] ) if db_diff <= 1.5: candidate_pairs.append( { "loop_start": loop_start, "loop_end": loop_end, "dB_diff": db_diff, "dist": (dist / deviation[idx]) } ) logging.info(f"Found {len(candidate_pairs)} possible loop points") if not candidate_pairs: return candidate_pairs beats_per_second = bpm / 60 num_test_beats = 12 seconds_to_test = num_test_beats / beats_per_second test_offset = librosa.samples_to_frames(int(seconds_to_test * self.rate)) # adjust offset for very short tracks to 25% of its length if test_offset > chroma.shape[-1]: test_offset = chroma.shape[-1] // 4 candidate_pairs = self._dB_prune(candidate_pairs) weights = _geometric_weights(test_offset, start=test_offset // num_test_beats) pair_score_list = [ self._pair_score( pair["loop_start"], pair["loop_end"], chroma, test_duration=test_offset, weights=weights, ) for pair in candidate_pairs ] # Add cosine similarity as score for pair, score in zip(candidate_pairs, pair_score_list): pair["score"] = score candidate_pairs = self._score_prune(candidate_pairs) # re-sort based on new score candidate_pairs = sorted(candidate_pairs, reverse=True, key=lambda x: x["score"]) # prefer longer loops for highly similar sequences if len(candidate_pairs) > 1: self._prioritize_duration(candidate_pairs) if self.trim_offset: for pair in candidate_pairs: pair["loop_start"] = self.apply_trim_offset( pair["loop_start"] ) pair["loop_end"] = self.apply_trim_offset( pair["loop_end"] ) for pair in candidate_pairs: logging.info( "Found from {} to {}, dB_diff:{}, similarity:{}".format( pair["loop_start"], pair["loop_end"], pair["dB_diff"], pair["score"], ) ) if not candidate_pairs: raise LoopNotFoundError(f'No loop points found for {self.filename} with current parameters.') else: return candidate_pairs
def __test(x, y, hop_length, n_fft):
    y_test = librosa.samples_to_frames(x, hop_length=hop_length, n_fft=n_fft)
    assert np.allclose(y, y_test)
def main_automix(args): """main_automix Perform complete automix flow with the following schema: 1. input list of audio files / text file containing list of audio files 2. loop over files 2.1. compute bag of measures for each file: beatiness, extractor essentia, features paa 2.2. sort files by selected feature args.sort_feature 2.3. assemble output wav from concatenating input files pydub 2.4. TODO: optional: local measures 2.4. TODO: optional: complexity / information measures smp/sequence """ # convert args to dict kwargs = args_to_dict(args) print('main_automix: kwargs {0}'.format(pformat(kwargs))) # flow graph g g = OrderedDict() # cached functions g['func'] = {} for func in [ compute_beats_librosa, compute_chroma_librosa, compute_features_paa, compute_music_extractor_essentia, compute_onsets_librosa, compute_segments_essentia, compute_segments_librosa, compute_tempo_beats_essentia, data_load_essentia, ]: g['func'][func] = memory.cache(func) # uncached functions for func in [ compute_event_merge_combined, track_assemble_from_segments, ]: g['func'][func] = func # input type: text file, list of files if len(kwargs['filenames']) == 1 and kwargs['filenames'][0].endswith('.txt'): filenames = [_.rstrip() for _ in open(kwargs['filenames'][0], 'r').readlines()] # print('filenames {0}'.format(pformat(filenames))) print('filenames {0}'.format(filenames)) else: filenames = kwargs['filenames'] # layer 1: file/chunk data g['l1_files'] = OrderedDict() for i, filename in enumerate(filenames): # print('filename {0}: {1}'.format(i, filename)) filename_short = filename.split('/')[-1] print(('file: {0}'.format(filename_short))) # load data # y, sr = g['func'][data_load_essentia](filename) g['l1_files'][filename_short] = {} tmp_ = g['func'][data_load_essentia](filename) g['l1_files'][filename_short]['path'] = filename g['l1_files'][filename_short]['data'] = tmp_[0] g['l1_files'][filename_short]['numframes'] = samples_to_frames(len(tmp_[0])) g['l1_files'][filename_short]['sr'] = tmp_[1] # layer 2: beatiness, compute beatiness on data # g['l2_beatiness'] = {} for file_ in g['l1_files']: # file_key = '{0}-{1}'.format(file_, 'beatiness') # g['l2_beatiness'][file_] = {} tmp_ = g['func'][compute_tempo_beats_essentia](g['l1_files'][file_]['data']) # g['l2_beatiness'][file_] = tmp_ # g['l1_files'][file_]['beatiness'] = tmp_ g['l1_files'][file_].update(dict([('beatiness' + _, tmp_[_]) for _ in tmp_])) # layer 3: extractor # g['l3_extractor'] = {} for file_ in g['l1_files']: print('l3_extractor on {0}'.format(file_)) # file_key = '{0}-{1}'.format(file_, 'extractor') # g['l2_extractor'][file_] = {} tmp_ = g['func'][compute_music_extractor_essentia](g['l1_files'][file_]['path']) # g['l3_extractor'][file_] = tmp_ # g['l1_files'][file_]['extractor'] = tmp_ g['l1_files'][file_].update(dict([('extractor_' + _, tmp_[_]) for _ in tmp_])) # layer 4: paa features # g['l4_paa_features'] = {} for file_ in g['l1_files']: # file_key = '{0}-{1}'.format(file_, 'extractor') # g['l4_paa_features'][file_] = {} tmp_ = g['func'][compute_features_paa](g['l1_files'][file_]['path']) # g['l4_paa_features'][file_]['features_st'] = dict(zip(tmp_[1], tmp_[0])) # g['l4_paa_features'][file_]['features_mt'] = dict(zip(tmp_[1], tmp_[2])) g['l1_files'][file_].update(dict(zip(['features_st_' + _ for _ in tmp_[1]], [_.mean() for _ in tmp_[0]]))) g['l1_files'][file_].update(dict(zip(['features_mt_' + _ for _ in tmp_[1]], [_.mean() for _ in tmp_[2]]))) # g['l1_files'][file_]['features_mt'] = dict(zip(tmp_[1], tmp_[2])) # layer 5: pickle.dump(g, 
open('g.pkl', 'wb')) # print('files {0}'.format(pformat(files))) # plot dictionary g as graph autoedit_graph_from_dict(g=g, plot=False) l1_files_df = pd.DataFrame.from_dict(g['l1_files']).T # sort_key = 'features_mt_energy_entropy_mean' # sort_key = 'features_mt_energy_mean' # sort_key = 'features_mt_spectral_centroid_mean' # sort_key = 'features_mt_spectral_entropy_mean' # sort_key = 'features_mt_spectral_flux_mean' # sort_key = 'features_mt_spectral_rolloff_mean' # sort_key = 'features_mt_spectral_spread_mean' # sort_key = 'features_mt_zcr_mean' sort_key = kwargs['sorter'] print('Sorting l1_files by {0}'.format(l1_files_df.sort_values(sort_key, ascending=False).path.to_string())) l1_files_df.sort_values(sort_key, ascending=False).path.to_csv('automix-assembled-{0}-{1}.{2}'.format(3, sort_key, 'csv')) if args.write: track_assemble_from_segments_sequential(files=list(l1_files_df.sort_values(sort_key, ascending=False).path), output_filename='automix-assembled-{0}-{1}.{2}'.format(3, sort_key, 'wav'), duration=None)
test.extend(drum[int(0.5 * drum.shape[0]):])
test.extend(drum)
test.extend(beattt)
test.extend(sapce)
test.extend(sapce)
test.extend(drum)
test.extend(sapce)
test.extend(beattt)
test.extend(sapce)
test.extend(drum)
test.extend(drum[int(0.5 * drum.shape[0]):])

sss = np.zeros(data.shape)
sss[:np.array(test).shape[0]] = np.array(test)
# sd.play(sss*5+data*5, fs)

'''gen drum'''
fps = librosa.samples_to_frames(fs, hop_length=hop_len, n_fft=win_len)
fps = 100
print(fps)
proc = BeatTrackingProcessor(look_aside=0.2, fps=fps)
act = RNNBeatProcessor()(all_data)
beat_times = proc(act)
song_len = librosa.samples_to_time(data.shape, sr=fs)[0]
beat = np.zeros(all_data.shape)
idx = np.where(beat_times <= song_len)[0]
new_beat_times = np.zeros(idx.shape)
new_beat_times[idx] = beat_times[idx]
beat_samples = librosa.time_to_samples(new_beat_times, sr=fs)
cand_len = len(drum)
def read_data(path_folder): """Reads sample data from files and extracts features""" os.chdir(DATA_PATH) sample_paths = [ 'Gitarre monophon/Samples/NoFX', 'Gitarre polyphon/Samples/NoFX' ] train_data = [] train_labels = [] for path in sample_paths: sample_path = os.path.join(path_folder, path) os.chdir(sample_path) for file_name in os.listdir(os.getcwd()): if file_name.endswith(".wav"): print(file_name) os.chdir(Path('../../Labels')) # Label names are: Edge, Gain, Tone label_file = file_name[:-4] + '.pickle' # label = [0.0, 0.0, 0.0] with open(label_file, 'rb') as handle: label = pickle.load(handle) print(label) if path_folder == 'DlyRandomSamples': # Fix limited delay plugin range label[0] = label[0] * 4.0 label[1] = label[1] * 10.0 os.chdir('../Samples/NoFX') train_labels.append(label) # Loading the audio y, sr = librosa.load(file_name, sr=44100) # Onset Detection y = np.insert(y, 0, np.zeros(1023)) y = librosa.util.normalize(y) onset_frame = librosa.onset.onset_detect(y=y, sr=sr, units='frames', pre_max=20000, post_max=20000, pre_avg=20000, post_avg=20000, delta=0, wait=1000) offset_frame = librosa.samples_to_frames(samples=y.shape[0]) onset_sample = librosa.core.frames_to_samples(onset_frame[0]) offset_sample = librosa.core.frames_to_samples(offset_frame) y_cut = y[onset_sample:offset_sample] v_features = [] if path_folder == 'DistRandomSamples': v_features = get_dist_feat(y_cut=y_cut, sr=sr) elif path_folder == 'TremRandomSamples': v_features = get_trem_feat(y_cut=y_cut, sr=sr) elif path_folder == 'DlyRandomSamples': v_features = get_dly_feat(y_cut=y_cut, sr=sr, y=y) else: print('Sample folder for feature extraction not found') train_data.append(np.hstack(v_features)) os.chdir(DATA_PATH) train_data = np.array(train_data) print(train_data.shape) scaler = preprocessing.StandardScaler() train_data = scaler.fit_transform(train_data) train_labels = np.array(train_labels) os.chdir(DATA_PATH) return train_data, train_labels
def modify_classical(level, param_dict, start, dur=4, sig_dur=4, segment=False): print "Classical modification begun.." # snap to segment or start marker if segment: # use pre-computed segment boundaries start_bounds = param_dict['bounds'][:, 0] nearest_bound = start_bounds[np.where(start_bounds >= start)][0] else: # simply use start marker nearest_bound = start # level 1 - tempo change -- volume envelope needs fixing!! if level == 1: offset = 0.8 # in frames, conversion to samples required tempo_curve = param_dict['tempo'] nearest_bound_in_frame = librosa.samples_to_frames([nearest_bound])[0] tempo_factor = tempo_curve[nearest_bound_in_frame] # change dur to account for tempo factor dur = int(np.ceil(dur * (tempo_factor + offset))) clip = gs.audio_buffer[nearest_bound:nearest_bound + (dur * gs.sr)] shrink = librosa.effects.time_stretch(clip, offset + tempo_factor) # normalizing CRAP. librosa.output.write_wav("clip.wav", clip, gs.sr) as_clip = pydub.AudioSegment.from_wav("clip.wav") as_amp = as_clip.dBFS librosa.output.write_wav("shrink.wav", shrink, gs.sr) shrink = match_target_amplitude( pydub.AudioSegment.from_wav("shrink.wav"), as_amp) shrink.export("new_shrink.wav", format="wav") shrink, sr = librosa.load("new_shrink.wav") compensate_factor = 1.2 remainder = np.concatenate( (shrink, gs.audio_buffer[nearest_bound + (dur * gs.sr):])) gs.audio_buffer[nearest_bound:nearest_bound + len(remainder)] = remainder gs.audio_buffer[-1 * (len(clip) - len(shrink)):] = 0 # taper_buffer_edges(nearest_bound, nearest_bound + len(shrink), 1.0) # if stretch instead of shrink # gs.audio_buffer = np.concatenate((gs.audio_buffer, np.zeros(len(stretch) - len(clip)))) # gs.audio_buffer[nearest_bound:] = np.concatenate( (window(stretch), window(gs.audio_buffer[nearest_bound + (dur*gs.sr):])) ) # level 0 - echo with delay elif level == 0: offset = int(0.75 * gs.sr) clip = gs.audio_buffer[nearest_bound:nearest_bound + (dur * gs.sr)] echo_amp_curve = param_dict['echo'] if echo_amp_curve != None: echo_amp = echo_amp_curve[nearest_bound] else: echo_amp = 0.8 delay_curve = param_dict['delay'] nearest_bound_in_frame = librosa.samples_to_frames([nearest_bound])[0] delay_in_secs = delay_curve[nearest_bound_in_frame] delay_in_samps = int(delay_in_secs * gs.sr) delay_in_samps += offset # gs.audio_buffer[nearest_bound + delay_in_samps: nearest_bound + delay_in_samps + (dur*gs.sr)] += ((0.8*echo_amp) * window(clip)) gs.audio_buffer[nearest_bound + delay_in_samps:nearest_bound + delay_in_samps + (dur * gs.sr)] += ((echo_amp) * window(clip)) # level 2 - alert sample else: # issue sampled alert alert = param_dict['alert'] if len(alert) > sig_dur * gs.sr: alert = alert[:sig_dur * gs.sr] remainder = np.concatenate( (square_window(alert), gs.audio_buffer[nearest_bound + len(alert):])) gs.audio_buffer[nearest_bound:nearest_bound + len(remainder)] = remainder taper_buffer_edges(nearest_bound, nearest_bound + len(alert), 1.0, low_end=0.0) print "Classical modification completed.." return True
def extract_features(sample, training=True): """Extracts features from sample""" X_list = [] y_list = [] print(sample.label) print(sample.file_name) snd = parselmouth.Sound(os.path.join(sample.path, sample.file_name)) # Onset Detection sample.sig = np.insert(sample.sig, 0, np.zeros(1024)) sample.sig = librosa.util.normalize(sample.sig) onset_frame = librosa.onset.onset_detect(y=sample.sig, sr=sample.fs, units='frames', backtrack=False, pre_max=20000, post_max=20000, pre_avg=20000, post_avg=20000, delta=0.0, wait=1000) offset_frame = int( round(0.75 * librosa.samples_to_frames(samples=sample.sig.shape[0]))) if offset_frame - 32 <= onset_frame[0]: # or not training offset_frame = librosa.samples_to_frames(samples=sample.sig.shape[0]) if training: print( 'Training Sample shorter than 32 Frames {}; Sample Length: {}'. format(sample.file_name, offset_frame - onset_frame[0])) if offset_frame - 32 <= onset_frame[0]: onset_frame[0] = 0 onset_sample = librosa.core.frames_to_samples(onset_frame[0]) offset_sample = librosa.core.frames_to_samples(offset_frame) # plots.waveform(sample, onset_sample) smp_cut = sample.sig[onset_sample:offset_sample] # Randomly shorten sample from 1/4 to 3/4 note length at 120 BPM # smp_cut = smp_cut[:int(np.random.uniform(0.66, 1, [1, 1])[0, 0]*len(smp_cut))] # Add noise to sample at max -48dBFS # smp_cut += np.random.uniform(-2**-9, 2**-9, [len(smp_cut)]) # Time series features from librosa mfcc = librosa.feature.mfcc(y=smp_cut, sr=sample.fs) # plots.spectrogram(smp_cut) mfcc_pos = (mfcc - np.min(mfcc)) mfcc_norm = mfcc_pos / np.max(mfcc_pos) - np.mean(mfcc_pos) mfcc_delta = librosa.feature.delta(mfcc_norm) spec_contr = librosa.feature.spectral_contrast(y=smp_cut, sr=sample.fs) # plots.spec_contrast(spec_contr) phase_res = phase_fmax(smp_cut) # plots.phase_reg_line_deviation(phase_res) zero_cr = librosa.feature.zero_crossing_rate(y=smp_cut) zero_cr_delta = librosa.feature.delta(zero_cr) rms = librosa.feature.rms(y=smp_cut) rms *= 1 / rms.max() rms_delta = librosa.feature.delta(rms) # Time series features from praat pitch = snd.to_pitch().to_array() pitch_curve, voice_prob = zip(*pitch[0][:]) pitch_curve = np.array(pitch_curve) voice_prob = np.array(voice_prob) pitch_onset = int( (onset_sample / sample.sig.shape[0]) * pitch_curve.shape[0]) pitch_curve = pitch_curve[pitch_onset:] voice_prob = voice_prob[pitch_onset:] # plots.pitch_voiced_curve(pitch_curve, voice_prob) pitch_curve = np.reshape(pitch_curve, [1, pitch_curve.shape[0]]) # plots.pitch(pitch_curve) voice_prob = np.reshape(voice_prob, [1, voice_prob.shape[0]]) harmonicity = call(snd, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0) hnr = call(harmonicity, "Get mean", 0, 0) m_features = np.concatenate((mfcc_norm, mfcc_delta, spec_contr, zero_cr, zero_cr_delta, rms, rms_delta)) v_features = functionals(m_features) # phase_res and pitch curve have different lenghts from m_features, so functionals # need to be analysed individually v_features = np.append(v_features, functionals(phase_res)) v_features = np.append(v_features, functionals(pitch_curve)) v_features = np.append(v_features, functionals(voice_prob)) v_features = np.append(v_features, hnr) X_list.append(v_features) y_list.append(sample.label) return X_list, y_list