def HPFilter(audio, cutoff, filename='holst_test.wav'):
    """High-pass filter a mono signal and write the result to a WAV file.

    Parameters
    ----------
    audio : array-like
        Mono audio samples to filter.
    cutoff : float
        High-pass cutoff frequency in Hz.
    filename : str, optional
        Output WAV path. Defaults to 'holst_test.wav', the previously
        hard-coded destination, so existing callers are unaffected.

    Returns
    -------
    array-like
        The filtered audio (also written to *filename* as a side effect).
    """
    hp_filter = es.HighPass(cutoffFrequency=cutoff)
    filtered_audio = hp_filter(audio)
    # NOTE(review): MonoWriter defaults to 44100 Hz; if the input was loaded
    # at another rate, a sampleRate argument is needed -- confirm with callers.
    writer = es.MonoWriter(filename=filename)
    writer(filtered_audio)
    return filtered_audio
def filter_loops():
    """Band-pass every not-yet-processed .wav loop found in CHOPPED_PATH.

    Loops whose band-passed version (prefixed "bpf_") already exists in
    EQ_NEW_PATH are skipped. A progress count is printed every 50 files.
    """
    source_names = os.listdir(CHOPPED_PATH)
    already_processed = os.listdir(EQ_NEW_PATH)
    # Filters for the three drum bands; only the band-pass branch is active
    # here -- the low/high-pass objects mirror the eval routine's setup.
    lp_filter = es.LowPass(cutoffFrequency=90, sampleRate=sampleRate)
    bp_filter = es.BandPass(bandwidth=100, cutoffFrequency=280, sampleRate=sampleRate)
    hp_filter = es.HighPass(cutoffFrequency=9000, sampleRate=sampleRate)
    for count, name in enumerate(source_names, start=1):
        if count % 50 == 0:
            print(str(count))
        if ".wav" not in name:
            continue
        if ("bpf_" + name) in already_processed:
            continue
        loader = es.MonoLoader(filename=CHOPPED_PATH + name, sampleRate=sampleRate)
        band_passed = bp_filter(loader())
        sf.write(EQ_NEW_PATH + "bpf_" + name, band_passed, sampleRate)
def filter_loops_eval():
    """Apply band-specific EQ to the evaluation loops of both model outputs.

    Every .wav whose filename carries an "lpf"/"bpf"/"hpf" tag is run through
    the corresponding filter and written into the directory's "eq/" subfolder.
    """
    output_dirs = [
        "icassp2021_outputs/outputs_stft_coherence/",
        "icassp2021_outputs/outputs_wavstft_coherence/"]
    # Ordered (tag, filter) pairs -- a name matching several tags is written
    # once per match, exactly as the original chained if-statements did.
    band_filters = [
        ("lpf", es.LowPass(cutoffFrequency=90, sampleRate=sampleRate)),
        ("bpf", es.BandPass(bandwidth=100, cutoffFrequency=280, sampleRate=sampleRate)),
        ("hpf", es.HighPass(cutoffFrequency=9000, sampleRate=sampleRate)),
    ]
    for directory in output_dirs:
        for name in os.listdir(directory):
            if ".wav" not in name:
                continue
            loader = es.MonoLoader(filename=directory + name, sampleRate=sampleRate)
            for tag, band_filter in band_filters:
                if tag in name:
                    filtered = band_filter(loader())
                    sf.write(directory + "eq/" + name, filtered, sampleRate)
# --- Tabla-stroke analysis: shared feature extractors and score tables ---
import essentia.standard as ess
import numpy as np
import pickle
import glob
import utilFunctions as UF
import scipy.spatial.distance as DS
import parameters as params
import csv

# Frame-level extractors, configured from the shared `parameters` module.
rms = ess.RMS()
window = ess.Windowing(type = "hamming")
spec = ess.Spectrum(size=params.Nfft)
# Zero-padding buffer appended to analysis frames before the FFT.
zz = np.zeros((params.zeropadLen,), dtype = 'float32')
# NOTE(review): `params.Nfft/2+1` is a float under Python 3; essentia's MFCC
# expects an integer inputSize -- confirm whether this module still targets
# Python 2 division semantics.
genmfcc = ess.MFCC(highFrequencyBound = 22000.0, inputSize = params.Nfft/2+1, sampleRate = params.Fs)
# High-pass used to emphasise the treble-head strokes before onset detection.
hps = ess.HighPass(cutoffFrequency = 240.0)
onsets = ess.Onsets()
# Vocabulary of tabla stroke (bol) labels used for classification.
strokeLabels = ['dha', 'dhen', 'dhi', 'dun', 'ge', 'kat', 'ke', 'na', 'ne', 're', 'tak', 'te', 'tit', 'tun']
# Taal (rhythmic cycle) definitions: matra count plus per-matra accent weights.
taals = {"teen": {"nmatra": 16, "accents": np.array([4, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1])},
         "ek": {"nmatra": 12, "accents": np.array([4, 1, 1, 2, 1, 1, 3, 1, 1, 2, 1, 1])},
         "jhap": {"nmatra": 10, "accents": np.array([4, 1, 2, 1, 1, 3, 1, 2, 1, 1])},
         "rupak": {"nmatra": 7, "accents": np.array([2, 1, 1, 3, 1, 3, 1])}
         }
# Stroke-roll patterns: each entry names four sample files ("bol"), their
# relative durations ("dur") and amplitudes ("amp").
# NOTE(review): this list is truncated in the visible chunk -- it continues
# (and is closed) beyond this view.
rolls = [{"bol": ['dha/dha_02', 'te/te_05', 're/re_04', 'dha/dha_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
         {"bol": ['te/te_02', 're/re_05', 'ke/ke_04', 'te/te_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
         {"bol": ['ge/ge_02', 'ge/ge_05', 'te/te_04', 'te/te_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
         {"bol": ['ge/ge_02', 'ge/ge_05', 'dhi/dhi_04', 'na/na_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
         {"bol": ['dha/dha_02', 'dha/dha_02', 'te/te_05', 'te/te_06'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
def estimate_key(input_audio_file, output_text_file=None, key_profile=None):
    """Estimate the overall key of an audio track, optionally with extra
    modal information.

    :type input_audio_file: str
    :type output_text_file: str
    :param key_profile: if given, overrides the module-level KEY_PROFILE and
        disables the three-profile and modal-details modes (mutates globals).
    :returns: tuple of (key string, template-matching correlation value).
    """
    if key_profile is not None:
        # Caller-supplied profile takes precedence; note this mutates module
        # state, so it affects subsequent calls too.
        global USE_THREE_PROFILES
        global WITH_MODAL_DETAILS
        global KEY_PROFILE
        KEY_PROFILE = key_profile
        USE_THREE_PROFILES = False
        WITH_MODAL_DETAILS = False
    # Essentia processing chain, parameterised by module-level settings.
    loader = estd.MonoLoader(filename=input_audio_file, sampleRate=SAMPLE_RATE)
    cut = estd.FrameCutter(frameSize=WINDOW_SIZE, hopSize=HOP_SIZE)
    window = estd.Windowing(size=WINDOW_SIZE, type=WINDOW_SHAPE)
    rfft = estd.Spectrum(size=WINDOW_SIZE)
    sw = estd.SpectralWhitening(maxFrequency=MAX_HZ, sampleRate=SAMPLE_RATE)
    speaks = estd.SpectralPeaks(magnitudeThreshold=SPECTRAL_PEAKS_THRESHOLD,
                                maxFrequency=MAX_HZ,
                                minFrequency=MIN_HZ,
                                maxPeaks=SPECTRAL_PEAKS_MAX,
                                sampleRate=SAMPLE_RATE)
    hpcp = estd.HPCP(bandPreset=HPCP_BAND_PRESET,
                     #bandSplitFrequency=HPCP_SPLIT_HZ,
                     harmonics=HPCP_HARMONICS,
                     maxFrequency=MAX_HZ,
                     minFrequency=MIN_HZ,
                     nonLinear=HPCP_NON_LINEAR,
                     normalized=HPCP_NORMALIZE,
                     referenceFrequency=HPCP_REFERENCE_HZ,
                     sampleRate=SAMPLE_RATE,
                     size=HPCP_SIZE,
                     weightType=HPCP_WEIGHT_TYPE,
                     windowSize=HPCP_WEIGHT_WINDOW_SEMITONES,
                     maxShifted=HPCP_SHIFT)
    if HIGHPASS_CUTOFF is not None:
        # Triple application of the same first-order high-pass gives a
        # steeper effective roll-off below the cutoff.
        hpf = estd.HighPass(cutoffFrequency=HIGHPASS_CUTOFF, sampleRate=SAMPLE_RATE)
        audio = hpf(hpf(hpf(loader())))
    else:
        audio = loader()
    duration = len(audio)
    n_slices = 1 + (duration // HOP_SIZE)
    chroma = np.empty([n_slices, HPCP_SIZE], dtype='float64')
    # Accumulate one HPCP (chroma) vector per frame.
    for slice_n in range(n_slices):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)  # p1 = peak frequencies, p2 = peak magnitudes
        if SPECTRAL_WHITENING:
            p2 = sw(spek, p1, p2)
        pcp = hpcp(p1, p2)
        if not DETUNING_CORRECTION or DETUNING_CORRECTION_SCOPE == 'average':
            chroma[slice_n] = pcp
        elif DETUNING_CORRECTION and DETUNING_CORRECTION_SCOPE == 'frame':
            # Per-frame detuning correction before accumulation.
            pcp = shift_pcp(pcp, HPCP_SIZE)
            chroma[slice_n] = pcp
        else:
            raise NameError("SHIFT_SCOPE must be set to 'frame' or 'average'.")
    # Collapse the frame-wise chroma matrix into a single global profile.
    chroma = np.sum(chroma, axis=0)
    if PCP_THRESHOLD is not None:
        chroma = normalize_pcp_peak(chroma)
        chroma = pcp_gate(chroma, PCP_THRESHOLD)
    if DETUNING_CORRECTION and DETUNING_CORRECTION_SCOPE == 'average':
        chroma = shift_pcp(chroma, HPCP_SIZE)
    chroma = np.roll(chroma, -3)  # Adjust to essentia's HPCP calculation starting on A...
    # Template matching against the configured key profile(s).
    if USE_THREE_PROFILES:
        estimation_1 = template_matching_3(chroma, KEY_PROFILE)
    else:
        estimation_1 = template_matching_2(chroma, KEY_PROFILE)
    key_1 = estimation_1[0] + '\t' + estimation_1[1]
    correlation_value = estimation_1[2]
    if WITH_MODAL_DETAILS:
        estimation_2 = template_matching_modal(chroma)
        key_2 = estimation_2[0] + '\t' + estimation_2[1]
        key_verbose = key_1 + '\t' + key_2
        key = key_verbose.split('\t')
        # Assign monotonic tracks to minor:
        if key[3] == 'monotonic' and key[0] == key[2]:
            key = '{0}\tminor'.format(key[0])
        else:
            key = key_1
    else:
        key = key_1
    if output_text_file is not None:
        # Persist "<key>\t<correlation>" for downstream evaluation scripts.
        textfile = open(output_text_file, 'w')
        textfile.write(key + '\t' + str(correlation_value) + '\n')
        textfile.close()
    return key, correlation_value
def key_aes(input_audio_file, output_text_file, **kwargs):
    """Estimate the overall key of an audio track, optionally with extra
    modal information.

    :type input_audio_file: str
    :type output_text_file: str
    :param kwargs: analysis settings; falls back to KEY_SETTINGS when empty.
    :returns: tuple of (key string, correlation value), or the bare string
        'Silence' when no frame produced any chroma energy.
    """
    if not kwargs:
        kwargs = KEY_SETTINGS
    # Essentia processing chain, fully parameterised through kwargs.
    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=kwargs["SAMPLE_RATE"])
    cut = estd.FrameCutter(frameSize=kwargs["WINDOW_SIZE"],
                           hopSize=kwargs["HOP_SIZE"])
    window = estd.Windowing(size=kwargs["WINDOW_SIZE"],
                            type=kwargs["WINDOW_SHAPE"])
    rfft = estd.Spectrum(size=kwargs["WINDOW_SIZE"])
    sw = estd.SpectralWhitening(maxFrequency=kwargs["MAX_HZ"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    speaks = estd.SpectralPeaks(
        magnitudeThreshold=kwargs["SPECTRAL_PEAKS_THRESHOLD"],
        maxFrequency=kwargs["MAX_HZ"],
        minFrequency=kwargs["MIN_HZ"],
        maxPeaks=kwargs["SPECTRAL_PEAKS_MAX"],
        sampleRate=kwargs["SAMPLE_RATE"])
    hpcp = estd.HPCP(bandPreset=kwargs["HPCP_BAND_PRESET"],
                     splitFrequency=kwargs["HPCP_SPLIT_HZ"],
                     harmonics=kwargs["HPCP_HARMONICS"],
                     maxFrequency=kwargs["MAX_HZ"],
                     minFrequency=kwargs["MIN_HZ"],
                     nonLinear=kwargs["HPCP_NON_LINEAR"],
                     normalized=kwargs["HPCP_NORMALIZE"],
                     referenceFrequency=kwargs["HPCP_REFERENCE_HZ"],
                     sampleRate=kwargs["SAMPLE_RATE"],
                     size=kwargs["HPCP_SIZE"],
                     weightType=kwargs["HPCP_WEIGHT_TYPE"],
                     windowSize=kwargs["HPCP_WEIGHT_WINDOW_SEMITONES"],
                     maxShifted=kwargs["HPCP_SHIFT"])
    audio = loader()
    if kwargs["HIGHPASS_CUTOFF"] is not None:
        # Triple high-pass pass = steeper effective roll-off below cutoff.
        hpf = estd.HighPass(cutoffFrequency=kwargs["HIGHPASS_CUTOFF"],
                            sampleRate=kwargs["SAMPLE_RATE"])
        audio = hpf(hpf(hpf(audio)))
    if kwargs["DURATION"] is not None:
        # Crop to [START_TIME, DURATION] expressed in seconds.
        audio = audio[(kwargs["START_TIME"] * kwargs["SAMPLE_RATE"]):(kwargs["DURATION"] * kwargs["SAMPLE_RATE"])]
    duration = len(audio)
    number_of_frames = int(duration / kwargs["HOP_SIZE"])
    chroma = []
    for bang in range(number_of_frames):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)  # p1 = peak frequencies, p2 = peak magnitudes
        if kwargs["SPECTRAL_WHITENING"]:
            p2 = sw(spek, p1, p2)
        pcp = hpcp(p1, p2)
        # Silent frames (all-zero chroma) are excluded from the average.
        if np.sum(pcp) > 0:
            if not kwargs["DETUNING_CORRECTION"] or kwargs[
                    "DETUNING_CORRECTION_SCOPE"] == 'average':
                chroma.append(pcp)
            elif kwargs["DETUNING_CORRECTION"] and kwargs[
                    "DETUNING_CORRECTION_SCOPE"] == 'frame':
                pcp = _detuning_correction(pcp, kwargs["HPCP_SIZE"])
                chroma.append(pcp)
            else:
                raise NameError(
                    "SHIFT_SCOPE musts be set to 'frame' or 'average'.")
    if not chroma:
        # NOTE(review): returns a bare string here but a (key, value) tuple
        # below -- callers must handle both shapes; confirm this is intended.
        return 'Silence'
    # Collapse frame chromas into a single normalised global profile.
    chroma = np.sum(chroma, axis=0)
    chroma = norm_peak(chroma)
    if kwargs["PCP_THRESHOLD"] is not None:
        chroma = vector_threshold(chroma, kwargs["PCP_THRESHOLD"])
    if kwargs["DETUNING_CORRECTION"] and kwargs[
            "DETUNING_CORRECTION_SCOPE"] == 'average':
        chroma = _detuning_correction(chroma, kwargs["HPCP_SIZE"])
    # Adjust to essentia's HPCP calculation starting on A (pc = 9)
    chroma = np.roll(chroma, -3 * (kwargs["HPCP_SIZE"] // 12))
    estimation_1 = estimate_key(chroma,
                                kwargs["KEY_PROFILE"],
                                kwargs["PROFILE_INTERPOLATION"],
                                conf_thres=kwargs["NOKEY_THRESHOLD"],
                                vocabulary=kwargs["KEY_VOCABULARY"])
    key_1 = estimation_1[0]
    correlation_value = estimation_1[1]
    if kwargs["WITH_MODAL_DETAILS"]:
        estimation_2 = _key7(chroma, kwargs["PROFILE_INTERPOLATION"])
        key_2 = estimation_2[0] + '\t' + estimation_2[1]
        key_verbose = key_1 + '\t' + key_2
        key = key_verbose.split('\t')
        # Assign monotonic track to minor:
        if key[3] == 'monotonic' and key[0] == key[2]:
            key = '{0}\tminor'.format(key[0])
        else:
            key = key_1
    else:
        key = key_1
    # Persist the estimated key for downstream evaluation.
    textfile = open(output_text_file, 'w')
    textfile.write(key)
    textfile.close()
    return key, correlation_value
def key_ecir(input_audio_file, output_text_file, **kwargs):
    """Estimate the key of an audio track with essentia's Key estimator.

    :type input_audio_file: str
    :type output_text_file: str
    :param kwargs: analysis settings; falls back to KEY_SETTINGS when empty.
    :returns: tuple of (key string "tonic\\tscale", (strength, firstToSecond)
        confidence values from essentia's Key algorithm).
    """
    if not kwargs:
        kwargs = KEY_SETTINGS
    # Essentia processing chain, fully parameterised through kwargs.
    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=kwargs["SAMPLE_RATE"])
    cut = estd.FrameCutter(frameSize=kwargs["WINDOW_SIZE"],
                           hopSize=kwargs["HOP_SIZE"])
    window = estd.Windowing(size=kwargs["WINDOW_SIZE"],
                            type=kwargs["WINDOW_SHAPE"])
    rfft = estd.Spectrum(size=kwargs["WINDOW_SIZE"])
    sw = estd.SpectralWhitening(maxFrequency=kwargs["MAX_HZ"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    speaks = estd.SpectralPeaks(
        magnitudeThreshold=kwargs["SPECTRAL_PEAKS_THRESHOLD"],
        maxFrequency=kwargs["MAX_HZ"],
        minFrequency=kwargs["MIN_HZ"],
        maxPeaks=kwargs["SPECTRAL_PEAKS_MAX"],
        sampleRate=kwargs["SAMPLE_RATE"])
    hpcp = estd.HPCP(bandPreset=kwargs["HPCP_BAND_PRESET"],
                     splitFrequency=kwargs["HPCP_SPLIT_HZ"],
                     harmonics=kwargs["HPCP_HARMONICS"],
                     maxFrequency=kwargs["MAX_HZ"],
                     minFrequency=kwargs["MIN_HZ"],
                     nonLinear=kwargs["HPCP_NON_LINEAR"],
                     normalized=kwargs["HPCP_NORMALIZE"],
                     referenceFrequency=kwargs["HPCP_REFERENCE_HZ"],
                     sampleRate=kwargs["SAMPLE_RATE"],
                     size=kwargs["HPCP_SIZE"],
                     weightType=kwargs["HPCP_WEIGHT_TYPE"],
                     windowSize=kwargs["HPCP_WEIGHT_WINDOW_SEMITONES"],
                     maxShifted=kwargs["HPCP_SHIFT"])
    # Key estimation is delegated to essentia's profile-based Key algorithm.
    key = estd.Key(numHarmonics=kwargs["KEY_HARMONICS"],
                   pcpSize=kwargs["HPCP_SIZE"],
                   profileType=kwargs["KEY_PROFILE"],
                   slope=kwargs["KEY_SLOPE"],
                   usePolyphony=kwargs["KEY_POLYPHONY"],
                   useThreeChords=kwargs["KEY_USE_THREE_CHORDS"])
    audio = loader()
    if kwargs["HIGHPASS_CUTOFF"] is not None:
        # Triple high-pass pass = steeper effective roll-off below cutoff.
        hpf = estd.HighPass(cutoffFrequency=kwargs["HIGHPASS_CUTOFF"],
                            sampleRate=kwargs["SAMPLE_RATE"])
        audio = hpf(hpf(hpf(audio)))
    if kwargs["DURATION"] is not None:
        # Crop to [START_TIME, DURATION] expressed in seconds.
        audio = audio[(kwargs["START_TIME"] * kwargs["SAMPLE_RATE"]):(kwargs["DURATION"] * kwargs["SAMPLE_RATE"])]
    duration = len(audio)
    number_of_frames = int(duration / kwargs["HOP_SIZE"])
    chroma = []
    for bang in range(number_of_frames):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)  # p1 = frequencies; p2 = magnitudes
        if kwargs["SPECTRAL_WHITENING"]:
            p2 = sw(spek, p1, p2)
        vector = hpcp(p1, p2)
        sum_vector = np.sum(vector)
        # Silent frames (all-zero chroma) are excluded from the average.
        if sum_vector > 0:
            if kwargs["DETUNING_CORRECTION"] == False or kwargs[
                    "DETUNING_CORRECTION_SCOPE"] == 'average':
                chroma.append(vector)
            elif kwargs["DETUNING_CORRECTION"] and kwargs[
                    "DETUNING_CORRECTION_SCOPE"] == 'frame':
                vector = _detuning_correction(vector, kwargs["HPCP_SIZE"])
                chroma.append(vector)
            else:
                print("SHIFT_SCOPE must be set to 'frame' or 'average'")
    # NOTE(review): if no frame had energy, chroma is empty and np.mean
    # yields NaN with a runtime warning -- unlike key_aes there is no
    # 'Silence' guard here; confirm whether that is intended.
    chroma = np.mean(chroma, axis=0)
    if kwargs["DETUNING_CORRECTION"] and kwargs[
            "DETUNING_CORRECTION_SCOPE"] == 'average':
        chroma = _detuning_correction(chroma, kwargs["HPCP_SIZE"])
    # Rebinds `key` from the estimator object to its (tonic, scale,
    # strength, firstToSecondRelativeStrength) result tuple.
    key = key(chroma.tolist())
    confidence = (key[2], key[3])
    key = key[0] + '\t' + key[1]
    # Persist the estimated key for downstream evaluation.
    textfile = open(output_text_file, 'w')
    textfile.write(key + '\n')
    textfile.close()
    return key, confidence
def analysis_function(loop, sampleRate=16000):
    """Analyse a drum loop: onset-activation pattern, HPCP, and per-band
    timbral features for kick / snare / hi-hat.

    :param loop: path to the loop's audio file.
    :param sampleRate: analysis sample rate (default 16000).
        NOTE(review): the time axes below hard-code 16000 and a loop length
        of 29538 samples, so other rates/lengths are not actually supported
        -- confirm before passing a different sampleRate.
    :returns: (final_pattern, hpcp, features_kick, features_snare,
        clipped features_hh).
    """
    # Band filters isolating kick (low), snare (mid) and hi-hat (high).
    lp_filter = es.LowPass(cutoffFrequency=90, sampleRate=sampleRate)
    bp_filter = es.BandPass(bandwidth=20,
                            cutoffFrequency=280,
                            sampleRate=sampleRate)
    hp_filter = es.HighPass(cutoffFrequency=9000, sampleRate=sampleRate)
    # Automatic drum transcription: activation curves per drum class.
    [_, pattern] = ADT([loop], output_act='yes', tab='no',
                       save_dir="analysis/")
    pattern = np.array(pattern)[0]
    # Resample the 160-frame activations onto the 29538-sample audio grid.
    time_audio = np.linspace(0, float(29538) / 16000, 29538)
    time_act = np.linspace(0, float(29538) / 16000, 160)
    final_pattern = np.clip(
        np.array([
            interp1d(time_act, pattern[0, :, 0])(time_audio),
            interp1d(time_act, pattern[1, :, 0])(time_audio),
            interp1d(time_act, pattern[2, :, 0])(time_audio)
        ]).T, 0.0, 1.0)
    # Normalise each activation channel to its own maximum, add batch dim.
    final_pattern = final_pattern / final_pattern.max(axis=0)
    final_pattern = np.expand_dims(final_pattern, 0)
    audio_file = es.MonoLoader(filename=loop, sampleRate=sampleRate)
    loop_basename = ntpath.basename(loop)
    # Write the three band-filtered versions so the timbral extractor can
    # read them back from disk.
    lpf_audio = lp_filter(audio_file())
    bpf_audio = bp_filter(audio_file())
    hpf_audio = hp_filter(audio_file())
    sf.write("analysis/lpf_" + loop_basename, lpf_audio, sampleRate)
    sf.write("analysis/bpf_" + loop_basename, bpf_audio, sampleRate)
    sf.write("analysis/hpf_" + loop_basename, hpf_audio, sampleRate)
    unordered_kick_features = timbral_models.timbral_extractor(
        "analysis/lpf_" + loop_basename, clip_output=True)
    unordered_snare_features = timbral_models.timbral_extractor(
        "analysis/bpf_" + loop_basename, clip_output=True)
    unordered_hh_features = timbral_models.timbral_extractor("analysis/hpf_" +
                                                             loop_basename,
                                                             clip_output=True)
    # Each feature is scaled by a per-band constant; presumably these are
    # dataset-wide maxima used to map values into [0, 1] (see the reference
    # lists in the comments below) -- TODO confirm their provenance.
    features_kick = [
        unordered_kick_features['warmth'] / 69.738235,
        unordered_kick_features['roughness'] / 71.95989,
        unordered_kick_features['brightness'] / 82.336105,
        unordered_kick_features['hardness'] / 75.53646,
        unordered_kick_features['boominess'] / 71.00043,
        unordered_kick_features['depth'] / 100.0,
        unordered_kick_features['sharpness'] / 81.7323,
    ]
    features_snare = [
        unordered_snare_features['warmth'] / 69.57681,
        unordered_snare_features['roughness'] / 67.66642,
        unordered_snare_features['brightness'] / 80.19115,
        unordered_snare_features['hardness'] / 71.689445,
        unordered_snare_features['boominess'] / 61.422714,
        unordered_snare_features['depth'] / 100.0,
        unordered_snare_features['sharpness'] / 71.406494
    ]
    features_hh = [
        unordered_hh_features['warmth'] / 32.789112,
        unordered_hh_features['roughness'] / 1.0,
        unordered_hh_features['brightness'] / 85.24432,
        unordered_hh_features['hardness'] / 67.71172,
        unordered_hh_features['boominess'] / 2.491137,
        unordered_hh_features['depth'] / 0.5797179,
        unordered_hh_features['sharpness'] / 87.83693
    ]
    # Harmonic pitch-class profile of the full (unfiltered) loop.
    hpcp = file_to_hpcp(audio_file())
    #[69.57681, 67.66642, 80.19115, 71.689445, 61.422714, 100.0, 71.406494]
    #[32.789112, 1.0, 85.24432, 67.71172, 2.491137, 0.5797179, 87.83693]
    #[69.738235, 71.95989, 82.336105, 75.53646, 71.00043, 100.0, 81.7323]
    # Only the hi-hat features are clipped here; the hh scaling constants
    # can push values above 1 (e.g. roughness / 1.0).
    return final_pattern, hpcp, features_kick, features_snare, np.clip(
        features_hh, 0, 1)