# Assumed imports for this snippet; PITCH_RANGE, TARGET_SAMPLE_RATE,
# HOP_LENGTH, and process_csv_data_jit are defined elsewhere in the module.
import csv
import numpy as np
from librosa import frames_to_time


def process_csv_data(i, file_path, cqt_length):
    print('[{}] Processing CSV data...'.format(i))
    with open(file_path) as csvfile:
        reader = csv.reader(csvfile, delimiter='\t')
        # Skip header
        next(reader)
        # First pass: count the data rows so we can preallocate
        row_count = sum(1 for row in reader)
        sets = np.zeros([row_count, 3])
        # Rewind and skip the header again for the second pass
        csvfile.seek(0)
        next(reader)
        # Use a distinct loop variable so the worker index `i` is not shadowed
        for row_idx, row in enumerate(reader):
            if len(row) != 3:
                continue
            onset, offset, midi_pitch = row
            sets[row_idx] = [float(onset), float(offset), float(midi_pitch)]
    result = np.zeros([PITCH_RANGE * 2, cqt_length])
    time_stamps = frames_to_time(range(0, cqt_length + 1),
                                 sr=TARGET_SAMPLE_RATE,
                                 hop_length=HOP_LENGTH)
    process_csv_data_jit(sets, time_stamps, cqt_length, result)
    return result
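# Minimal runnable sketch (an assumption, not part of the original module),
# mirroring the two-pass read above: count the rows first, preallocate,
# then rewind and fill.
import csv
import io
import numpy as np

tsv = io.StringIO("onset\toffset\tpitch\n0.0\t0.5\t60\n0.5\t1.0\t62\n")
reader = csv.reader(tsv, delimiter='\t')
next(reader)                        # skip header
n_rows = sum(1 for _ in reader)     # first pass: count data rows
sets = np.zeros([n_rows, 3])
tsv.seek(0)
next(reader)                        # skip header again after rewinding
for idx, row in enumerate(reader):  # second pass: fill the matrix
    sets[idx] = [float(v) for v in row]
# sets -> [[0. , 0.5, 60.], [0.5, 1. , 62.]]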
# Assumed imports for this snippet; Params, getFormData, getFeatureMatrix,
# normalize, getCentroids, reduceDimensionality, and Graph are defined
# elsewhere in the app.
import numpy as np
from flask import render_template, request
from librosa import frames_to_time


def index():
    form = Params(request.form)
    # If the user submitted the form
    if request.method == 'POST' and form.validate():
        # Get data from the form
        extractorAlgorithm, filename, centroids_number, musicPath = getFormData()
        # Extract features, then normalize
        matrix = getFeatureMatrix(extractorAlgorithm, filename)
        matrix_norm = normalize(matrix)
        # Get the centroids and calculate the distance to all frames
        centroids, distancesCentroid = getCentroids(matrix_norm, centroids_number)
        distancesCentroid = np.array(distancesCentroid)
        # Define the number of closest frames to keep per centroid
        centroidClosestPoints = 5
        # Get the IDs of the frames closest to each centroid
        closest_frames_idx = distancesCentroid.argsort(
            axis=0)[:centroidClosestPoints].T.tolist()
        centroid_frame_counts = np.bincount(np.argmin(distancesCentroid, axis=1))
        # Get the time of each frame
        tempos = frames_to_time(closest_frames_idx, sr=44100,
                                hop_length=1024, n_fft=2048)
        tempos = tempos.tolist()
        # Project the frames and centroids onto the principal components
        pca = reduceDimensionality(matrix_norm)
        matrix_norm = pca.transform(matrix_norm)
        centroids = pca.transform(centroids)
        # Generate and encode the graph to send to the view
        graph = Graph(matrix_norm, centroids, filename,
                      centroid_frame_count=centroid_frame_counts).generateGraph()
        print(tempos)
        # Send all structures to the view
        return render_template('index.html', graph=graph, form=form,
                               musicPath=musicPath, tempos=tempos)
    # Generate an empty graph
    graph = Graph().generateGraph()
    return render_template('index.html', graph=graph, form=form)
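# Toy illustration (an assumption, not part of the original app): how argsort
# over a (n_frames, n_centroids) distance matrix yields the closest frames per
# centroid, and how bincount over argmin counts the frames in each cluster.
import numpy as np

distances = np.array([[0.1, 0.9],
                      [0.4, 0.2],
                      [0.8, 0.3]])          # 3 frames, 2 centroids
closest = distances.argsort(axis=0)[:2].T   # 2 nearest frames per centroid
# -> [[0, 1], [1, 2]]: frames 0 and 1 are nearest centroid 0,
#    frames 1 and 2 are nearest centroid 1
counts = np.bincount(np.argmin(distances, axis=1))
# -> [1, 2]: one frame falls in cluster 0, two frames in cluster 1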
def __coord_time(n, sr=22050, hop_length=512, **_kwargs):
    """Get time coordinates from frames"""
    return core.frames_to_time(np.arange(n + 1), sr=sr, hop_length=hop_length)
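# Minimal numeric sketch (assumes a standard librosa install): frames_to_time
# maps a frame index to seconds via time = frame * hop_length / sr, so the
# helper above returns n + 1 bin-edge coordinates for n frames.
import numpy as np
import librosa

librosa.frames_to_time(np.arange(4), sr=22050, hop_length=512)
# -> array([0.        , 0.02321995, 0.04643991, 0.06965986])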
# Assumed imports for this standalone copy of the modified detector.
import numpy as np
from librosa import core, util
from librosa.onset import onset_backtrack, onset_strength
from librosa.util.exceptions import ParameterError


def onset_detect(
        y=None,
        sr=22050,
        onset_envelope=None,
        hop_length=512,
        backtrack=False,
        energy=None,
        units="frames",
        normalize=True,
        **kwargs
):
    """Basic onset detector.

    Locate note onset events by picking peaks in an onset strength envelope.
    Modified from `librosa.onset.onset_detect` to add a `normalize` flag.

    The `peak_pick` parameters were chosen by large-scale hyper-parameter
    optimization over the dataset provided by [1]_.

    .. [1] https://github.com/CPJKU/onset_db

    Parameters
    ----------
    y : np.ndarray [shape=(n,)]
        audio time series
    sr : number > 0 [scalar]
        sampling rate of `y`
    onset_envelope : np.ndarray [shape=(m,)] (optional)
        pre-computed onset strength envelope
    hop_length : int > 0 [scalar]
        hop length (in samples)
    units : {'frames', 'samples', 'time'}
        The units to encode detected onset events in.
        By default, 'frames' are used.
    backtrack : bool
        If `True`, detected onset events are backtracked to the nearest
        preceding minimum of `energy`.
        This is primarily useful when using onsets as slice points for
        segmentation.
    energy : np.ndarray [shape=(m,)] (optional)
        An energy function to use for backtracking detected onset events.
        If none is provided, then `onset_envelope` is used.
    normalize : bool (optional)
        If `True`, normalize the onset envelope before peak picking.
        By default this parameter is `True`.
    kwargs : additional keyword arguments
        Additional parameters for peak picking.
        See `librosa.util.peak_pick` for details.

    Returns
    -------
    onsets : np.ndarray [shape=(n_onsets,)]
        estimated positions of detected onsets, in whichever units
        are specified. By default, frame indices.

        .. note::
            If no onset strength could be detected, onset_detect returns
            an empty array.

    Raises
    ------
    ParameterError
        if neither `y` nor `onset_envelope` is provided,
        or if `units` is not one of 'frames', 'samples', or 'time'
    """
    # First, get the frame->onset strength profile if we don't already have one
    if onset_envelope is None:
        if y is None:
            raise ParameterError("y or onset_envelope must be provided")
        onset_envelope = onset_strength(y=y, sr=sr, hop_length=hop_length)

    # Shift the onset envelope up to be non-negative
    # (a common normalization step to make the threshold more consistent)
    onset_envelope -= onset_envelope.min()

    # Do we have any onsets to grab?
    if not onset_envelope.any():
        return np.array([], dtype=int)

    if normalize:
        # Normalize the onset strength function to the [0, 1] range
        onset_envelope /= onset_envelope.max()

    # These parameter settings were found by large-scale search
    kwargs.setdefault("pre_max", 0.03 * sr // hop_length)       # 30ms
    kwargs.setdefault("post_max", 0.00 * sr // hop_length + 1)  # 0ms
    kwargs.setdefault("pre_avg", 0.10 * sr // hop_length)       # 100ms
    kwargs.setdefault("post_avg", 0.10 * sr // hop_length + 1)  # 100ms
    kwargs.setdefault("wait", 0.03 * sr // hop_length)          # 30ms
    kwargs.setdefault("delta", 0.07)

    # Peak pick the onset envelope
    onsets = util.peak_pick(onset_envelope, **kwargs)

    # Optionally backtrack the events
    if backtrack:
        if energy is None:
            energy = onset_envelope
        onsets = onset_backtrack(onsets, energy)

    if units == "frames":
        pass
    elif units == "samples":
        onsets = core.frames_to_samples(onsets, hop_length=hop_length)
    elif units == "time":
        onsets = core.frames_to_time(onsets, hop_length=hop_length, sr=sr)
    else:
        raise ParameterError("Invalid unit type: {}".format(units))

    return onsets
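# Hedged usage sketch (assumes librosa >= 0.8 for the bundled 'trumpet'
# example clip): the modified onset_detect above is called in place of
# librosa.onset.onset_detect, and peak_pick overrides such as `delta`
# pass through **kwargs.
import librosa

y, sr = librosa.load(librosa.ex('trumpet'))
onset_times = onset_detect(y=y, sr=sr, hop_length=512, units='time')
stricter = onset_detect(y=y, sr=sr, units='time', delta=0.2)  # fewer, stronger peaks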
    3464.84375, 3468.75, 3472.65625, 3476.5625, 3480.46875, 3484.375,
    3488.28125, 3492.1875, 3496.09375, 3500.0, 3503.90625, 3507.8125,
    3511.71875, 3515.625, 3519.53125, 3523.4375, 3527.34375, 3531.25,
    3535.15625, 3539.0625, 3542.96875, 3546.875, 3550.78125, 3554.6875,
    3558.59375, 3562.5, 3566.40625, 3570.3125, 3574.21875, 3578.125,
    3582.03125, 3585.9375, 3589.84375, 3593.75, 3597.65625, 3601.5625,
    3605.46875, 3609.375, 3613.28125, 3617.1875, 3621.09375, 3625.0,
    3628.90625, 3632.8125, 3636.71875, 3640.625, 3644.53125, 3648.4375,
    3652.34375, 3656.25, 3660.15625, 3664.0625, 3667.96875, 3671.875,
    3675.78125, 3679.6875, 3683.59375, 3687.5, 3691.40625, 3695.3125,
    3699.21875, 3703.125, 3707.03125, 3710.9375, 3714.84375, 3718.75,
    3722.65625, 3726.5625, 3730.46875, 3734.375, 3738.28125, 3742.1875,
    3746.09375
]

frequency_strength_thr = 1e2

times_of_bins = lambda hm_steps: frames_to_time(
    range(0, hm_steps), sr=sample_rate, hop_length=fft_hop_len, n_fft=fft_bins)

zscore_scale = True
minmax_scale = False
log_scale = False

data_path = 'data'
dev_ratio = 0

## model params

timestep_size = len(frequencies_to_pick)
in_size = timestep_size
out_size = timestep_size
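# Illustrative note (an assumption about the librosa API in use): passing
# n_fft to frames_to_time, as times_of_bins does above, offsets every time
# stamp by n_fft // 2 samples, centering it on the analysis window rather
# than its left edge.
import librosa

librosa.frames_to_time([0, 1], sr=22050, hop_length=512)
# -> array([0.        , 0.02321995])
librosa.frames_to_time([0, 1], sr=22050, hop_length=512, n_fft=2048)
# -> array([0.04643991, 0.06965986])  (shifted by 1024 / 22050 seconds)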
# Assumed imports for this snippet; SR_HPSS, N_FFT_HPSS_2, N_HOP_HPSS_2,
# PREDICTIONS_DIR, process_single_audio, and predict_song are defined
# elsewhere in the module.
import numpy as np
from librosa import frames_to_time


def frame_level_predict(model_name, filename, cache=True, plot=False):
    """
    Predict Voice Activity Regions at a frame level for a given song.

    For each frame of the MFCC a voice detection probability is predicted,
    so the output has shape (n_frames, 1).

    :param model_name: name of the trained model
    :param filename: path to the music file to be predicted
    :param cache: flag to optimize heavy operations by caching to disk
    :param plot: flag to plot MFCCs and SVD in an aligned plot, if a GUI is available
    :return: (time, predictions): SVD probabilities at frame level with time markings
    """
    audio_name = filename.parts[-1]
    audio_name_prefix = '.'.join(filename.parts[:-1])
    serialized_filename = PREDICTIONS_DIR / '{}.{}.{}.csv'.format(
        audio_name_prefix, audio_name, model_name)
    mel = process_single_audio(filename, cache=cache)
    try:
        if not cache:
            raise IOError
        data = np.loadtxt(serialized_filename, delimiter=',')
        time = data[0]
        frame_level_y_pred = data[1]
        print("info: loaded serialized prediction")
    except Exception:
        # Transform raw window predictions to frame-level predictions
        y_pred = predict_song(model_name, filename, cache=cache)
        aligned_y_pred = [[] for _ in range(mel.shape[1])]
        for first_frame_idx, window_prediction in enumerate(y_pred):
            # For each prediction window...
            for offset, frame_prediction in enumerate(window_prediction):
                # ...accumulate the overlapped predictions in a list
                aligned_y_pred[first_frame_idx + offset].append(
                    frame_prediction[0])
        frame_level_y_pred = []
        for predictions in aligned_y_pred[:-1]:
            # Reduce the overlapped predictions to a single value
            frame_level_y_pred.append(min(predictions))
        time = frames_to_time(range(len(frame_level_y_pred)),
                              sr=SR_HPSS,
                              n_fft=N_FFT_HPSS_2,
                              hop_length=N_HOP_HPSS_2)
        np.savetxt(serialized_filename,
                   np.asarray((time, frame_level_y_pred)),
                   delimiter=",")
        print("info: saved serialized prediction")
    if plot:
        import matplotlib.pyplot as plt
        import librosa.display

        # Plot stacked MFCCs on the top half of a 2-row grid
        plt.figure(figsize=(14, 5))
        plt.subplot(211)
        librosa.display.specshow(mel, sr=SR_HPSS, x_axis='time', y_axis='hz',
                                 hop_length=N_HOP_HPSS_2)
        # Plot frame-level predictions on the bottom half, aligned with the MFCCs
        plt.subplot(212)
        plt.plot(time, frame_level_y_pred)
        plt.xlabel("Time")
        plt.ylabel("Singing Voice Activation")
        plt.show()
        print("info: plotted")
    print('info: done')
    return time, frame_level_y_pred
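# Toy sketch (an assumption, not from the original module) of the overlap
# reduction above: each sliding-window prediction covers several consecutive
# frames, and the overlapped per-frame scores are collapsed with min() as a
# conservative voice-activity estimate.
y_pred = [[[0.9], [0.8]],   # window starting at frame 0
          [[0.6], [0.7]]]   # window starting at frame 1
n_frames = 3
aligned = [[] for _ in range(n_frames)]
for start, window in enumerate(y_pred):
    for offset, frame_pred in enumerate(window):
        aligned[start + offset].append(frame_pred[0])
# aligned == [[0.9], [0.8, 0.6], [0.7]]
frame_level = [min(p) for p in aligned[:-1]]
# -> [0.9, 0.6]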