def _rescale_notes(intervals, pitches, frames):
    """Rescale raw note predictions so they can be written out as MIDI.

    Intervals and frame times are multiplied by ``HOP_LENGTH / SAMPLE_RATE``
    (presumably converting frame indices to seconds), and pitch indices are
    offset by ``MIN_MIDI`` and mapped through ``midi_to_hz``.

    Parameters
    ----------
    intervals:
        Intervals from extract_notes.
    pitches:
        Pitches from extract_notes.
    frames:
        Frame prediction result; forwarded unchanged to ``notes_to_frames``.

    Returns
    -------
    re_intervals:
        Rescaled intervals, reshaped to two columns.
    re_pitches:
        Rescaled pitches.
    re_times:
        Rescaled times.
    re_freqs:
        Rescaled frequencies, one array per entry returned by
        ``notes_to_frames``.
    """
    frame_step = HOP_LENGTH / SAMPLE_RATE

    def _to_hz(midi_offsets):
        # Pitch values are offsets relative to MIN_MIDI.
        return np.array(
            [midi_to_hz(MIN_MIDI + offset) for offset in midi_offsets])

    # notes_to_frames works on the unscaled pitches/intervals.
    frame_times, frame_pitches = notes_to_frames(pitches, intervals, frames)

    re_intervals = (intervals * frame_step).reshape(-1, 2)
    re_pitches = _to_hz(pitches)
    re_times = frame_times.astype(np.float64) * frame_step
    re_freqs = [_to_hz(active) for active in frame_pitches]
    return re_intervals, re_pitches, re_times, re_freqs
def eval_one_data(answer_true, answer_pred, onset_tolerance=0.05):
    """Score one predicted transcription against its reference.

    Each entry of ``answer_true`` / ``answer_pred`` is indexed as
    ``[start, end, pitch]``; ``None`` entries and entries whose duration is
    not strictly positive are dropped. Pitches are converted with
    ``util.midi_to_hz`` before calling ``transcription.evaluate``.

    Returns
    -------
    ret: numpy array of length 14
        ``[0:3]``   Precision / Recall / F-measure,
        ``[3:6]``   the ``*_no_offset`` variants,
        ``[6:9]``   the ``Onset_*`` variants,
        ``[9]``     number of reference notes,
        ``[10]``    number of estimated notes,
        ``[11:14]`` rounded ``recall * n_ref`` for the three recall values.
        When no estimated note survives filtering only ``ret[9]`` is set.
    """

    def _valid_notes(notes):
        # Keep notes that are present and have strictly positive duration.
        spans = []
        midi = []
        for note in notes:
            if note is None:
                continue
            if float(note[1]) - float(note[0]) > 0:
                spans.append([note[0], note[1]])
                midi.append(float(note[2]))
        return np.array(spans), np.array(midi)

    ref_intervals, ref_midi = _valid_notes(answer_true)
    est_intervals, est_midi = _valid_notes(answer_pred)

    ref_pitches = util.midi_to_hz(ref_midi)
    est_pitches = util.midi_to_hz(est_midi)

    ret = np.zeros(14)
    ret[9] = len(ref_pitches)
    if len(est_intervals) == 0:
        return ret

    raw_data = transcription.evaluate(ref_intervals,
                                      ref_pitches,
                                      est_intervals,
                                      est_pitches,
                                      onset_tolerance=onset_tolerance,
                                      pitch_tolerance=50)

    score_keys = (
        'Precision', 'Recall', 'F-measure',
        'Precision_no_offset', 'Recall_no_offset', 'F-measure_no_offset',
        'Onset_Precision', 'Onset_Recall', 'Onset_F-measure',
    )
    for slot, key in enumerate(score_keys):
        ret[slot] = raw_data[key]

    ret[10] = len(est_pitches)
    # Recall times the reference count, rounded to a whole number.
    for slot, recall_slot in ((11, 1), (12, 4), (13, 7)):
        ret[slot] = int(round(ret[recall_slot] * ret[9]))
    return ret
def evaluate(model, batch, device):
    """Compute loss, frame-level and note-level metrics for one batch.

    Parameters
    ----------
    model:
        Callable returning ``(frame_logit, onset_logit)`` for
        ``batch['audio']``.
    batch:
        Mapping with at least ``'audio'``, ``'frame'`` and ``'onset'``; it is
        first passed through ``allocate_batch(batch, device)``.
    device:
        Forwarded to ``allocate_batch``.

    Returns
    -------
    metrics: defaultdict(list)
        Lists of values keyed by ``metric/...`` names. The two loss entries
        get one value per batch; every other entry gets one value per item.
    """
    metrics = defaultdict(list)
    batch = allocate_batch(batch, device)

    frame_logit, onset_logit = model(batch['audio'])

    criterion = nn.BCEWithLogitsLoss()
    frame_loss = criterion(frame_logit, batch['frame'])
    # The onset loss must be computed from the onset head. The previous code
    # passed frame_logit here, so the reported onset loss was really the
    # frame logits scored against onset labels.
    onset_loss = criterion(onset_logit, batch['onset'])
    metrics['metric/loss/frame_loss'].append(frame_loss.cpu().numpy())
    metrics['metric/loss/onset_loss'].append(onset_loss.cpu().numpy())

    scaling = HOP_SIZE / SAMPLE_RATE

    for n in range(batch['audio'].shape[0]):
        frame_pred = th.sigmoid(frame_logit[n])
        onset_pred = th.sigmoid(onset_logit[n])

        pr, re, f1 = framewise_eval(frame_pred, batch['frame'][n])
        metrics['metric/frame/frame_precision'].append(pr)
        metrics['metric/frame/frame_recall'].append(re)
        metrics['metric/frame/frame_f1'].append(f1)

        pr, re, f1 = framewise_eval(onset_pred, batch['onset'][n])
        metrics['metric/frame/onset_precision'].append(pr)
        metrics['metric/frame/onset_recall'].append(re)
        metrics['metric/frame/onset_f1'].append(f1)

        p_est, i_est = extract_notes(onset_pred, frame_pred)
        p_ref, i_ref = extract_notes(batch['onset'][n], batch['frame'][n])

        # Intervals are rescaled by HOP_SIZE / SAMPLE_RATE and pitch indices
        # (offsets from MIN_MIDI) are mapped through midi_to_hz.
        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

        # offset_ratio=None: the variant reported without "-with-offsets".
        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

    return metrics
def eval_one_data(answer_true,
                  answer_pred,
                  onset_tolerance=0.05,
                  shifting=0,
                  gt_pitch_shift=0):
    """Score one predicted transcription against its reference.

    Intervals and pitches come from
    ``prepare_data(answer_true, answer_pred, time_shift=shifting)``.
    ``gt_pitch_shift`` is added to every reference pitch before both pitch
    lists are converted with ``util.midi_to_hz``.

    Returns
    -------
    ret: numpy array of length 14
        ``[0:3]``   Precision / Recall / F-measure,
        ``[3:6]``   the ``*_no_offset`` variants,
        ``[6:9]``   the ``Onset_*`` variants,
        ``[9]``     number of reference notes,
        ``[10]``    number of estimated notes,
        ``[11:14]`` rounded ``recall * n_ref`` for the three recall values.
        When there are no estimated intervals only ``ret[9]`` is set.
    """
    ref_intervals, est_intervals, ref_raw, est_raw = prepare_data(
        answer_true, answer_pred, time_shift=shifting)

    ref_pitches = util.midi_to_hz(
        np.array([float(pitch) + gt_pitch_shift for pitch in ref_raw]))
    est_pitches = util.midi_to_hz(
        np.array([float(pitch) for pitch in est_raw]))

    ret = np.zeros(14)
    ret[9] = len(ref_pitches)
    if len(est_intervals) == 0:
        return ret

    raw_data = transcription.evaluate(ref_intervals,
                                      ref_pitches,
                                      est_intervals,
                                      est_pitches,
                                      onset_tolerance=onset_tolerance,
                                      pitch_tolerance=50)

    score_keys = (
        'Precision', 'Recall', 'F-measure',
        'Precision_no_offset', 'Recall_no_offset', 'F-measure_no_offset',
        'Onset_Precision', 'Onset_Recall', 'Onset_F-measure',
    )
    for slot, key in enumerate(score_keys):
        ret[slot] = raw_data[key]

    ret[10] = len(est_pitches)
    # Recall times the reference count, rounded to a whole number.
    for slot, recall_slot in ((11, 1), (12, 4), (13, 7)):
        ret[slot] = int(round(ret[recall_slot] * ret[9]))
    return ret
def transcribe_file(model_file, flac_paths, save_path, sequence_length,
                    onset_threshold, frame_threshold):
    """Transcribe audio files and save a piano-roll image and MIDI for each.

    Parameters
    ----------
    model_file:
        Path of the serialized model passed to ``torch.load``.
    flac_paths:
        Iterable of audio file paths.
    save_path:
        Output directory; created if missing.
    sequence_length:
        Forwarded to ``load_and_process_audio``.
    onset_threshold, frame_threshold:
        Forwarded to ``extract_notes``.

    Outputs are written to
    ``<save_path>/<audio basename>.<model basename>.pred.{png,mid}``.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # NOTE(review): torch.load unpickles the file; only load trusted
    # checkpoints.
    model = torch.load(model_file, map_location=device).eval()
    summary(model)

    # Only the checkpoint's file name goes into the output name. Embedding
    # the full model_file path (e.g. "runs/x/model.pt") produced an output
    # path inside a directory that does not exist.
    model_tag = os.path.basename(model_file)
    scaling = HOP_LENGTH / SAMPLE_RATE
    os.makedirs(save_path, exist_ok=True)

    for flac_path in flac_paths:
        audio = load_and_process_audio(flac_path, sequence_length, device)
        predictions = transcribe(model, audio)

        p_est, i_est, v_est = extract_notes(predictions['onset'],
                                            predictions['frame'],
                                            predictions['velocity'],
                                            onset_threshold, frame_threshold)

        # Intervals are rescaled by HOP_LENGTH / SAMPLE_RATE; pitch indices
        # (offsets from MIN_MIDI) are mapped through midi_to_hz.
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        out_stem = os.path.basename(flac_path) + "." + model_tag

        pred_path = os.path.join(save_path, out_stem + '.pred.png')
        save_pianoroll(pred_path, predictions['onset'], predictions['frame'])

        midi_path = os.path.join(save_path, out_stem + '.pred.mid')
        save_midi(midi_path, p_est, i_est, v_est)
def midi_preparation(midifile):
    """Load a MIDI file and collect per-instrument note data.

    Returns
    -------
    midi_data: dict
        ``'onsets'``, ``'offsets'``, ``'midipitches'`` and ``'hz'`` each map
        an instrument name to the note starts, note ends, note pitches and
        ``midi_to_hz`` of those pitches. ``'downbeats'`` holds
        ``PrettyMIDI.get_downbeats()``. Instruments sharing a name overwrite
        each other, the last one winning.
    """
    midi_data = {
        'onsets': {},
        'offsets': {},
        'midipitches': {},
        'hz': {},
    }

    patt = pretty_midi.PrettyMIDI(midifile)
    midi_data['downbeats'] = patt.get_downbeats()

    for instrument in patt.instruments:
        name = instrument.name
        notes = instrument.notes
        pitches = [note.pitch for note in notes]

        midi_data['onsets'][name] = [note.start for note in notes]
        midi_data['offsets'][name] = [note.end for note in notes]
        midi_data['midipitches'][name] = pitches
        midi_data['hz'][name] = midi_to_hz(np.array(pitches))

    return midi_data
def transcribe_file(model_file, audio_paths, save_path, sequence_length,
                    onset_threshold, frame_threshold, device):
    """Transcribe audio files and save a piano-roll image and MIDI for each.

    The model is loaded once with ``torch.load`` onto ``device``. For every
    path in ``audio_paths`` the outputs are written to
    ``<save_path>/<audio basename>.pred.png`` and ``.pred.mid``. Progress is
    printed to stderr.
    """
    model = torch.load(model_file, map_location=device).eval()
    summary(model)

    total = len(audio_paths)
    for index, audio_path in enumerate(audio_paths):
        print(f'{index+1}/{total}: Processing {audio_path}...',
              file=sys.stderr)

        audio = load_and_process_audio(audio_path, sequence_length, device)
        predictions = transcribe(model, audio)
        onset_roll = predictions['onset']
        frame_roll = predictions['frame']

        p_est, i_est, v_est = extract_notes(onset_roll, frame_roll,
                                            predictions['velocity'],
                                            onset_threshold, frame_threshold)

        # Intervals are rescaled by HOP_LENGTH / SAMPLE_RATE; pitch indices
        # (offsets from MIN_MIDI) are mapped through midi_to_hz.
        frame_step = HOP_LENGTH / SAMPLE_RATE
        i_est = (i_est * frame_step).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        os.makedirs(save_path, exist_ok=True)
        base = os.path.basename(audio_path)
        save_pianoroll(os.path.join(save_path, base + '.pred.png'),
                       onset_roll, frame_roll)
        save_midi(os.path.join(save_path, base + '.pred.mid'),
                  p_est, i_est, v_est)
def transcribe(audio, model, args, save_name, max_len):
    """Transcribe one audio signal and save predictions, MIDI and audio.

    Parameters
    ----------
    audio:
        1-D sample sequence; only the first ``max_len * SAMPLE_RATE`` samples
        are used.
    model:
        Callable returning ``(frame_logit, onset_logit)`` for a batched
        tensor.
    args:
        Unused.
    save_name:
        Output path; its parent directory and stem name the three outputs
        ``<stem>.npz``, ``<stem>.midi`` and ``<stem>.wav``.
    max_len:
        Maximum length, multiplied by ``SAMPLE_RATE`` to get a sample count.
    """
    print(f'save_path: {save_name}')
    audio = audio[:max_len * SAMPLE_RATE]
    t_audio = th.tensor(audio).to(th.float).cuda()
    # Zero-pad so the total length is a multiple of HOP_SIZE.
    pad_len = math.ceil(len(t_audio) / HOP_SIZE) * HOP_SIZE - len(t_audio)
    t_audio = th.unsqueeze(F.pad(t_audio, (0, pad_len)), 0)

    # Inference only: the outputs are converted with .numpy() below, which
    # is not allowed on tensors that require grad.
    with th.no_grad():
        frame_logit, onset_logit = model(t_audio)
    onset = th.sigmoid(onset_logit[0])
    frame = th.sigmoid(frame_logit[0])

    p_est, i_est = extract_notes(onset, frame)

    # Intervals are rescaled by HOP_SIZE / SAMPLE_RATE; pitch indices
    # (offsets from MIN_MIDI) are mapped through midi_to_hz.
    scaling = HOP_SIZE / SAMPLE_RATE
    i_est = (i_est * scaling).reshape(-1, 2)
    p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

    out_dir = Path(save_name).parent
    stem = Path(save_name).stem

    # Write to the computed <stem>.npz path. This was computed before but
    # np.savez was given save_name, producing e.g. "x.wav.npz" when
    # save_name carried an extension.
    numpy_filename = out_dir / (stem + '.npz')
    np.savez(str(numpy_filename),
             onset=onset.cpu().numpy(),
             frame=frame.cpu().numpy())

    # Every note gets the fixed velocity 64.
    midi_filename = out_dir / (stem + '.midi')
    save_midi(midi_filename, p_est, i_est, [64] * len(p_est))

    # Render the saved MIDI with fluidsynth at 16 kHz.
    wav_filename = out_dir / (stem + '.wav')
    midi_file = pretty_midi.PrettyMIDI(str(midi_filename))
    synth_audio = midi_file.fluidsynth(fs=16000)
    soundfile.write(wav_filename, synth_audio, 16000)
def transcribe(audio, model, args, save_name, max_len):
    """Transcribe one audio signal and save predictions, MIDI and audio.

    Parameters
    ----------
    audio:
        1-D sample sequence; only the first ``max_len * SAMPLE_RATE`` samples
        are used.
    model:
        Callable returning ``(frame_logit, onset_logit)`` for a batched
        tensor.
    args:
        Unused.
    save_name:
        Output path; its parent directory and stem name the three outputs
        ``<stem>.npz``, ``<stem>.midi`` and ``<stem>.wav``.
    max_len:
        Maximum length, multiplied by ``SAMPLE_RATE`` to get a sample count.
    """
    print(f'save_path: {save_name}')
    audio = audio[:max_len * SAMPLE_RATE]
    t_audio = th.tensor(audio).to(th.float).cuda()

    # To make sure that the total length of audio is a multiple of hop size.
    pad_len = math.ceil(len(t_audio) / HOP_SIZE) * HOP_SIZE - len(t_audio)
    t_audio = th.unsqueeze(F.pad(t_audio, (0, pad_len)), 0)

    # Inference only: the outputs are converted with .numpy() below, which
    # is not allowed on tensors that require grad.
    with th.no_grad():
        frame_logit, onset_logit = model(t_audio)

    # Sigmoid rather than softmax: each pitch is scored independently, so
    # several notes can be active at once (polyphony).
    onset = th.sigmoid(onset_logit[0])
    frame = th.sigmoid(frame_logit[0])

    # Pitch and interval (note extent in time, not the harmonic interval).
    p_est, i_est = extract_notes(onset, frame)

    scaling = HOP_SIZE / SAMPLE_RATE
    i_est = (i_est * scaling).reshape(-1, 2)
    p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

    target_dir = Path(save_name).parent
    target_stem = Path(save_name).stem

    # Save onset and frame information into a numpy zip. The computed
    # <stem>.npz path is now actually used; np.savez was previously given
    # save_name, so the archive ended up at "<save_name>.npz" instead.
    numpy_filename = target_dir / (target_stem + '.npz')
    np.savez(str(numpy_filename),
             onset=onset.cpu().numpy(),
             frame=frame.cpu().numpy())

    # Save MIDI; every note gets the fixed velocity 64.
    midi_filename = target_dir / (target_stem + '.midi')
    save_midi(midi_filename, p_est, i_est, [64] * len(p_est))

    # Save wav using fluidsynth, rendered at 16 kHz.
    wav_filename = target_dir / (target_stem + '.wav')
    midi_file = pretty_midi.PrettyMIDI(str(midi_filename))
    synth_audio = midi_file.fluidsynth(fs=16000)
    soundfile.write(wav_filename, synth_audio, 16000)
def seq_to_mireval_form(seq):
    """Convert a note sequence into pitch and interval arrays.

    Parameters
    ----------
    seq:
        Object whose ``notes`` iterable yields items with ``start_time``,
        ``end_time`` and ``pitch`` attributes.

    Returns
    -------
    p_est: numpy array
        ``midi_to_hz(note.pitch)`` for every note.
    i_est: numpy array, shape (n_notes, 2)
        ``[start_time, end_time]`` per note. An empty sequence yields shape
        ``(0, 2)``, not ``(0,)``, so callers can always index columns.
    """
    i_est = np.asarray(
        [[note.start_time, note.end_time] for note in seq.notes])
    p_est = np.asarray([midi_to_hz(note.pitch) for note in seq.notes])
    # Force the two-column layout even when there are no notes.
    i_est = i_est.reshape(-1, 2)
    return p_est, i_est
def simple_decoding_wrapper(onset_probs, frame_probs, hop_length=512,
                            sample_rate=16000, min_midi=21):
    """Decode numpy onset/frame probabilities into pitches and intervals.

    Parameters
    ----------
    onset_probs, frame_probs: numpy arrays
        Converted with ``torch.from_numpy`` and passed to ``extract_notes``.
    hop_length, sample_rate:
        Intervals are multiplied by ``hop_length / sample_rate``. The
        defaults (512 / 16000) are the values previously hard-coded.
    min_midi:
        Added to every decoded pitch index before ``midi_to_hz``. The default
        21 is the value previously hard-coded.

    Returns
    -------
    p_ref: numpy array
        Pitches after ``midi_to_hz``.
    i_ref: numpy array, shape (n_notes, 2)
        Rescaled intervals.
    """
    th_onset_probs = torch.from_numpy(onset_probs)
    th_frame_probs = torch.from_numpy(frame_probs)

    # The third return value of extract_notes is not needed here.
    p_ref, i_ref, _ = extract_notes(th_onset_probs, th_frame_probs)

    scaling = hop_length / sample_rate
    i_ref = (i_ref * scaling).reshape(-1, 2)
    p_ref = np.array([midi_to_hz(min_midi + midi) for midi in p_ref])
    return p_ref, i_ref
def evaluate(data, model, onset_threshold=0.5, frame_threshold=0.5,
             save_path=None):
    """Run the model over ``data`` and optionally save predicted MIDI files.

    Parameters
    ----------
    data:
        Iterable of label dicts accepted by ``model.run_on_batch``; each
        needs a ``'path'`` entry when ``save_path`` is given.
    model:
        Object whose ``run_on_batch(label)`` returns ``(pred, losses)``.
    onset_threshold, frame_threshold:
        Forwarded to ``extract_notes``.
    save_path:
        When not ``None``, ``<basename of label['path']>.pred.mid`` is
        written into this directory, which is created if missing.

    Returns
    -------
    metrics: defaultdict(list)
        Always empty: this variant records no metrics.
        NOTE(review): confirm callers do not expect populated metrics.
    """
    metrics = defaultdict(list)
    scaling = HOP_LENGTH / SAMPLE_RATE

    for label in data:
        # The returned losses are not used by this variant.
        pred, _ = model.run_on_batch(label)

        # In place: drop the leading dimension and clamp negatives to zero.
        for value in pred.values():
            value.squeeze_(0).relu_()

        p_est, i_est, v_est = extract_notes(pred['onset'], pred['frame'],
                                            pred['velocity'],
                                            onset_threshold, frame_threshold)

        # The frame-level conversion (notes_to_frames and its rescaling) was
        # removed: its results were never read.
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        if save_path is not None:
            os.makedirs(save_path, exist_ok=True)
            midi_path = os.path.join(
                save_path, os.path.basename(label['path']) + '.pred.mid')
            save_midi(midi_path, p_est, i_est, v_est)

    return metrics
def extract_label(label_path, label_loader, mapping, cenf, t_unit):
    """Label extraction function of PatchCNN module.

    Builds the label representation required by the PatchCNN module. The
    output dimensions are: patch_length x 2. Column 1 holds the probability
    that the patch contains an active vocal pitch and column 0 holds its
    complement. Patches whose pitch index is slightly off the ground truth
    still get a small probability, to augment the sparse label. The
    probability depends on the distance between the patch's pitch index and
    the ground-truth index: 1 / (dist + 1).

    Parameters
    ----------
    label_path: Path
        Path to the ground-truth file.
    label_loader:
        Label loader that contains ``load_label`` function for parsing the
        ground-truth file into list :class:`Label` representation.
    mapping: 2D numpy array
        The original frequency and time index of patches (column 0 is
        compared against the pitch index, column 1 against the time index).
        See ``omnizart.feature.cfp.extract_patch_cfp`` for more details.
    cenf: list[float]
        Center frequencies in Hz of each frequency index.
    t_unit: float
        Time unit of each frame in seconds.

    Returns
    -------
    gt_roll: 2D numpy array
        Per-patch ``[1 - prob, prob]`` rows, where ``prob`` is the
        probability that the patch contains the pitch of vocal.
    """
    notes = label_loader.load_label(label_path)
    n_patches = len(mapping)
    center_freqs = np.array(cenf)
    gt_roll = np.zeros((n_patches, 2))

    for note in notes:
        onset_frame = int(round(note.start_time / t_unit))
        offset_frame = int(round(note.end_time / t_unit))

        # First patch closest to the onset, and one past the last patch
        # closest to the offset (searched from the end).
        first = np.argmin(np.abs(mapping[:, 1] - onset_frame))
        reversed_hit = np.argmin(np.abs(mapping[::-1, 1] - offset_frame))
        stop = n_patches - reversed_hit

        target_idx = np.argmin(np.abs(center_freqs - midi_to_hz(note.note)))

        # Later notes overwrite earlier ones on overlapping patches.
        distance = np.abs(mapping[first:stop, 0] - target_idx)
        gt_roll[first:stop, 1] = 1 / (1 + distance)

    gt_roll[:, 0] = 1 - gt_roll[:, 1]
    return gt_roll
def transcribe_file(checkpoint_dir, flac_paths, save_path, sequence_length,
                    onset_threshold, frame_threshold):
    """Transcribe every file matching a glob pattern with the latest checkpoint.

    A fresh ``OnsetsAndFrames`` model and Adam optimizer are built, then the
    newest checkpoint found in ``checkpoint_dir`` (if any) is restored into
    them. For each path matched by ``glob.glob(flac_paths)`` the outputs are
    written to ``<save_path>/<basename>.pred.mid`` and ``.pred.png``.
    """
    # The default model and optimizer only provide the objects that the
    # checkpoint restores into.
    net = OnsetsAndFrames(MAX_MIDI - MIN_MIDI + 1)
    adam = keras.optimizers.Adam(.0001)
    ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=adam, net=net)
    manager = tf.train.CheckpointManager(ckpt, checkpoint_dir, max_to_keep=3)

    latest = manager.latest_checkpoint
    ckpt.restore(latest).expect_partial()
    if latest:
        tf.print("Restored from {}".format(latest))

    for flac_path in glob.glob(flac_paths):
        print(f'Processing FLAC: {flac_path}', file=sys.stderr)

        audio = tf.expand_dims(
            load_and_process_audio(flac_path, sequence_length), 0)
        predictions = transcribe(net, audio)
        onset_roll = predictions['onset']
        frame_roll = predictions['frame']

        p_est, i_est, v_est = extract_notes(onset_roll, frame_roll,
                                            predictions['velocity'],
                                            onset_threshold, frame_threshold)

        # Intervals are rescaled by HOP_LENGTH / SAMPLE_RATE; pitch indices
        # (offsets from MIN_MIDI) are mapped through midi_to_hz.
        frame_step = HOP_LENGTH / SAMPLE_RATE
        i_est = (i_est * frame_step).reshape((-1, 2))
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        os.makedirs(save_path, exist_ok=True)
        base = os.path.basename(flac_path)
        save_midi(os.path.join(save_path, base + '.pred.mid'),
                  p_est, i_est, v_est)
        save_pianoroll(os.path.join(save_path, base + '.pred.png'),
                       onset_roll, frame_roll)
def note_transcribe_per_min(input_file_name, output_file_name, model_path,
                            ensemble, hop_size, sr, onset_threshold,
                            frame_threshold, export_midi, device):
    """
    Transcribe piano notes with deep learning model and write in lines with
    (onset offset F0) form or MIDI file.

    The whole recording is processed in one pass with the single model at
    ``model_path``. ``ensemble`` is accepted but not used by this variant.

    Parameters
    ----------
    input_file_name:
        Audio file loaded with ``librosa.load`` at rate ``sr``.
    output_file_name:
        TSV path when ``export_midi`` is false. With ``export_midi`` the MIDI
        goes to ``<name>_0sec<ext>`` derived from this path.
    model_path:
        Passed to ``load_transcriber``.
    hop_size, sr:
        Forwarded to ``_write_tsv``; ``sr`` is also the loading sample rate.
    onset_threshold, frame_threshold:
        Forwarded to ``extract_notes``.
    export_midi:
        Selects MIDI output instead of TSV.
    device:
        Torch device for the tensors and the model.
    """
    audio, _ = librosa.load(input_file_name, sr=sr)

    # Protection code for audio > 1
    peak = np.max(np.abs(audio))
    if peak > 1:
        audio = audio / peak

    audio_tensor = torch.from_numpy(audio).to(device).unsqueeze(0)
    melspec = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH,
                             mel_fmin=MEL_FMIN, mel_fmax=MEL_FMAX).to(device)
    mel = (melspec(audio_tensor.reshape(
        -1, audio_tensor.shape[-1])[:, :-1]).transpose(-1, -2))

    model = load_transcriber(model_path).to(device).eval()
    # The model output was previously bound to an unused name while
    # onset_pred / frame_pred / vel_pred were read without being defined,
    # which raised NameError.
    # NOTE(review): this assumes the model returns five tensors ordered
    # (onset, offset, <unused>, frame, velocity); confirm against the
    # transcriber's forward().
    onset_pred, _, _, frame_pred, vel_pred = model(mel)
    onset_pred = onset_pred.squeeze()
    frame_pred = frame_pred.squeeze()
    vel_pred = vel_pred.squeeze()

    print('onset_pred:{}, frame_pred:{}'.format(onset_pred, frame_pred))
    p_est, i_est, v_est = extract_notes(onset_pred, frame_pred, vel_pred,
                                        onset_threshold, frame_threshold)

    if export_midi:
        scaling = HOP_LENGTH / SAMPLE_RATE
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        # ``i`` was used below without being defined. The recording is
        # handled as a single segment here, so its index is 0.
        # NOTE(review): confirm the "_0sec" suffix is the wanted file name.
        i = 0
        filename, file_extension = os.path.splitext(output_file_name)
        onesec_output_file_name = filename + '_' + str(
            i) + 'sec' + file_extension
        print(
            'onesec_output_file_name:{}, scaling:{}\n len(i_est):{}, i_est:{}, len(p_est):{}, p_est:{}'
            .format(onesec_output_file_name, scaling, len(i_est), i_est,
                    len(p_est), p_est))
        save_midi(onesec_output_file_name, p_est, i_est, v_est)
    else:
        _write_tsv(p_est, i_est, output_file_name, hop_size, sr)
def evaluate(data, model, onset_threshold=0.5, frame_threshold=0.5,
             save_path=None):
    """Evaluate ``model`` on every item of ``data`` and collect metrics.

    For each label dict the model's losses are recorded under their own keys,
    notes are extracted from both label and prediction, and note-level
    (with/without offsets, with/without velocity) and frame-level scores are
    appended to the returned ``defaultdict(list)``.

    When ``save_path`` is given, the label and prediction piano rolls
    (``.label.png`` / ``.pred.png``) and the predicted MIDI (``.pred.mid``)
    are written there, named after ``label['path']``.
    """
    metrics = defaultdict(list)
    frame_step = HOP_LENGTH / SAMPLE_RATE

    def _to_hz(midi_offsets):
        # Pitch values are offsets relative to MIN_MIDI.
        return np.array(
            [midi_to_hz(MIN_MIDI + offset) for offset in midi_offsets])

    def _record(group, scores):
        precision, recall, f1, overlap = scores
        metrics['metric/' + group + '/precision'].append(precision)
        metrics['metric/' + group + '/recall'].append(recall)
        metrics['metric/' + group + '/f1'].append(f1)
        metrics['metric/' + group + '/overlap'].append(overlap)

    for label in data:
        pred, losses = model.run_on_batch(label)

        for loss_name, loss_value in losses.items():
            metrics[loss_name].append(loss_value.item())

        # In place: drop the leading dimension and clamp negatives to zero.
        for tensor in pred.values():
            tensor.squeeze_(0).relu_()

        p_ref, i_ref, v_ref = extract_notes(label['onset'], label['frame'],
                                            label['velocity'])
        p_est, i_est, v_est = extract_notes(pred['onset'], pred['frame'],
                                            pred['velocity'],
                                            onset_threshold, frame_threshold)

        # Frame-level views are built from the unscaled notes.
        t_ref, f_ref = notes_to_frames(p_ref, i_ref, label['frame'].shape)
        t_est, f_est = notes_to_frames(p_est, i_est, pred['frame'].shape)

        i_ref = (i_ref * frame_step).reshape(-1, 2)
        p_ref = _to_hz(p_ref)
        i_est = (i_est * frame_step).reshape(-1, 2)
        p_est = _to_hz(p_est)

        t_ref = t_ref.astype(np.float64) * frame_step
        f_ref = [_to_hz(freqs) for freqs in f_ref]
        t_est = t_est.astype(np.float64) * frame_step
        f_est = [_to_hz(freqs) for freqs in f_est]

        _record('note',
                evaluate_notes(i_ref, p_ref, i_est, p_est,
                               offset_ratio=None))
        _record('note-with-offsets',
                evaluate_notes(i_ref, p_ref, i_est, p_est))
        _record('note-with-velocity',
                evaluate_notes_with_velocity(i_ref, p_ref, v_ref,
                                             i_est, p_est, v_est,
                                             offset_ratio=None,
                                             velocity_tolerance=0.1))
        _record('note-with-offsets-and-velocity',
                evaluate_notes_with_velocity(i_ref, p_ref, v_ref,
                                             i_est, p_est, v_est,
                                             velocity_tolerance=0.1))

        frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
        # eps keeps the harmonic mean defined when a score is zero.
        shifted = [frame_metrics['Precision'] + eps,
                   frame_metrics['Recall'] + eps]
        metrics['metric/frame/f1'].append(hmean(shifted) - eps)

        for name, score in frame_metrics.items():
            metrics['metric/frame/' + name.lower().replace(' ', '_')].append(
                score)

        if save_path is not None:
            os.makedirs(save_path, exist_ok=True)
            base = os.path.basename(label['path'])

            save_pianoroll(os.path.join(save_path, base + '.label.png'),
                           label['onset'], label['frame'])
            save_pianoroll(os.path.join(save_path, base + '.pred.png'),
                           pred['onset'], pred['frame'])
            save_midi(os.path.join(save_path, base + '.pred.mid'),
                      p_est, i_est, v_est)

    return metrics
def note_transcribe_per_min(input_file_name, output_file_name, model_path,
                            ensemble, hop_size, sr, onset_threshold,
                            frame_threshold, export_midi, device):
    """
    Transcribe piano notes with deep learning model and write in lines with
    (onset offset F0) form or MIDI file.

    The recording is processed in overlapping windows: window ``i`` covers
    samples ``[i * sr, (i + 19) * sr)`` and there is one window per whole
    second of audio.

    Parameters
    ----------
    input_file_name:
        Audio file loaded with ``librosa.load`` at rate ``sr``.
    output_file_name:
        With ``export_midi`` each window is saved as ``<name>_<i>sec<ext>``.
        Otherwise every window is passed to ``_write_tsv`` with this same
        path.
        NOTE(review): confirm whether overwriting one TSV path is intended.
    model_path:
        A single model file when ``ensemble`` is ``None``; otherwise a
        directory whose ``*.trm`` files form the ensemble.
    ensemble:
        ``None``, ``'mean'`` (average member outputs) or ``'vote'`` (sum the
        thresholded onset/frame outputs and the raw velocities). Any other
        value leaves the predictions at zero.
    hop_size, sr:
        Forwarded to ``_write_tsv``; ``sr`` is also the loading sample rate.
    onset_threshold, frame_threshold:
        Used for voting and forwarded to ``extract_notes``.
    export_midi:
        Selects MIDI output instead of TSV.
    device:
        Torch device for the tensors and the models.
    """
    audio, _ = librosa.load(input_file_name, sr=sr)

    # Protection code for audio > 1
    peak = np.max(np.abs(audio))
    if peak > 1:
        audio = audio / peak

    dur_music_sec = math.floor(len(audio) / sr)

    # Loop-invariant setup, hoisted out of the per-window loop.
    melspec = MelSpectrogram(N_MELS, SAMPLE_RATE, WINDOW_LENGTH, HOP_LENGTH,
                             mel_fmin=MEL_FMIN, mel_fmax=MEL_FMAX).to(device)
    n_pitches = MAX_MIDI - MIN_MIDI + 1

    if ensemble is not None:
        # Resolve the member list once. The inner loop used to rebind
        # ``model_path`` itself, so from the second window on the glob ran
        # on a model file and found no members.
        member_paths = list(Path(model_path).glob('*.trm'))
        single_model = None
    else:
        member_paths = []
        single_model = load_transcriber(model_path).to(device).eval()

    for i in range(round(dur_music_sec)):
        print('i:{}, dur_music_sec:{}'.format(i, dur_music_sec))
        print('audio.size():{}, type(audio):{}, sr:{}'.format(
            audio.shape, type(audio), sr))

        # Despite the name this slice is up to 19 seconds long.
        onesec_audio = audio[i * sr:(i + 19) * sr]
        print('type(onesec_audio):{}, len(onesec_audio):{}'.format(
            type(onesec_audio), onesec_audio.shape))

        audio_tensor = torch.from_numpy(onesec_audio).to(device).unsqueeze(0)
        mel = (melspec(audio_tensor
                       .reshape(-1, audio_tensor.shape[-1])[:, :-1])
               .transpose(-1, -2))

        if ensemble is not None:
            print('mel.shape:{}'.format(mel.shape))
            onset_pred = torch.zeros(mel.shape[0], mel.shape[1],
                                     n_pitches).to(device)
            frame_pred = torch.zeros(mel.shape[0], mel.shape[1],
                                     n_pitches).to(device)
            vel_pred = torch.zeros(mel.shape[0], mel.shape[1],
                                   n_pitches).to(device)

            # Members are loaded one at a time and released right away, so
            # only one model is held in memory at once.
            for member_path in member_paths:
                model = load_transcriber(member_path).to(device).eval()
                onset_pred_part, _, _, frame_pred_part, vel_pred_part = \
                    model(mel)
                if ensemble == 'mean':
                    onset_pred += onset_pred_part / len(member_paths)
                    frame_pred += frame_pred_part / len(member_paths)
                    vel_pred += vel_pred_part / len(member_paths)
                elif ensemble == 'vote':
                    # extract_notes does not use offset.
                    onset_pred += ((onset_pred_part > onset_threshold)
                                   .type(torch.float))
                    frame_pred += ((frame_pred_part > frame_threshold)
                                   .type(torch.float))
                    vel_pred += vel_pred_part
                del model
        else:
            onset_pred, _, _, frame_pred, vel_pred = single_model(mel)

        # Drop singleton dimensions before note extraction.
        onset_pred = onset_pred.squeeze()
        frame_pred = frame_pred.squeeze()
        vel_pred = vel_pred.squeeze()

        print('onset_pred:{}, frame_pred:{}'.format(onset_pred, frame_pred))
        p_est, i_est, v_est = extract_notes(onset_pred, frame_pred, vel_pred,
                                            onset_threshold, frame_threshold)

        if export_midi:
            scaling = HOP_LENGTH / SAMPLE_RATE
            i_est = (i_est * scaling).reshape(-1, 2)
            p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

            filename, file_extension = os.path.splitext(output_file_name)
            onesec_output_file_name = (filename + '_' + str(i) + 'sec'
                                       + file_extension)
            print('onesec_output_file_name:{}, scaling:{}\n len(i_est):{}, i_est:{}, len(p_est):{}, p_est:{}'.format(onesec_output_file_name, scaling, len(i_est), i_est, len(p_est), p_est))
            save_midi(onesec_output_file_name, p_est, i_est, v_est)
        else:
            _write_tsv(p_est, i_est, output_file_name, hop_size, sr)
def evaluate(metrics, model, inputs, targets, onset_threshold=0.5,
             frame_threshold=0.5, save_path=None):
    """Evaluate one example and append its scores to ``metrics``.

    ``inputs`` is converted with ``audio_to_mel`` and fed to ``model`` with
    ``training=False``. ``targets`` unpacks into onset, offset, frame and
    velocity labels plus the example path. Note-level (with/without offsets,
    with/without velocity) and frame-level scores are appended to the
    ``metrics`` mapping, which is also returned.

    When ``save_path`` is given, the label and prediction piano rolls and
    the predicted MIDI are written there, named after the example path.

    This can't be decorated with ``tf.function`` because the note-extraction
    helpers are not pure TF code.
    """
    mel = audio_to_mel(inputs)
    onset_pred, offset_pred, frame_pred, velocity_pred = model(
        mel, training=False)
    (onset_labels, offset_labels, frame_labels, velocity_labels,
     path_labels) = targets
    # todo: add loss metrics

    # We're working with batch size of 1, so remove the first index for
    # everything.
    onset_pred = tf.squeeze(onset_pred)
    offset_pred = tf.squeeze(offset_pred)
    frame_pred = tf.squeeze(frame_pred)
    velocity_pred = tf.squeeze(velocity_pred)
    onset_labels = tf.squeeze(onset_labels)
    offset_labels = tf.squeeze(offset_labels)
    frame_labels = tf.squeeze(frame_labels)
    velocity_labels = tf.squeeze(velocity_labels)
    path_labels = tf.squeeze(path_labels).numpy().decode("utf-8")

    frame_step = HOP_LENGTH / SAMPLE_RATE

    def _to_hz(midi_offsets):
        # Pitch values are offsets relative to MIN_MIDI.
        return np.array(
            [midi_to_hz(MIN_MIDI + offset) for offset in midi_offsets])

    def _record(group, scores):
        precision, recall, f1, overlap = scores
        metrics['metric/' + group + '/precision'].append(precision)
        metrics['metric/' + group + '/recall'].append(recall)
        metrics['metric/' + group + '/f1'].append(f1)
        metrics['metric/' + group + '/overlap'].append(overlap)

    p_ref, i_ref, v_ref = extract_notes(onset_labels, frame_labels,
                                        velocity_labels)
    p_est, i_est, v_est = extract_notes(onset_pred, frame_pred,
                                        velocity_pred, onset_threshold,
                                        frame_threshold)

    # Frame-level views are built from the unscaled notes.
    t_ref, f_ref = notes_to_frames(p_ref, i_ref, frame_labels.shape)
    t_est, f_est = notes_to_frames(p_est, i_est, frame_pred.shape)

    i_ref = (i_ref * frame_step).reshape(-1, 2)
    p_ref = _to_hz(p_ref)
    i_est = (i_est * frame_step).reshape(-1, 2)
    p_est = _to_hz(p_est)

    t_ref = t_ref.astype(np.float64) * frame_step
    f_ref = [_to_hz(freqs) for freqs in f_ref]
    t_est = t_est.astype(np.float64) * frame_step
    f_est = [_to_hz(freqs) for freqs in f_est]

    _record('note',
            evaluate_notes(i_ref, p_ref, i_est, p_est, offset_ratio=None))
    _record('note-with-offsets',
            evaluate_notes(i_ref, p_ref, i_est, p_est))
    _record('note-with-velocity',
            evaluate_notes_with_velocity(i_ref, p_ref, v_ref,
                                         i_est, p_est, v_est,
                                         offset_ratio=None,
                                         velocity_tolerance=0.1))
    _record('note-with-offsets-and-velocity',
            evaluate_notes_with_velocity(i_ref, p_ref, v_ref,
                                         i_est, p_est, v_est,
                                         velocity_tolerance=0.1))

    frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
    # eps keeps the harmonic mean defined when a score is zero.
    shifted = [frame_metrics['Precision'] + eps,
               frame_metrics['Recall'] + eps]
    metrics['metric/frame/f1'].append(hmean(shifted) - eps)

    for name, score in frame_metrics.items():
        metrics['metric/frame/' + name.lower().replace(' ', '_')].append(
            score)

    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        base = os.path.basename(path_labels)

        save_pianoroll(os.path.join(save_path, base + '.label.png'),
                       onset_labels, frame_labels)
        save_pianoroll(os.path.join(save_path, base + '.pred.png'),
                       onset_pred, frame_pred)
        save_midi(os.path.join(save_path, base + '.pred.mid'),
                  p_est, i_est, v_est)

    return metrics
def evaluate(model, batch, device, save=False, save_path=None):
    """Compute loss, frame-level and note-level metrics for one batch.

    Parameters
    ----------
    model:
        Callable returning ``(frame_logit, onset_logit)`` for
        ``batch['audio']``.
    batch:
        Mapping with at least ``'audio'``, ``'frame'`` and ``'onset'``, plus
        ``'path'`` when ``save`` is true; it is first passed through
        ``allocate_batch(batch, device)``.
    device:
        Forwarded to ``allocate_batch``.
    save:
        When true, a ``.midi`` file and a fluidsynth-rendered ``.wav`` are
        written for every item.
    save_path:
        Output directory used when ``save`` is true.

    Returns
    -------
    metrics: defaultdict(list)
        Lists of values keyed by ``metric/...`` names. The two loss entries
        get one value per batch; every other entry gets one value per item.
    """
    metrics = defaultdict(list)
    batch = allocate_batch(batch, device)

    frame_logit, onset_logit = model(batch['audio'])

    criterion = nn.BCEWithLogitsLoss()
    frame_loss = criterion(frame_logit, batch['frame'])
    # The onset loss must be computed from the onset head. The previous code
    # passed frame_logit here, so the reported onset loss was really the
    # frame logits scored against onset labels.
    onset_loss = criterion(onset_logit, batch['onset'])
    metrics['metric/loss/frame_loss'].append(frame_loss.cpu().numpy())
    metrics['metric/loss/onset_loss'].append(onset_loss.cpu().numpy())

    scaling = HOP_SIZE / SAMPLE_RATE

    for n in range(batch['audio'].shape[0]):
        frame_pred = th.sigmoid(frame_logit[n])
        onset_pred = th.sigmoid(onset_logit[n])

        pr, re, f1 = framewise_eval(frame_pred, batch['frame'][n])
        metrics['metric/frame/frame_precision'].append(pr)
        metrics['metric/frame/frame_recall'].append(re)
        metrics['metric/frame/frame_f1'].append(f1)

        pr, re, f1 = framewise_eval(onset_pred, batch['onset'][n])
        metrics['metric/frame/onset_precision'].append(pr)
        metrics['metric/frame/onset_recall'].append(re)
        metrics['metric/frame/onset_f1'].append(f1)

        p_est, i_est = extract_notes(onset_pred, frame_pred)
        p_ref, i_ref = extract_notes(batch['onset'][n], batch['frame'][n])

        # Intervals are rescaled by HOP_SIZE / SAMPLE_RATE and pitch indices
        # (offsets from MIN_MIDI) are mapped through midi_to_hz.
        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + pitch) for pitch in p_est])

        # offset_ratio=None: the variant reported without "-with-offsets".
        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

        if save:
            item_stem = Path(batch['path'][n]).stem
            if len(p_est) == 0:
                # NOTE(review): the message says "skip" but the files below
                # are still written for an empty transcription; confirm
                # whether saving should really be skipped.
                print(
                    f'no onset detected. skip: {Path(batch["path"][n]).stem}')

            # Every note gets the fixed velocity 64.
            midi_filename = Path(save_path) / (item_stem + '.midi')
            save_midi(midi_filename, p_est, i_est, [64] * len(p_est))

            # Render the saved MIDI with fluidsynth at 16 kHz.
            wav_filename = Path(save_path) / (item_stem + '.wav')
            midi_file = pretty_midi.PrettyMIDI(str(midi_filename))
            synth_audio = midi_file.fluidsynth(fs=16000)
            soundfile.write(wav_filename, synth_audio, 16000)

    return metrics
def evaluate_onf(batch,
                 model,
                 device,
                 save_path=None,
                 criterion=None,
                 sampling_method='argmax',
                 rep_type='base',
                 plot_example=False,
                 recursive=True,
                 detail_eval=False,
                 delay=1):
    """Evaluate an onsets-and-frames style model on one batch.

    Predictions come from ``models.run_on_batch_onf(model, batch, device[0])``
    and are binarized at 0.5. References are derived from
    ``label['shifted_label'][delay:]``. Of the keyword arguments only
    ``delay`` is read; the others are accepted but unused here.

    Returns
    -------
    (metrics, None)
        ``metrics`` is a ``defaultdict(list)`` holding the batch loss and
        per-item note-level and frame-level scores. The second element is
        always ``None``.
    """
    metrics = defaultdict(list)
    frame_step = HOP_LENGTH / SAMPLE_RATE

    def _to_hz(midi_offsets):
        # Pitch values are offsets relative to MIN_MIDI.
        return np.array(
            [midi_to_hz(MIN_MIDI + offset) for offset in midi_offsets])

    def _record(group, scores):
        precision, recall, f1, overlap = scores
        metrics['metric/' + group + '/precision'].append(precision)
        metrics['metric/' + group + '/recall'].append(recall)
        metrics['metric/' + group + '/f1'].append(f1)
        metrics['metric/' + group + '/overlap'].append(overlap)

    with th.no_grad():
        preds, losses = models.run_on_batch_onf(model, batch, device[0])
    losses = losses.cpu().numpy()
    metrics['loss'].extend([losses])

    for n in range(preds['frame'].shape[0]):
        label = {key: batch[key][n] for key in batch}

        onset_ref, offset_ref, frame_ref = \
            representation.base2onsets_and_frames(
                label['shifted_label'][delay:])

        # Binarize the predictions. The offset roll is computed as before
        # but is not used below.
        onsets = preds['onset'][n] > 0.5
        offsets = preds['offset'][n] > 0.5
        frames = preds['frame'][n] > 0.5

        p_ref, i_ref, v_ref = extract_notes(onset_ref, frame_ref)
        p_est, i_est, v_est = extract_notes(onsets, frames)

        # Frame-level views are built from the unscaled notes.
        t_ref, f_ref = notes_to_frames(p_ref, i_ref, frame_ref.shape)
        t_est, f_est = notes_to_frames(p_est, i_est, frames.shape)

        i_ref = (i_ref * frame_step).reshape(-1, 2)
        p_ref = _to_hz(p_ref)
        i_est = (i_est * frame_step).reshape(-1, 2)
        p_est = _to_hz(p_est)

        t_ref = t_ref.astype(np.float64) * frame_step
        f_ref = [_to_hz(freqs) for freqs in f_ref]
        t_est = t_est.astype(np.float64) * frame_step
        f_est = [_to_hz(freqs) for freqs in f_est]

        _record('note',
                evaluate_notes(i_ref, p_ref, i_est, p_est,
                               offset_ratio=None))
        _record('note-with-offsets',
                evaluate_notes(i_ref, p_ref, i_est, p_est))

        frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
        # eps keeps the harmonic mean defined when a score is zero.
        shifted = [frame_metrics['Precision'] + eps,
                   frame_metrics['Recall'] + eps]
        metrics['metric/frame/f1'].append(hmean(shifted) - eps)

        for name, score in frame_metrics.items():
            metrics['metric/frame/' + name.lower().replace(' ', '_')].append(
                score)

    return metrics, None
def evaluate(batch,
             model,
             device,
             save_path=None,
             criterion=None,
             sampling_method='argmax',
             rep_type='base',
             plot_example=False,
             recursive=True,
             detail_eval=False,
             delay=1):
    """
    Run ``models.run_on_batch`` on one batch and collect note/frame metrics.

    Parameters
    ----------
    batch: Mapping whose values are indexable per batch item; reads
        ``'shifted_label'`` and, when plotting, ``'path'``.
    model: Model handed to ``models.run_on_batch``.
    device: Sequence of devices; only ``device[0]`` is used.
    save_path: Directory for the plot/``.npy`` dumps; required when
        ``plot_example`` is true.
    criterion, rep_type, recursive: Forwarded to ``models.run_on_batch``
        (``rep_type`` is also passed to
        ``representation.convert2onsets_and_frames``).
    sampling_method: ``'gt'`` selects ``gt_ratio=1.0``; any other value
        selects ``gt_ratio=0.0``.
    plot_example: When true, saves label/prediction arrays and a figure for
        every batch item.
    detail_eval: When true, ``calculate_acc_conf`` is evaluated per item.
    delay: Number of leading label steps dropped (``shifted_label[delay:]``).

    Returns
    -------
    metrics: ``defaultdict(list)`` of losses and ``'metric/...'`` values.
    acc_conf: One entry per batch item -- the ``calculate_acc_conf`` result,
        or ``None`` when ``detail_eval`` is false.
    """
    # TODO: input: prediction & label. output: metric
    results = defaultdict(list)
    acc_conf = []

    def to_hz(midi_values):
        # Model pitch indices are offset by MIN_MIDI before conversion.
        return np.array(
            [midi_to_hz(MIN_MIDI + value) for value in midi_values])

    def record(prefix, precision, recall, f1, overlap):
        results[prefix + '/precision'].append(precision)
        results[prefix + '/recall'].append(recall)
        results[prefix + '/f1'].append(f1)
        results[prefix + '/overlap'].append(overlap)

    # Only 'gt' maps to a ratio of 1.0; everything else uses 0.0.
    gt_ratio = 1.0 if sampling_method == 'gt' else 0.0

    with th.no_grad():
        preds, losses = models.run_on_batch(model,
                                            batch,
                                            device[0],
                                            sampling_method=sampling_method,
                                            gt_ratio=gt_ratio,
                                            criterion=criterion,
                                            rep_type=rep_type,
                                            recursive=recursive,
                                            delay=delay)
        losses = losses.cpu().numpy()
    results['loss'].extend(list(np.atleast_1d(losses)))

    for idx in range(preds.shape[0]):
        pred = preds[idx]
        argmax_pred = pred.argmax(dim=0)
        sample = {name: batch[name][idx] for name in batch}

        detail = None
        if detail_eval:
            detail = calculate_acc_conf(
                pred.cpu().numpy().transpose((1, 2, 0)),
                sample['shifted_label'][delay:].cpu().numpy())
        acc_conf.append(detail)

        onset_ref, _offset_ref, frame_ref = (
            representation.base2onsets_and_frames(
                sample['shifted_label'][delay:]))
        onset_est, _offset_est, frame_est = (
            representation.convert2onsets_and_frames(argmax_pred, rep_type))

        ref_pitches, ref_intervals, _ref_vel = extract_notes(
            onset_ref, frame_ref)
        est_pitches, est_intervals, _est_vel = extract_notes(
            onset_est, frame_est)

        ref_times, ref_freqs = notes_to_frames(ref_pitches, ref_intervals,
                                               frame_ref.shape)
        est_times, est_freqs = notes_to_frames(est_pitches, est_intervals,
                                               frame_est.shape)

        # Frame indices are multiplied by HOP_LENGTH / SAMPLE_RATE.
        seconds_per_frame = HOP_LENGTH / SAMPLE_RATE

        ref_intervals = (ref_intervals * seconds_per_frame).reshape(-1, 2)
        ref_pitches = to_hz(ref_pitches)
        est_intervals = (est_intervals * seconds_per_frame).reshape(-1, 2)
        est_pitches = to_hz(est_pitches)

        ref_times = ref_times.astype(np.float64) * seconds_per_frame
        ref_freqs = [to_hz(active) for active in ref_freqs]
        est_times = est_times.astype(np.float64) * seconds_per_frame
        est_freqs = [to_hz(active) for active in est_freqs]

        precision, recall, f1, overlap = evaluate_notes(ref_intervals,
                                                        ref_pitches,
                                                        est_intervals,
                                                        est_pitches,
                                                        offset_ratio=None)
        record('metric/note', precision, recall, f1, overlap)

        precision, recall, f1, overlap = evaluate_notes(ref_intervals,
                                                        ref_pitches,
                                                        est_intervals,
                                                        est_pitches)
        record('metric/note-with-offsets', precision, recall, f1, overlap)

        frame_scores = evaluate_frames(ref_times, ref_freqs, est_times,
                                       est_freqs)
        # eps keeps hmean away from zero-valued inputs; it is subtracted
        # again afterwards.
        frame_f1 = hmean([
            frame_scores['Precision'] + eps,
            frame_scores['Recall'] + eps,
        ]) - eps
        results['metric/frame/f1'].append(frame_f1)

        for name, score in frame_scores.items():
            results['metric/frame/' +
                    name.lower().replace(' ', '_')].append(score)

        if plot_example:
            pred_array = pred.cpu().numpy().transpose(1, 2, 0)
            label_array = sample['shifted_label'][delay:].cpu().numpy()
            os.makedirs(save_path, exist_ok=True)
            # Output files share the stem of the item's source path.
            stem = str(Path(save_path) / Path(batch['path'][idx]).stem)
            np.save(stem + '_label.npy', label_array)
            np.save(stem + f'_pred_{sampling_method}.npy', pred_array)
            draw_predictions_with_label(stem + '_pred.png', pred_array,
                                        label_array)

    return results, acc_conf
def note_transcribe(input_file_name, output_file_name, model_path, ensemble,
                    hop_size, sr, onset_threshold, frame_threshold,
                    export_midi, device):
    """
    Transcribe piano notes with deep learning model and write in lines with
    (onset offset F0) form or MIDI file.

    Parameters
    ----------
    input_file_name: Audio file loaded with ``librosa.load`` at rate ``sr``.
    output_file_name: Destination passed to ``save_midi`` or ``_write_tsv``.
    model_path: A single checkpoint when ``ensemble`` is ``None``; otherwise
        a directory searched for ``*.trm`` checkpoints.
    ensemble: ``None`` (single model), ``'mean'`` (average the model outputs)
        or ``'vote'`` (sum thresholded onset/frame decisions, average the
        velocities).
    hop_size, sr: Forwarded to ``_write_tsv`` for the text output.
    onset_threshold, frame_threshold: Thresholds given to ``extract_notes``
        and, in ``'vote'`` mode, used to binarise each model's output.
    export_midi: Write a MIDI file when true, otherwise a TSV file.
    device: Torch device used for the audio tensor and the models.

    Raises
    ------
    ValueError: ``ensemble`` is not ``None``, ``'mean'`` or ``'vote'``.
    FileNotFoundError: Ensemble mode found no ``*.trm`` file in
        ``model_path``.
    """
    # An unknown mode used to leave the accumulators at zero and silently
    # produce an empty transcription; fail loudly instead.
    if ensemble not in (None, 'mean', 'vote'):
        raise ValueError(
            f"ensemble must be None, 'mean' or 'vote', got {ensemble!r}")

    audio, _ = librosa.load(input_file_name, sr=sr)

    # Protection code for audio > 1
    peak = np.max(np.abs(audio))
    if peak > 1:
        audio = audio / peak
    audio_tensor = torch.from_numpy(audio).to(device).unsqueeze(0)

    melspec = MelSpectrogram(N_MELS,
                             SAMPLE_RATE,
                             WINDOW_LENGTH,
                             HOP_LENGTH,
                             mel_fmin=MEL_FMIN,
                             mel_fmax=MEL_FMAX).to(device)

    # Pure inference: disable autograd so no graph is kept for the forward
    # passes.
    with torch.no_grad():
        mel = (melspec(
            audio_tensor.reshape(-1, audio_tensor.shape[-1])[:, :-1])
               .transpose(-1, -2))

        if ensemble is not None:
            checkpoint_paths = list(Path(model_path).glob('*.trm'))
            if not checkpoint_paths:
                raise FileNotFoundError(
                    f"no '*.trm' checkpoints found in {model_path!r}")
            n_models = len(checkpoint_paths)

            pred_shape = (mel.shape[0], mel.shape[1],
                          MAX_MIDI - MIN_MIDI + 1)
            onset_pred = torch.zeros(pred_shape, device=device)
            frame_pred = torch.zeros(pred_shape, device=device)
            vel_pred = torch.zeros(pred_shape, device=device)

            # The loop variable is deliberately not named ``model_path`` so
            # the parameter is not overwritten.
            for checkpoint_path in checkpoint_paths:
                model = load_transcriber(checkpoint_path).to(device).eval()
                # extract_notes does not use offset, so it is discarded.
                onset_part, _, _, frame_part, vel_part = model(mel)
                if ensemble == 'mean':
                    onset_pred += onset_part / n_models
                    frame_pred += frame_part / n_models
                    vel_pred += vel_part / n_models
                else:  # 'vote'
                    # NOTE(review): the vote totals are compared against
                    # onset_threshold/frame_threshold below, so with a
                    # threshold under 1 a single vote is enough -- confirm
                    # this is the intended rule (vs. majority).
                    onset_pred += (onset_part > onset_threshold).type(
                        torch.float)
                    frame_pred += (frame_part > frame_threshold).type(
                        torch.float)
                    # Velocity is averaged rather than summed so it does not
                    # grow with the number of models.
                    vel_pred += vel_part / n_models
                del model  # drop the reference before loading the next one
        else:
            model = load_transcriber(model_path).to(device).eval()
            onset_pred, _, _, frame_pred, vel_pred = model(mel)

    onset_pred = onset_pred.squeeze()
    frame_pred = frame_pred.squeeze()
    vel_pred = vel_pred.squeeze()

    p_est, i_est, v_est = extract_notes(onset_pred, frame_pred, vel_pred,
                                        onset_threshold, frame_threshold)

    if export_midi:
        # NOTE(review): MIDI timing uses HOP_LENGTH / SAMPLE_RATE while the
        # TSV branch receives hop_size / sr -- confirm they always agree.
        scaling = HOP_LENGTH / SAMPLE_RATE
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])
        save_midi(output_file_name, p_est, i_est, v_est)
    else:
        _write_tsv(p_est, i_est, output_file_name, hop_size, sr)
def evaluate(data, model, logging_info, onset_threshold=0.5,
             frame_threshold=0.5, save_path=None):
    """
    Evaluate ``model`` on every item of ``data`` and write per-song and
    aggregate CSV reports.

    Parameters
    ----------
    data: Iterable of label dicts; reads the ``'path'``, ``'onset'``,
        ``'frame'`` and ``'velocity'`` entries.
    model: Object whose ``run_on_batch(label)`` returns ``(pred, losses)``,
        both dict-like.
    logging_info: Pair ``(model_file, dataset_name)`` of strings used to
        build the CSV file names.
    onset_threshold, frame_threshold: Thresholds passed to ``extract_notes``
        for the predictions.
    save_path: When not ``None``, piano-roll images and a predicted MIDI file
        are written there for every song.

    Returns
    -------
    metrics: ``defaultdict(list)`` with one value per song under each loss
        and ``'metric/...'`` key.
    """
    metrics = defaultdict(list)
    song_names = []

    for label in data:
        # Keep the file name as a plain string ([-1:] would store a
        # one-element list in the table).
        song_names.append(label['path'].split("/")[-1])

        pred, losses = model.run_on_batch(label)

        for key, loss in losses.items():
            metrics[key].append(loss.item())

        # In place: drop the leading axis and clamp negatives to zero.
        for value in pred.values():
            value.squeeze_(0).relu_()

        p_ref, i_ref, v_ref = extract_notes(label['onset'], label['frame'],
                                            label['velocity'])
        p_est, i_est, v_est = extract_notes(pred['onset'], pred['frame'],
                                            pred['velocity'],
                                            onset_threshold, frame_threshold)

        t_ref, f_ref = notes_to_frames(p_ref, i_ref, label['frame'].shape)
        t_est, f_est = notes_to_frames(p_est, i_est, pred['frame'].shape)

        # Frame indices are multiplied by HOP_LENGTH / SAMPLE_RATE; pitch
        # indices are offset by MIN_MIDI before conversion to Hz.
        scaling = HOP_LENGTH / SAMPLE_RATE

        i_ref = (i_ref * scaling).reshape(-1, 2)
        p_ref = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_ref])
        i_est = (i_est * scaling).reshape(-1, 2)
        p_est = np.array([midi_to_hz(MIN_MIDI + midi) for midi in p_est])

        t_ref = t_ref.astype(np.float64) * scaling
        f_ref = [np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
                 for freqs in f_ref]
        t_est = t_est.astype(np.float64) * scaling
        f_est = [np.array([midi_to_hz(MIN_MIDI + midi) for midi in freqs])
                 for freqs in f_est]

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est,
                                    offset_ratio=None)
        metrics['metric/note/precision'].append(p)
        metrics['metric/note/recall'].append(r)
        metrics['metric/note/f1'].append(f)
        metrics['metric/note/overlap'].append(o)

        p, r, f, o = evaluate_notes(i_ref, p_ref, i_est, p_est)
        metrics['metric/note-with-offsets/precision'].append(p)
        metrics['metric/note-with-offsets/recall'].append(r)
        metrics['metric/note-with-offsets/f1'].append(f)
        metrics['metric/note-with-offsets/overlap'].append(o)

        p, r, f, o = evaluate_notes_with_velocity(i_ref, p_ref, v_ref,
                                                  i_est, p_est, v_est,
                                                  offset_ratio=None,
                                                  velocity_tolerance=0.1)
        metrics['metric/note-with-velocity/precision'].append(p)
        metrics['metric/note-with-velocity/recall'].append(r)
        metrics['metric/note-with-velocity/f1'].append(f)
        metrics['metric/note-with-velocity/overlap'].append(o)

        p, r, f, o = evaluate_notes_with_velocity(i_ref, p_ref, v_ref,
                                                  i_est, p_est, v_est,
                                                  velocity_tolerance=0.1)
        metrics['metric/note-with-offsets-and-velocity/precision'].append(p)
        metrics['metric/note-with-offsets-and-velocity/recall'].append(r)
        metrics['metric/note-with-offsets-and-velocity/f1'].append(f)
        metrics['metric/note-with-offsets-and-velocity/overlap'].append(o)

        frame_metrics = evaluate_frames(t_ref, f_ref, t_est, f_est)
        # eps keeps hmean away from zero-valued inputs; it is subtracted
        # again afterwards.
        metrics['metric/frame/f1'].append(
            hmean([frame_metrics['Precision'] + eps,
                   frame_metrics['Recall'] + eps]) - eps)

        for key, value in frame_metrics.items():
            metrics['metric/frame/' +
                    key.lower().replace(' ', '_')].append(value)

        if save_path is not None:
            os.makedirs(save_path, exist_ok=True)
            song_file = os.path.basename(label['path'])
            label_path = os.path.join(save_path, song_file + '.label.png')
            save_pianoroll(label_path, label['onset'], label['frame'])
            pred_path = os.path.join(save_path, song_file + '.pred.png')
            save_pianoroll(pred_path, pred['onset'], pred['frame'])
            midi_path = os.path.join(save_path, song_file + '.pred.mid')
            save_midi(midi_path, p_est, i_est, v_est)

    # Creating a table of results for each song, and sorting by
    # note_with_offset F1 score
    rename_dict = {}
    for key in metrics:
        if key.startswith('loss/'):
            _, category = key.split('/')
            rename_dict[key] = category + " loss"
        elif key.startswith('metric/'):
            _, category, name = key.split('/')
            rename_dict[key] = category + " " + name

    model_file, dataset_name = logging_info
    log_str = (model_file + "_" + dataset_name).replace("/", "-")

    evaluation_by_song_df = pd.DataFrame.from_dict(
        metrics, orient='index').transpose()
    evaluation_by_song_df.insert(0, "song_name", song_names)
    evaluation_by_song_df.rename(columns=rename_dict, inplace=True)

    # sort_values returns a new frame; the result must be kept for the sort
    # to take effect. The column is absent only when ``data`` was empty.
    sort_column = "note-with-offsets f1"
    if sort_column in evaluation_by_song_df.columns:
        evaluation_by_song_df = evaluation_by_song_df.sort_values(
            sort_column, ascending=True)

    # Make sure both report locations exist before writing.
    os.makedirs("./evaluations/new_evals", exist_ok=True)
    evaluation_by_song_df.to_csv(
        "./evaluations/new_evals" + log_str + "_by_song.csv", index=False)

    # numeric_only skips the song_name column, which cannot be averaged.
    model_df = evaluation_by_song_df.mean(numeric_only=True)
    # NOTE(review): index=False drops the metric names from this file --
    # confirm that consumers really expect unlabeled values.
    model_df.to_csv(
        "./evaluations/new_evals/new_evals" + log_str + "_model.csv",
        index=False)

    return metrics