def _sequence_to_pianoroll_fn(sequence_tensor, velocity_range_tensor,
                              instrument_family=None, hparams=None):
  """Converts sequence to pianorolls."""
  if instrument_family is not None and instrument_family < 0:
    instrument_family = None
  velocity_range = music_pb2.VelocityRange.FromString(
      velocity_range_tensor.numpy())
  sequence = music_pb2.NoteSequence.FromString(sequence_tensor.numpy())
  sequence = sequences_lib.apply_sustain_control_changes(sequence)
  roll = sequences_lib.sequence_to_pianoroll(
      sequence,
      frames_per_second=hparams_frames_per_second(hparams),
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH,
      min_frame_occupancy_for_label=hparams.min_frame_occupancy_for_label,
      onset_mode=hparams.onset_mode,
      onset_length_ms=hparams.onset_length,
      offset_length_ms=hparams.offset_length,
      onset_delay_ms=hparams.onset_delay,
      min_velocity=velocity_range.min,
      max_velocity=velocity_range.max,
      instrument_family=instrument_family,
      use_drums=hparams.use_drums,
      timbre_num_classes=hparams.timbre_num_classes)
  return (roll.active, roll.weights, roll.onsets, roll.onset_velocities,
          roll.offsets)
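Because this function calls .numpy() on its tensor arguments, it only runs eagerly. A minimal sketch of wiring it into a tf.data pipeline via tf.py_function follows; the dataset variable sequence_velocity_ds (yielding serialized NoteSequence and VelocityRange string tensors) and the in-scope hparams are assumptions, not part of the original code:

import functools
import tensorflow as tf

# Hypothetical: sequence_velocity_ds yields (serialized NoteSequence,
# serialized VelocityRange) string tensor pairs.
pianoroll_ds = sequence_velocity_ds.map(
    lambda seq, vel: tf.py_function(
        functools.partial(_sequence_to_pianoroll_fn, hparams=hparams),
        inp=[seq, vel],
        # active, weights, onsets, onset_velocities, offsets
        Tout=[tf.float32] * 5))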
def _ExampleToInputs(self, ex, truncated_length=0,
                     crop_training_sequence_to_notes=False):
  hparams = copy.deepcopy(constants.DEFAULT_HPARAMS)
  hparams.crop_training_sequence_to_notes = crop_training_sequence_to_notes
  filename = ex.features.feature['id'].bytes_list.value[0]
  sequence, crop_beginning_seconds = data.preprocess_sequence(
      ex.features.feature['sequence'].bytes_list.value[0], hparams)
  wav_data = ex.features.feature['audio'].bytes_list.value[0]
  if crop_training_sequence_to_notes:
    wav_data = audio_io.crop_wav_data(wav_data, hparams.sample_rate,
                                      crop_beginning_seconds,
                                      sequence.total_time)
  spec = data.wav_to_spec(wav_data, hparams=hparams)
  roll = sequences_lib.sequence_to_pianoroll(
      sequence,
      frames_per_second=data.hparams_frames_per_second(hparams),
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH,
      min_frame_occupancy_for_label=0.0,
      onset_mode='length_ms',
      onset_length_ms=32.,
      onset_delay_ms=0.)
  length = data.wav_to_num_frames(
      wav_data, frames_per_second=data.hparams_frames_per_second(hparams))
  return self._DataToInputs(spec, roll.active, roll.weights, length,
                            filename, truncated_length)
def _ExampleToInputs(self, ex, truncated_length=0):
  hparams = copy.deepcopy(configs.DEFAULT_HPARAMS)
  filename = ex.features.feature['id'].bytes_list.value[0]
  sequence = music_pb2.NoteSequence.FromString(
      ex.features.feature['sequence'].bytes_list.value[0])
  wav_data = ex.features.feature['audio'].bytes_list.value[0]
  spec = data.wav_to_spec(wav_data, hparams=hparams)
  roll = sequences_lib.sequence_to_pianoroll(
      sequence,
      frames_per_second=data.hparams_frames_per_second(hparams),
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH,
      min_frame_occupancy_for_label=0.0,
      onset_mode='length_ms',
      onset_length_ms=32.,
      onset_delay_ms=0.)
  length = data.wav_to_num_frames(
      wav_data, frames_per_second=data.hparams_frames_per_second(hparams))
  return self._DataToInputs(spec, roll.active, roll.weights, length,
                            filename, truncated_length)
def magenta_frame_eval(pred_seq, frame_labels):
  processed_frame_predictions = sequences_lib.sequence_to_pianoroll(
      pred_seq, frames_per_second=16000 / 512, min_pitch=21,
      max_pitch=108).active
  if processed_frame_predictions.shape[0] < frame_labels.shape[0]:
    # Pad transcribed frames with silence.
    pad_length = frame_labels.shape[0] - processed_frame_predictions.shape[0]
    processed_frame_predictions = np.pad(processed_frame_predictions,
                                         [(0, pad_length), (0, 0)],
                                         'constant')
  elif processed_frame_predictions.shape[0] > frame_labels.shape[0]:
    # Truncate transcribed frames.
    processed_frame_predictions = (
        processed_frame_predictions[:frame_labels.shape[0], :])
  frame_metrics = magenta_metrics.calculate_frame_metrics(
      frame_labels=frame_labels,
      frame_predictions=processed_frame_predictions)
  results = defaultdict(list)
  for key, value in frame_metrics.items():
    results[key] = value[0].numpy()
  return results
def posterior_pianoroll_image(frame_probs, sequence_prediction, frame_labels,
                              frames_per_second, overlap=False):
  """Create a pianoroll image showing frame posteriors, predictions & labels."""
  frame_predictions = sequences_lib.sequence_to_pianoroll(
      sequence_prediction,
      frames_per_second=frames_per_second,
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH).active
  if frame_predictions.shape[0] < frame_labels.shape[0]:
    # Pad transcribed frames with silence.
    pad_length = frame_labels.shape[0] - frame_predictions.shape[0]
    frame_predictions = np.pad(
        frame_predictions, [(0, pad_length), (0, 0)], 'constant')
  elif frame_predictions.shape[0] > frame_labels.shape[0]:
    # Truncate transcribed frames.
    frame_predictions = frame_predictions[:frame_labels.shape[0], :]

  pianoroll_img = np.zeros([len(frame_probs), 3 * len(frame_probs[0]), 3])
  if overlap:
    # Show overlap in yellow.
    pianoroll_img[:, :, 0] = np.concatenate(
        [np.array(frame_labels),
         np.array(frame_predictions),
         np.array(frame_probs)],
        axis=1)
    pianoroll_img[:, :, 1] = np.concatenate(
        [np.array(frame_labels),
         np.array(frame_labels),
         np.array(frame_labels)],
        axis=1)
    pianoroll_img[:, :, 2] = np.concatenate(
        [np.array(frame_labels),
         np.zeros_like(frame_predictions),
         np.zeros_like(np.array(frame_probs))],
        axis=1)
  else:
    # Show only red and green.
    pianoroll_img[:, :, 0] = np.concatenate(
        [np.array(frame_labels),
         np.array(frame_predictions) * (1.0 - np.array(frame_labels)),
         np.array(frame_probs) * (1.0 - np.array(frame_labels))],
        axis=1)
    pianoroll_img[:, :, 1] = np.concatenate(
        [np.array(frame_labels),
         np.array(frame_predictions) * np.array(frame_labels),
         np.array(frame_probs) * np.array(frame_labels)],
        axis=1)
    pianoroll_img[:, :, 2] = np.concatenate(
        [np.array(frame_labels),
         np.zeros_like(frame_predictions),
         np.zeros_like(np.array(frame_probs))],
        axis=1)

  return np.flipud(np.transpose(pianoroll_img, [1, 0, 2]))
def posterior_pianoroll_image(frame_probs, sequence_prediction, frame_labels,
                              frames_per_second, overlap=False):
  """Create a pianoroll image showing frame posteriors, predictions & labels."""
  frame_predictions, _, _, _, _ = sequences_lib.sequence_to_pianoroll(
      sequence_prediction,
      frames_per_second=frames_per_second,
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH)
  if frame_predictions.shape[0] < frame_labels.shape[0]:
    # Pad transcribed frames with silence.
    pad_length = frame_labels.shape[0] - frame_predictions.shape[0]
    frame_predictions = np.pad(
        frame_predictions, [(0, pad_length), (0, 0)], 'constant')
  elif frame_predictions.shape[0] > frame_labels.shape[0]:
    # Truncate transcribed frames.
    frame_predictions = frame_predictions[:frame_labels.shape[0], :]

  pianoroll_img = np.zeros([len(frame_probs), 3 * len(frame_probs[0]), 3])
  if overlap:
    # Show overlap in yellow.
    pianoroll_img[:, :, 0] = np.concatenate(
        [np.array(frame_labels),
         np.array(frame_predictions),
         np.array(frame_probs)],
        axis=1)
    pianoroll_img[:, :, 1] = np.concatenate(
        [np.array(frame_labels),
         np.array(frame_labels),
         np.array(frame_labels)],
        axis=1)
    pianoroll_img[:, :, 2] = np.concatenate(
        [np.array(frame_labels),
         np.zeros_like(frame_predictions),
         np.zeros_like(np.array(frame_probs))],
        axis=1)
  else:
    # Show only red and green.
    pianoroll_img[:, :, 0] = np.concatenate(
        [np.array(frame_labels),
         np.array(frame_predictions) * (1.0 - np.array(frame_labels)),
         np.array(frame_probs) * (1.0 - np.array(frame_labels))],
        axis=1)
    pianoroll_img[:, :, 1] = np.concatenate(
        [np.array(frame_labels),
         np.array(frame_predictions) * np.array(frame_labels),
         np.array(frame_probs) * np.array(frame_labels)],
        axis=1)
    pianoroll_img[:, :, 2] = np.concatenate(
        [np.array(frame_labels),
         np.zeros_like(frame_predictions),
         np.zeros_like(np.array(frame_probs))],
        axis=1)

  return np.flipud(np.transpose(pianoroll_img, [1, 0, 2]))
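A minimal sketch of writing the returned image to disk; the output path is hypothetical, and the 16000 / 512 frame rate echoes the value used elsewhere in this section. This mirrors the imageio call in model_inference below:

import imageio
import numpy as np

img = posterior_pianoroll_image(frame_probs, sequence_prediction,
                                frame_labels, frames_per_second=16000 / 512)
# The image is float-valued in [0, 1]; scale to uint8 before encoding.
imageio.imwrite('/tmp/pianoroll.png', (img * 255).astype(np.uint8))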
def sequence_to_pianoroll_fn(sequence_tensor, velocity_range_tensor):
  velocity_range = music_pb2.VelocityRange.FromString(velocity_range_tensor)
  sequence = preprocess_sequence(sequence_tensor)
  return sequences_lib.sequence_to_pianoroll(
      sequence,
      frames_per_second=hparams_frames_per_second(hparams),
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH,
      min_frame_occupancy_for_label=hparams.min_frame_occupancy_for_label,
      onset_mode=hparams.onset_mode,
      onset_length_ms=hparams.onset_length,
      onset_delay_ms=hparams.onset_delay,
      min_velocity=velocity_range.min,
      max_velocity=velocity_range.max)
def sequence_to_pianoroll_fn(sequence_tensor, velocity_range_tensor):
  """Converts sequence to pianorolls."""
  velocity_range = music_pb2.VelocityRange.FromString(velocity_range_tensor)
  sequence = preprocess_sequence(sequence_tensor)
  roll = sequences_lib.sequence_to_pianoroll(
      sequence,
      frames_per_second=hparams_frames_per_second(hparams),
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH,
      min_frame_occupancy_for_label=hparams.min_frame_occupancy_for_label,
      onset_mode=hparams.onset_mode,
      onset_length_ms=hparams.onset_length,
      offset_length_ms=hparams.offset_length,
      onset_delay_ms=hparams.onset_delay,
      min_velocity=velocity_range.min,
      max_velocity=velocity_range.max)
  return (roll.active, roll.weights, roll.onsets, roll.offsets,
          roll.onset_velocities)
def sequence_to_pianoroll_fn(sequence_tensor, velocity_range_tensor):
  """Converts sequence to pianorolls."""
  velocity_range = music_pb2.VelocityRange.FromString(velocity_range_tensor)
  sequence = music_pb2.NoteSequence.FromString(sequence_tensor)
  sequence = sequences_lib.apply_sustain_control_changes(sequence)
  roll = sequences_lib.sequence_to_pianoroll(
      sequence,
      frames_per_second=hparams_frames_per_second(hparams),
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH,
      min_frame_occupancy_for_label=hparams.min_frame_occupancy_for_label,
      onset_mode=hparams.onset_mode,
      onset_length_ms=hparams.onset_length,
      offset_length_ms=hparams.offset_length,
      onset_delay_ms=hparams.onset_delay,
      min_velocity=velocity_range.min,
      max_velocity=velocity_range.max)
  return (roll.active, roll.weights, roll.onsets, roll.onset_velocities,
          roll.offsets)
def sequence_to_pianoroll_fn(sequence_tensor, velocity_range_tensor):
  """Converts sequence to pianorolls."""
  velocity_range = music_pb2.VelocityRange.FromString(velocity_range_tensor)
  sequence, unused_cropped_beginning_seconds = preprocess_sequence(
      sequence_tensor, hparams)
  roll = sequences_lib.sequence_to_pianoroll(
      sequence,
      frames_per_second=hparams_frames_per_second(hparams),
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH,
      min_frame_occupancy_for_label=hparams.min_frame_occupancy_for_label,
      onset_mode=hparams.onset_mode,
      onset_length_ms=hparams.onset_length,
      offset_length_ms=hparams.offset_length,
      onset_delay_ms=hparams.onset_delay,
      min_velocity=velocity_range.min,
      max_velocity=velocity_range.max)
  return (roll.active, roll.weights, roll.onsets, roll.onset_velocities,
          roll.offsets)
def seq_to_pianoroll(seq):
  return sequences_lib.sequence_to_pianoroll(
      seq, frames_per_second=16000 / 512, min_pitch=21, max_pitch=108).active
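For reference, a usage sketch of seq_to_pianoroll on a hand-built one-note NoteSequence; the import path for music_pb2 varies between Magenta releases and is an assumption here, and the note values are arbitrary:

from note_seq.protobuf import music_pb2  # assumed import path

ns = music_pb2.NoteSequence()
note = ns.notes.add()
note.pitch = 60          # middle C
note.velocity = 80
note.start_time = 0.0
note.end_time = 0.5
ns.total_time = 0.5

active = seq_to_pianoroll(ns)  # frames x 88 matrix (MIDI pitches 21-108)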
def score_sequence(session, global_step_increment, summary_op, summary_writer,
                   metrics_to_updates, metric_note_precision,
                   metric_note_recall, metric_note_f1,
                   metric_note_precision_with_offsets,
                   metric_note_recall_with_offsets,
                   metric_note_f1_with_offsets, metric_frame_labels,
                   metric_frame_predictions, frame_labels,
                   sequence_prediction, frames_per_second,
                   note_sequence_str_label, min_duration_ms, sequence_id):
  """Calculate metrics on the inferred sequence."""
  est_intervals, est_pitches = sequence_to_valued_intervals(
      sequence_prediction, min_duration_ms=min_duration_ms)

  sequence_label = music_pb2.NoteSequence.FromString(note_sequence_str_label)
  ref_intervals, ref_pitches = sequence_to_valued_intervals(
      sequence_label, min_duration_ms=min_duration_ms)

  sequence_note_precision, sequence_note_recall, sequence_note_f1, _ = (
      mir_eval.transcription.precision_recall_f1_overlap(
          ref_intervals,
          pretty_midi.note_number_to_hz(ref_pitches),
          est_intervals,
          pretty_midi.note_number_to_hz(est_pitches),
          offset_ratio=None))

  (sequence_note_precision_with_offsets, sequence_note_recall_with_offsets,
   sequence_note_f1_with_offsets, _) = (
       mir_eval.transcription.precision_recall_f1_overlap(
           ref_intervals, pretty_midi.note_number_to_hz(ref_pitches),
           est_intervals, pretty_midi.note_number_to_hz(est_pitches)))

  frame_predictions = sequences_lib.sequence_to_pianoroll(
      sequence_prediction,
      frames_per_second=frames_per_second,
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH).active
  if frame_predictions.shape[0] < frame_labels.shape[0]:
    # Pad transcribed frames with silence.
    pad_length = frame_labels.shape[0] - frame_predictions.shape[0]
    frame_predictions = np.pad(frame_predictions,
                               [(0, pad_length), (0, 0)], 'constant')
  elif frame_predictions.shape[0] > frame_labels.shape[0]:
    # Truncate transcribed frames.
    frame_predictions = frame_predictions[:frame_labels.shape[0], :]

  global_step, _ = session.run(
      [global_step_increment, metrics_to_updates], {
          metric_frame_predictions: frame_predictions,
          metric_frame_labels: frame_labels,
          metric_note_precision: sequence_note_precision,
          metric_note_recall: sequence_note_recall,
          metric_note_f1: sequence_note_f1,
          metric_note_precision_with_offsets:
              sequence_note_precision_with_offsets,
          metric_note_recall_with_offsets: sequence_note_recall_with_offsets,
          metric_note_f1_with_offsets: sequence_note_f1_with_offsets
      })

  # Running the summary op separately ensures that all of the metrics have
  # been updated before we try to query them.
  summary = session.run(summary_op)

  tf.logging.info('Writing score summary for %s: Step= %d, Note F1=%f',
                  sequence_id, global_step, sequence_note_f1)
  summary_writer.add_summary(summary, global_step)
  summary_writer.flush()

  return sequence_label
def score_sequence(session, global_step_increment, metrics_to_updates,
                   metric_note_precision, metric_note_recall, metric_note_f1,
                   metric_note_precision_with_offsets,
                   metric_note_recall_with_offsets,
                   metric_note_f1_with_offsets,
                   metric_note_precision_with_offsets_velocity,
                   metric_note_recall_with_offsets_velocity,
                   metric_note_f1_with_offsets_velocity, metric_frame_labels,
                   metric_frame_predictions, frame_labels,
                   sequence_prediction, frames_per_second, sequence_label,
                   sequence_id):
  """Calculate metrics on the inferred sequence."""
  est_intervals, est_pitches, est_velocities = sequence_to_valued_intervals(
      sequence_prediction)
  ref_intervals, ref_pitches, ref_velocities = sequence_to_valued_intervals(
      sequence_label)

  sequence_note_precision, sequence_note_recall, sequence_note_f1, _ = (
      mir_eval.transcription.precision_recall_f1_overlap(
          ref_intervals,
          pretty_midi.note_number_to_hz(ref_pitches),
          est_intervals,
          pretty_midi.note_number_to_hz(est_pitches),
          offset_ratio=None))

  (sequence_note_precision_with_offsets, sequence_note_recall_with_offsets,
   sequence_note_f1_with_offsets, _) = (
       mir_eval.transcription.precision_recall_f1_overlap(
           ref_intervals, pretty_midi.note_number_to_hz(ref_pitches),
           est_intervals, pretty_midi.note_number_to_hz(est_pitches)))

  (sequence_note_precision_with_offsets_velocity,
   sequence_note_recall_with_offsets_velocity,
   sequence_note_f1_with_offsets_velocity, _) = (
       mir_eval.transcription_velocity.precision_recall_f1_overlap(
           ref_intervals=ref_intervals,
           ref_pitches=pretty_midi.note_number_to_hz(ref_pitches),
           ref_velocities=ref_velocities,
           est_intervals=est_intervals,
           est_pitches=pretty_midi.note_number_to_hz(est_pitches),
           est_velocities=est_velocities))

  frame_predictions = sequences_lib.sequence_to_pianoroll(
      sequence_prediction,
      frames_per_second=frames_per_second,
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH).active
  if frame_predictions.shape[0] < frame_labels.shape[0]:
    # Pad transcribed frames with silence.
    pad_length = frame_labels.shape[0] - frame_predictions.shape[0]
    frame_predictions = np.pad(frame_predictions,
                               [(0, pad_length), (0, 0)], 'constant')
  elif frame_predictions.shape[0] > frame_labels.shape[0]:
    # Truncate transcribed frames.
    frame_predictions = frame_predictions[:frame_labels.shape[0], :]

  global_step, _ = session.run(
      [global_step_increment, metrics_to_updates], {
          metric_frame_predictions: frame_predictions,
          metric_frame_labels: frame_labels,
          metric_note_precision: sequence_note_precision,
          metric_note_recall: sequence_note_recall,
          metric_note_f1: sequence_note_f1,
          metric_note_precision_with_offsets:
              sequence_note_precision_with_offsets,
          metric_note_recall_with_offsets: sequence_note_recall_with_offsets,
          metric_note_f1_with_offsets: sequence_note_f1_with_offsets,
          metric_note_precision_with_offsets_velocity:
              sequence_note_precision_with_offsets_velocity,
          metric_note_recall_with_offsets_velocity:
              sequence_note_recall_with_offsets_velocity,
          metric_note_f1_with_offsets_velocity:
              sequence_note_f1_with_offsets_velocity,
      })

  tf.logging.info('Updating scores for %s: Step= %d, Note F1=%f',
                  sequence_id, global_step, sequence_note_f1)
def _calculate_metrics_py(frame_predictions, onset_predictions,
                          offset_predictions, velocity_values,
                          sequence_label_str, frame_labels, sequence_id,
                          hparams):
  """Python logic for calculating metrics on a single example."""
  tf.logging.info('Calculating metrics for %s with length %d', sequence_id,
                  frame_labels.shape[0])
  if not hparams.predict_onset_threshold:
    onset_predictions = None
  if not hparams.predict_offset_threshold:
    offset_predictions = None

  sequence_prediction = sequences_lib.pianoroll_to_note_sequence(
      frames=frame_predictions,
      frames_per_second=data.hparams_frames_per_second(hparams),
      min_duration_ms=0,
      min_midi_pitch=constants.MIN_MIDI_PITCH,
      onset_predictions=onset_predictions,
      offset_predictions=offset_predictions,
      velocity_values=velocity_values)

  sequence_label = music_pb2.NoteSequence.FromString(sequence_label_str)

  if hparams.backward_shift_amount_ms:

    def shift_notesequence(ns_time):
      return ns_time + hparams.backward_shift_amount_ms / 1000.

    shifted_sequence_label, skipped_notes = (
        sequences_lib.adjust_notesequence_times(sequence_label,
                                                shift_notesequence))
    assert skipped_notes == 0
    sequence_label = shifted_sequence_label

  est_intervals, est_pitches, est_velocities = (
      infer_util.sequence_to_valued_intervals(sequence_prediction))
  ref_intervals, ref_pitches, ref_velocities = (
      infer_util.sequence_to_valued_intervals(sequence_label))

  note_precision, note_recall, note_f1, _ = (
      mir_eval.transcription.precision_recall_f1_overlap(
          ref_intervals,
          pretty_midi.note_number_to_hz(ref_pitches),
          est_intervals,
          pretty_midi.note_number_to_hz(est_pitches),
          offset_ratio=None))

  (note_with_offsets_precision, note_with_offsets_recall,
   note_with_offsets_f1, _) = (
       mir_eval.transcription.precision_recall_f1_overlap(
           ref_intervals, pretty_midi.note_number_to_hz(ref_pitches),
           est_intervals, pretty_midi.note_number_to_hz(est_pitches)))

  (note_with_offsets_velocity_precision, note_with_offsets_velocity_recall,
   note_with_offsets_velocity_f1, _) = (
       mir_eval.transcription_velocity.precision_recall_f1_overlap(
           ref_intervals=ref_intervals,
           ref_pitches=pretty_midi.note_number_to_hz(ref_pitches),
           ref_velocities=ref_velocities,
           est_intervals=est_intervals,
           est_pitches=pretty_midi.note_number_to_hz(est_pitches),
           est_velocities=est_velocities))

  processed_frame_predictions = sequences_lib.sequence_to_pianoroll(
      sequence_prediction,
      frames_per_second=data.hparams_frames_per_second(hparams),
      min_pitch=constants.MIN_MIDI_PITCH,
      max_pitch=constants.MAX_MIDI_PITCH).active

  if processed_frame_predictions.shape[0] < frame_labels.shape[0]:
    # Pad transcribed frames with silence.
    pad_length = frame_labels.shape[0] - processed_frame_predictions.shape[0]
    processed_frame_predictions = np.pad(processed_frame_predictions,
                                         [(0, pad_length), (0, 0)],
                                         'constant')
  elif processed_frame_predictions.shape[0] > frame_labels.shape[0]:
    # Truncate transcribed frames.
    processed_frame_predictions = (
        processed_frame_predictions[:frame_labels.shape[0], :])

  tf.logging.info(
      'Metrics for %s: Note F1 %f, Note w/ offsets F1 %f, '
      'Note w/ offsets & velocity: %f', sequence_id, note_f1,
      note_with_offsets_f1, note_with_offsets_velocity_f1)
  return (note_precision, note_recall, note_f1, note_with_offsets_precision,
          note_with_offsets_recall, note_with_offsets_f1,
          note_with_offsets_velocity_precision,
          note_with_offsets_velocity_recall, note_with_offsets_velocity_f1,
          processed_frame_predictions)
def _calculate_metrics_py(frame_probs, onset_probs, frame_predictions,
                          onset_predictions, offset_predictions,
                          velocity_values, sequence_label_str, frame_labels,
                          sequence_id, hparams, min_pitch, max_pitch,
                          onsets_only, restrict_to_pitch=None):
  """Python logic for calculating metrics on a single example."""
  tf.logging.info('Calculating metrics for %s with length %d', sequence_id,
                  frame_labels.shape[0])

  sequence_prediction = infer_util.predict_sequence(
      frame_probs=frame_probs,
      onset_probs=onset_probs,
      frame_predictions=frame_predictions,
      onset_predictions=onset_predictions,
      offset_predictions=offset_predictions,
      velocity_values=velocity_values,
      min_pitch=min_pitch,
      hparams=hparams,
      onsets_only=onsets_only)

  sequence_label = music_pb2.NoteSequence.FromString(sequence_label_str)

  if hparams.backward_shift_amount_ms:

    def shift_notesequence(ns_time):
      return ns_time + hparams.backward_shift_amount_ms / 1000.

    shifted_sequence_label, skipped_notes = (
        sequences_lib.adjust_notesequence_times(sequence_label,
                                                shift_notesequence))
    assert skipped_notes == 0
    sequence_label = shifted_sequence_label

  est_intervals, est_pitches, est_velocities = (
      sequence_to_valued_intervals(
          sequence_prediction, restrict_to_pitch=restrict_to_pitch))
  ref_intervals, ref_pitches, ref_velocities = (
      sequence_to_valued_intervals(
          sequence_label, restrict_to_pitch=restrict_to_pitch))

  processed_frame_predictions = sequences_lib.sequence_to_pianoroll(
      sequence_prediction,
      frames_per_second=data.hparams_frames_per_second(hparams),
      min_pitch=min_pitch,
      max_pitch=max_pitch).active

  if processed_frame_predictions.shape[0] < frame_labels.shape[0]:
    # Pad transcribed frames with silence.
    pad_length = frame_labels.shape[0] - processed_frame_predictions.shape[0]
    processed_frame_predictions = np.pad(processed_frame_predictions,
                                         [(0, pad_length), (0, 0)],
                                         'constant')
  elif processed_frame_predictions.shape[0] > frame_labels.shape[0]:
    # Truncate transcribed frames.
    processed_frame_predictions = (
        processed_frame_predictions[:frame_labels.shape[0], :])

  if len(ref_pitches) == 0:
    tf.logging.info(
        'Reference pitches were length 0, returning empty metrics for %s:',
        sequence_id)
    return tuple([[]] * 12 + [processed_frame_predictions])

  note_precision, note_recall, note_f1, _ = (
      mir_eval.transcription.precision_recall_f1_overlap(
          ref_intervals,
          pretty_midi.note_number_to_hz(ref_pitches),
          est_intervals,
          pretty_midi.note_number_to_hz(est_pitches),
          offset_ratio=None))

  (note_with_velocity_precision, note_with_velocity_recall,
   note_with_velocity_f1, _) = (
       mir_eval.transcription_velocity.precision_recall_f1_overlap(
           ref_intervals=ref_intervals,
           ref_pitches=pretty_midi.note_number_to_hz(ref_pitches),
           ref_velocities=ref_velocities,
           est_intervals=est_intervals,
           est_pitches=pretty_midi.note_number_to_hz(est_pitches),
           est_velocities=est_velocities,
           offset_ratio=None))

  (note_with_offsets_precision, note_with_offsets_recall,
   note_with_offsets_f1, _) = (
       mir_eval.transcription.precision_recall_f1_overlap(
           ref_intervals, pretty_midi.note_number_to_hz(ref_pitches),
           est_intervals, pretty_midi.note_number_to_hz(est_pitches)))

  (note_with_offsets_velocity_precision, note_with_offsets_velocity_recall,
   note_with_offsets_velocity_f1, _) = (
       mir_eval.transcription_velocity.precision_recall_f1_overlap(
           ref_intervals=ref_intervals,
           ref_pitches=pretty_midi.note_number_to_hz(ref_pitches),
           ref_velocities=ref_velocities,
           est_intervals=est_intervals,
           est_pitches=pretty_midi.note_number_to_hz(est_pitches),
           est_velocities=est_velocities))

  tf.logging.info(
      'Metrics for %s: Note F1 %f, Note w/ velocity F1 %f, '
      'Note w/ offsets F1 %f, Note w/ offsets & velocity: %f', sequence_id,
      note_f1, note_with_velocity_f1, note_with_offsets_f1,
      note_with_offsets_velocity_f1)
  # Return 1-d tensors for the metrics.
  return ([note_precision], [note_recall], [note_f1],
          [note_with_velocity_precision], [note_with_velocity_recall],
          [note_with_velocity_f1], [note_with_offsets_precision],
          [note_with_offsets_recall], [note_with_offsets_f1],
          [note_with_offsets_velocity_precision],
          [note_with_offsets_velocity_recall],
          [note_with_offsets_velocity_f1], [processed_frame_predictions])
def model_inference(model_fn, model_dir, checkpoint_path, data_fn, hparams,
                    examples_path, output_dir, summary_writer, master,
                    preprocess_examples, shuffle_examples):
  """Runs inference for the given examples."""
  tf.logging.info('model_dir=%s', model_dir)
  tf.logging.info('checkpoint_path=%s', checkpoint_path)
  tf.logging.info('examples_path=%s', examples_path)
  tf.logging.info('output_dir=%s', output_dir)

  estimator = train_util.create_estimator(
      model_fn, model_dir, hparams, master=master)

  transcription_data = functools.partial(
      data_fn,
      examples=examples_path,
      preprocess_examples=preprocess_examples,
      is_training=False,
      shuffle_examples=shuffle_examples,
      skip_n_initial_records=0)

  input_fn = infer_util.labels_to_features_wrapper(transcription_data)

  start_time = time.time()
  infer_times = []
  num_frames = []

  file_num = 0

  all_metrics = collections.defaultdict(list)

  for predictions in estimator.predict(
      input_fn, checkpoint_path=checkpoint_path, yield_single_examples=False):

    # Remove batch dimension for convenience.
    for k in predictions.keys():
      if predictions[k].shape[0] != 1:
        raise ValueError('All predictions must have batch size 1, but shape '
                         'of {} was: {}'.format(k, predictions[k].shape[0]))
      predictions[k] = predictions[k][0]

    end_time = time.time()
    infer_time = end_time - start_time
    infer_times.append(infer_time)
    num_frames.append(predictions['frame_predictions'].shape[0])
    tf.logging.info(
        'Infer time %f, frames %d, frames/sec %f, running average %f',
        infer_time, num_frames[-1], num_frames[-1] / infer_time,
        np.sum(num_frames) / np.sum(infer_times))

    tf.logging.info('Scoring sequence %s', predictions['sequence_ids'])

    sequence_prediction = music_pb2.NoteSequence.FromString(
        predictions['sequence_predictions'])
    sequence_label = music_pb2.NoteSequence.FromString(
        predictions['sequence_labels'])

    # Make filenames UNIX-friendly.
    filename_chars = six.ensure_text(predictions['sequence_ids'], 'utf-8')
    filename_chars = [c if c.isalnum() else '_' for c in filename_chars]
    filename_safe = ''.join(filename_chars).rstrip()
    filename_safe = '{:04d}_{}'.format(file_num, filename_safe[:200])
    file_num += 1
    output_file = os.path.join(output_dir, filename_safe + '.mid')
    tf.logging.info('Writing inferred midi file to %s', output_file)
    midi_io.sequence_proto_to_midi_file(sequence_prediction, output_file)

    label_output_file = os.path.join(output_dir, filename_safe + '_label.mid')
    tf.logging.info('Writing label midi file to %s', label_output_file)
    midi_io.sequence_proto_to_midi_file(sequence_label, label_output_file)

    # Also write a pianoroll showing acoustic model output vs labels.
    pianoroll_output_file = os.path.join(output_dir,
                                         filename_safe + '_pianoroll.png')
    tf.logging.info('Writing acoustic logit/label file to %s',
                    pianoroll_output_file)
    # Calculate frames based on the sequence. Includes any postprocessing done
    # to turn raw onsets/frames predictions into the final sequence.
    # TODO(fjord): This work is duplicated in metrics.py.
    sequence_frame_predictions = sequences_lib.sequence_to_pianoroll(
        sequence_prediction,
        frames_per_second=data.hparams_frames_per_second(hparams),
        min_pitch=constants.MIN_MIDI_PITCH,
        max_pitch=constants.MAX_MIDI_PITCH).active
    with tf.gfile.GFile(pianoroll_output_file, mode='w') as f:
      imageio.imwrite(
          f,
          infer_util.posterior_pianoroll_image(
              predictions['onset_probs'], predictions['onset_labels'],
              predictions['frame_probs'], predictions['frame_labels'],
              sequence_frame_predictions),
          format='png')

    # Update histogram and current scalar for metrics.
    with tf.Graph().as_default(), tf.Session().as_default():
      for k, v in predictions.items():
        if not k.startswith('metrics/'):
          continue
        all_metrics[k].extend(v)
        histogram_name = k + '_histogram'
        metric_summary = tf.summary.histogram(histogram_name, all_metrics[k])
        summary_writer.add_summary(metric_summary.eval(),
                                   global_step=file_num)
        scalar_name = k
        metric_summary = tf.summary.scalar(scalar_name,
                                           np.mean(all_metrics[k]))
        summary_writer.add_summary(metric_summary.eval(),
                                   global_step=file_num)
      summary_writer.flush()

    start_time = time.time()

  # Write final mean values for all metrics.
  with tf.Graph().as_default(), tf.Session().as_default():
    for k, v in all_metrics.items():
      final_scalar_name = 'final/' + k
      metric_summary = tf.summary.scalar(final_scalar_name,
                                         np.mean(all_metrics[k]))
      summary_writer.add_summary(metric_summary.eval())
    summary_writer.flush()
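The pad-or-truncate block recurs in nearly every function above; a small shared helper, sketched here under the same conventions (the name is hypothetical, not from the original code), would remove that duplication:

import numpy as np

def _pad_or_truncate_frames(frame_predictions, num_label_frames):
  """Match predicted frames to the label length along axis 0."""
  num_pred_frames = frame_predictions.shape[0]
  if num_pred_frames < num_label_frames:
    # Pad transcribed frames with silence.
    pad_length = num_label_frames - num_pred_frames
    return np.pad(frame_predictions, [(0, pad_length), (0, 0)], 'constant')
  # Truncate transcribed frames (a no-op slice when lengths already match).
  return frame_predictions[:num_label_frames, :]

Each call site would then reduce to, e.g., frame_predictions = _pad_or_truncate_frames(frame_predictions, frame_labels.shape[0]).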