def make_midi_cqt(midi_filename, piano, chroma, midi_info=None):
    '''
    Compute a beat-synchronized spectral representation of a MIDI file (a
    piano-roll CQT, a chromagram, or a synthesized-audio CQT, depending on
    the piano/chroma flags), cache it as a .npy file, and return it.
    '''
    if midi_info is None:
        midi_info = pretty_midi.PrettyMIDI(midi_filename)
    if piano:
        print "Generating CQT from piano roll"
        midi_gram = align_midi.midi_to_piano_cqt(midi_info)
        midi_beats, bpm = align_midi.midi_beat_track(midi_info)
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
        np.save(to_piano_cqt_npy(midi_filename), midi_gram)
        return midi_gram
    elif chroma:
        chroma_gram = align_midi.midi_to_chroma(midi_info)
        midi_beats, bpm = align_midi.midi_beat_track(midi_info)
        chroma_gram = align_midi.post_process_cqt(chroma_gram, midi_beats)
        np.save(to_chroma_npy(midi_filename), chroma_gram)
        return chroma_gram
    else:
        # Synthesize the MIDI and compute its CQT
        midi_gram = align_midi.midi_to_cqt(midi_info, SF2_PATH)
        # Get beats
        midi_beats, bpm = align_midi.midi_beat_track(midi_info)
        # Beat synchronize and normalize
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
        np.save(to_cqt_npy(midi_filename), midi_gram)
        return midi_gram
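# Illustrative usage sketch, not part of the original pipeline: it assumes
# the module-level globals used throughout this file (SF2_PATH and the
# to_*_npy cache-path helpers) are configured, and 'data/midi/song.mid' is a
# hypothetical stand-in path.
def _example_make_midi_cqt():
    # Plain synthesized-audio CQT; the piano-roll and chroma variants are
    # selected with the boolean flags instead
    gram = make_midi_cqt('data/midi/song.mid', piano=False, chroma=False)
    # post_process_cqt beat-synchronizes, so there is one column per beat
    print gram.shape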
def align_one_file(mp3_filename, midi_filename, output_midi_filename,
                   output_diagnostics=True, interval=0):
    """
    Helper function for aligning a MIDI file to an audio file.

    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.
            If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment
            results, and a .mp3 of the audio mixed with the synthesized
            aligned audio.
        - interval : int
            Number of CQT bins by which to transpose the MIDI gram before
            aligning; 0 (the default) means no transposition.
    """
    # Load in the corresponding MIDI file, and return if there is a problem
    # loading it
    try:
        m = pretty_midi.PrettyMIDI(midi.read_midifile(midi_filename))
    except:
        print "Error loading {}".format(midi_filename)
        return
    print "Aligning {}".format(os.path.split(midi_filename)[1])
    # Cache audio CQT and onset strength
    audio, fs = librosa.load(mp3_filename)
    if (use_mp3_data and os.path.exists(to_cqt_npy(mp3_filename))
            and os.path.exists(to_onset_strength_npy(mp3_filename))):
        print "Using pre-existing CQT and onset strength data for {}".format(
            os.path.split(mp3_filename)[1])
        # The audio CQT is just frame-wise power
        audio_gram = np.load(to_cqt_npy(mp3_filename))
        audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
    else:
        print "Creating CQT and onset strength signal for {}".format(
            os.path.split(mp3_filename)[1])
        audio_gram, audio_onset_strength = \
            align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
        np.save(to_cqt_npy(mp3_filename), audio_gram)
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
    print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
    # Generate synthetic MIDI CQT
    if piano:
        midi_gram = align_midi.midi_to_piano_cqt(m)
        midi_beats, bpm = align_midi.midi_beat_track(m)
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
    else:
        midi_gram = align_midi.midi_to_cqt(m, SF2_PATH)
        # Get beats
        midi_beats, bpm = align_midi.midi_beat_track(m)
        # Beat synchronize and normalize
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
    # Optionally transpose the MIDI gram before aligning
    if interval != 0:
        midi_gram = shift_cqt(midi_gram, interval)
    # Compute beats; track at a finer hop (512/4), then divide by 4 to
    # express the beats in CQT-frame units
    midi_beats, bpm = align_midi.midi_beat_track(m)
    audio_beats = librosa.beat.beat_track(onset_envelope=audio_onset_strength,
                                          hop_length=512 / 4, bpm=bpm)[1] / 4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)
    # Plot log-fs grams
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title("MIDI Synthesized")
    librosa.display.specshow(midi_gram, x_axis="frames", y_axis="cqt_note",
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title("Audio data")
    librosa.display.specshow(audio_gram, x_axis="frames", y_axis="cqt_note",
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(
        midi_gram.T, audio_gram.T, metric="cosine")
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix)
    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title("Distance at each point on lowest-cost path")
    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T, aspect="auto", interpolation="nearest",
               cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, "r.", ms=0.2)
    plt.axis(tight)
    plt.title("Similarity matrix and lowest-cost path, cost={}".format(score))
    # Adjust MIDI timing
    m_aligned = align_midi.adjust_midi(
        m, librosa.frames_to_time(midi_beats)[p],
        librosa.frames_to_time(audio_beats)[q])
    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments
                         for note in instrument.notes])
    aligned_note_ons = np.array([note.start
                                 for instrument in m_aligned.instruments
                                 for note in instrument.notes])
    plt.plot(note_ons, aligned_note_ons - note_ons, ".")
    plt.xlabel("Original note location (s)")
    plt.ylabel("Shift (s)")
    plt.title("Corrected offset")
    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)
    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace(".mid", ".pdf"))
        if write_mp3:
            # Load in the audio data (needed for writing out)
            audio, fs = librosa.load(mp3_filename, sr=None)
            # Synthesize the aligned MIDI
            midi_audio_aligned = m_aligned.fluidsynth(fs=fs,
                                                      sf2_path=SF2_PATH)
            # Trim or zero-pad to the same length as the audio
            if midi_audio_aligned.shape[0] > audio.shape[0]:
                midi_audio_aligned = midi_audio_aligned[:audio.shape[0]]
            else:
                midi_audio_aligned = np.append(
                    midi_audio_aligned,
                    np.zeros(audio.shape[0] - midi_audio_aligned.shape[0]))
            # Write out to temporary .wav file
            librosa.output.write_wav(
                output_midi_filename.replace(".mid", ".wav"),
                np.vstack([midi_audio_aligned, audio]).T, fs)
            # Convert to .mp3
            subprocess.check_output(
                ["ffmpeg", "-i",
                 output_midi_filename.replace(".mid", ".wav"),
                 "-ab", "128k", "-y",
                 output_midi_filename.replace(".mid", ".mp3")])
            # Remove temporary .wav file
            os.remove(output_midi_filename.replace(".mid", ".wav"))
        # Save a .mat of the results
        scipy.io.savemat(
            output_midi_filename.replace(".mid", ".mat"),
            {"similarity_matrix": similarity_matrix,
             "p": p, "q": q, "score": score})
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
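# shift_cqt is called above but is not defined in this section. A minimal
# sketch of one plausible implementation, under the assumptions that
# 'interval' counts CQT bins (one bin per semitone here) and that bins
# shifted in from the edge should be zeroed rather than wrapped:
def shift_cqt(gram, interval):
    # Move every frequency bin up (positive interval) or down (negative)
    shifted = np.roll(gram, interval, axis=0)
    # Zero the bins that wrapped around the edge
    if interval > 0:
        shifted[:interval, :] = 0
    elif interval < 0:
        shifted[interval:, :] = 0
    return shifted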
def align_one_file(mp3_filename, midi_filename, output_midi_filename,
                   output_diagnostics=True):
    '''
    Helper function for aligning a MIDI file to an audio file.

    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.
            If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment
            results, and a .ogg of the audio mixed with the synthesized
            aligned audio.
    '''
    # Load in the corresponding MIDI file, and return if there is a problem
    # loading it
    try:
        m = pretty_midi.PrettyMIDI(midi.read_midifile(midi_filename))
    except:
        print "Error loading {}".format(midi_filename)
        return
    print "Aligning {}".format(os.path.split(midi_filename)[1])
    # Cache audio CQT and onset strength
    if (not os.path.exists(to_onset_strength_npy(mp3_filename))
            or not os.path.exists(to_cqt_npy(mp3_filename))):
        print "Creating CQT and onset strength signal for {}".format(
            os.path.split(mp3_filename)[1])
        # Don't need to load in audio multiple times
        audio, fs = librosa.load(mp3_filename)
        # Create audio CQT, which is just frame-wise power, and onset strength
        audio_gram, audio_onset_strength = \
            align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
        # Write out
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
        np.save(to_cqt_npy(mp3_filename), audio_gram)
    # Cache MIDI CQT
    if not os.path.exists(to_cqt_npy(midi_filename)):
        print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
        # Generate synthetic MIDI CQT
        midi_gram = align_midi.midi_to_cqt(m, SF2_PATH)
        # Get beats
        midi_beats, bpm = align_midi.midi_beat_track(m)
        # Beat synchronize and normalize
        midi_gram = align_midi.post_process_cqt(midi_gram, midi_beats)
        # Write out
        np.save(to_cqt_npy(midi_filename), midi_gram)
    # Load in CQTs
    audio_gram = np.load(to_cqt_npy(mp3_filename))
    midi_gram = np.load(to_cqt_npy(midi_filename))
    # ...and the audio onset strength signal
    audio_onset_strength = np.load(to_onset_strength_npy(mp3_filename))
    # Compute beats; track at a finer hop (512/4), then divide by 4 to
    # express the beats in CQT-frame units
    midi_beats, bpm = align_midi.midi_beat_track(m)
    audio_beats = librosa.beat.beat_track(onset_envelope=audio_onset_strength,
                                          hop_length=512 / 4, bpm=bpm)[1] / 4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)
    # Plot log-fs grams
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title('MIDI Synthesized')
    librosa.display.specshow(midi_gram, x_axis='frames', y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title('Audio data')
    librosa.display.specshow(audio_gram, x_axis='frames', y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(
        midi_gram.T, audio_gram.T, metric='cosine')
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix)
    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title('Distance at each point on lowest-cost path')
    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T, aspect='auto', interpolation='nearest',
               cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, 'r.', ms=.2)
    plt.axis(tight)
    plt.title('Similarity matrix and lowest-cost path, cost={}'.format(score))
    # Adjust MIDI timing
    m_aligned = align_midi.adjust_midi(
        m, librosa.frames_to_time(midi_beats)[p],
        librosa.frames_to_time(audio_beats)[q])
    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments
                         for note in instrument.notes])
    aligned_note_ons = np.array([note.start
                                 for instrument in m_aligned.instruments
                                 for note in instrument.notes])
    plt.plot(note_ons, aligned_note_ons - note_ons, '.')
    plt.xlabel('Original note location (s)')
    plt.ylabel('Shift (s)')
    plt.title('Corrected offset')
    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)
    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace('.mid', '.pdf'))
        # Load in the audio data (needed for writing out)
        audio, fs = librosa.load(mp3_filename, sr=None)
        # Synthesize the aligned MIDI
        midi_audio_aligned = m_aligned.fluidsynth(fs=fs, sf2_path=SF2_PATH)
        # Trim or zero-pad to the same length as the audio
        if midi_audio_aligned.shape[0] > audio.shape[0]:
            midi_audio_aligned = midi_audio_aligned[:audio.shape[0]]
        else:
            midi_audio_aligned = np.append(
                midi_audio_aligned,
                np.zeros(audio.shape[0] - midi_audio_aligned.shape[0]))
        # Write out to temporary .wav file
        librosa.output.write_wav(output_midi_filename.replace('.mid', '.wav'),
                                 np.vstack([midi_audio_aligned, audio]).T, fs)
        # Convert to .ogg
        subprocess.check_output([
            'ffmpeg', '-i', output_midi_filename.replace('.mid', '.wav'),
            '-acodec', 'libvorbis', '-aq', '0',
            output_midi_filename.replace('.mid', '.ogg')])
        # Remove temporary .wav file
        os.remove(output_midi_filename.replace('.mid', '.wav'))
        # Save a .mat of the results
        scipy.io.savemat(output_midi_filename.replace('.mid', '.mat'),
                         {'similarity_matrix': similarity_matrix,
                          'p': p, 'q': q, 'score': score})
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
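# align_midi.dpmod is defined elsewhere. As a rough, hedged reference for the
# kind of dynamic-programming path search it performs over the cosine-distance
# matrix, here is a plain DTW sketch; it is not the author's dpmod (which also
# takes 'experimental'/'forceH' options and may normalize differently):
def _dtw_path_sketch(D):
    n, m = D.shape
    # Accumulated cost with the standard three-way recurrence
    C = np.zeros((n, m))
    C[0, :] = np.cumsum(D[0, :])
    C[:, 0] = np.cumsum(D[:, 0])
    for i in xrange(1, n):
        for j in xrange(1, m):
            C[i, j] = D[i, j] + min(C[i - 1, j], C[i, j - 1],
                                    C[i - 1, j - 1])
    # Trace back from the end to recover the lowest-cost path
    i, j = n - 1, m - 1
    path = [(i, j)]
    while i > 0 or j > 0:
        candidates = [C[i - 1, j] if i > 0 else np.inf,
                      C[i, j - 1] if j > 0 else np.inf,
                      C[i - 1, j - 1] if (i > 0 and j > 0) else np.inf]
        step = np.argmin(candidates)
        if step == 0:
            i -= 1
        elif step == 1:
            j -= 1
        else:
            i -= 1
            j -= 1
        path.append((i, j))
    path.reverse()
    p, q = zip(*path)
    # Return path indices and a length-normalized total cost
    return np.array(p), np.array(q), C[-1, -1] / len(path)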
def align_one_file(mp3_filename, midi_filename, output_midi_filename,
                   output_diagnostics=True):
    '''
    Helper function for aligning a MIDI file to an audio file.

    :parameters:
        - mp3_filename : str
            Full path to a .mp3 file.
        - midi_filename : str
            Full path to a .mid file.
        - output_midi_filename : str
            Full path to where the aligned .mid file should be written.
            If None, don't output.
        - output_diagnostics : bool
            If True, also output a .pdf of figures, a .mat of the alignment
            results, and a .mp3 of the audio mixed with the synthesized
            aligned audio.
    '''
    # Load in the corresponding MIDI file, and return if there is a problem
    # loading it
    try:
        m = pretty_midi.PrettyMIDI(midi_filename)
    except:
        print "Error loading {}".format(midi_filename)
        return
    print "Aligning {}".format(os.path.split(midi_filename)[1])
    # Check if the output path exists, and create it if necessary
    if not os.path.exists(os.path.split(output_midi_filename)[0]):
        os.makedirs(os.path.split(output_midi_filename)[0])
    audio, fs = librosa.load(mp3_filename)
    # Load cached audio features if allowed and present; otherwise compute
    # (and cache) them
    if use_prev_data:
        if chroma:
            if (os.path.exists(to_chroma_npy(mp3_filename)) and
                    os.path.exists(to_onset_strength_npy(mp3_filename))):
                audio_gram = np.load(to_chroma_npy(mp3_filename))
                audio_onset_strength = np.load(
                    to_onset_strength_npy(mp3_filename))
            else:
                print "Generating chroma features for {}".format(mp3_filename)
                audio_gram, audio_onset_strength = \
                    align_midi.audio_to_chroma_and_onset_strength(audio,
                                                                  fs=fs)
                np.save(to_chroma_npy(mp3_filename), audio_gram)
                np.save(to_onset_strength_npy(mp3_filename),
                        audio_onset_strength)
        else:
            if (os.path.exists(to_cqt_npy(mp3_filename)) and
                    os.path.exists(to_onset_strength_npy(mp3_filename))):
                print "Using pre-existing CQT and onset strength data for {}".format(
                    os.path.split(mp3_filename)[1])
                # The audio CQT is just frame-wise power
                audio_gram = np.load(to_cqt_npy(mp3_filename))
                audio_onset_strength = np.load(
                    to_onset_strength_npy(mp3_filename))
            else:
                print "Creating CQT and onset strength signal for {}".format(
                    os.path.split(mp3_filename)[1])
                audio_gram, audio_onset_strength = \
                    align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
                np.save(to_cqt_npy(mp3_filename), audio_gram)
                np.save(to_onset_strength_npy(mp3_filename),
                        audio_onset_strength)
    else:
        print "Creating CQT and onset strength signal for {}".format(
            os.path.split(mp3_filename)[1])
        if chroma:
            audio_gram, audio_onset_strength = \
                align_midi.audio_to_chroma_and_onset_strength(audio, fs=fs)
            np.save(to_chroma_npy(mp3_filename), audio_gram)
        else:
            audio_gram, audio_onset_strength = \
                align_midi.audio_to_cqt_and_onset_strength(audio, fs=fs)
            np.save(to_cqt_npy(mp3_filename), audio_gram)
        np.save(to_onset_strength_npy(mp3_filename), audio_onset_strength)
    # Load the cached MIDI gram if allowed and present; otherwise compute it
    if use_prev_data and not make_midi_info:
        if piano:
            cached_npy = to_piano_cqt_npy(midi_filename)
        elif chroma:
            cached_npy = to_chroma_npy(midi_filename)
        else:
            cached_npy = to_cqt_npy(midi_filename)
        if os.path.exists(cached_npy):
            midi_gram = np.load(cached_npy)
        else:
            print "Creating CQT for {}".format(
                os.path.split(midi_filename)[1])
            midi_gram = make_midi_cqt(midi_filename, piano, chroma, m)
    else:
        print "Creating CQT for {}".format(os.path.split(midi_filename)[1])
        # Generate synthetic MIDI CQT
        midi_gram = make_midi_cqt(midi_filename, piano, chroma, m)
    if piano:
        # Blur the piano roll and normalize columns so it better resembles
        # an audio CQT
        midi_gram = align_midi.piano_roll_fuzz(midi_gram)
        midi_gram = librosa.util.normalize(midi_gram, axis=0)
    # Compute beats; track at a finer hop (512/4), then divide by 4 to
    # express the beats in CQT-frame units
    midi_beats, bpm = align_midi.midi_beat_track(m)
    audio_beats = librosa.beat.beat_track(onset_envelope=audio_onset_strength,
                                          hop_length=512 / 4, bpm=bpm)[1] / 4
    # Beat-align and log/normalize the audio CQT
    audio_gram = align_midi.post_process_cqt(audio_gram, audio_beats)
    # Plot log-fs grams
    plt.figure(figsize=(36, 24))
    ax = plt.subplot2grid((4, 3), (0, 0), colspan=3)
    plt.title('MIDI Synthesized')
    librosa.display.specshow(midi_gram, x_axis='frames', y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    ax = plt.subplot2grid((4, 3), (1, 0), colspan=3)
    plt.title('Audio data')
    librosa.display.specshow(audio_gram, x_axis='frames', y_axis='cqt_note',
                             fmin=librosa.midi_to_hz(36),
                             fmax=librosa.midi_to_hz(96))
    # Get similarity matrix
    similarity_matrix = scipy.spatial.distance.cdist(
        midi_gram.T, audio_gram.T, metric='cosine')
    # Get best path through matrix
    p, q, score = align_midi.dpmod(similarity_matrix, experimental=False,
                                   forceH=False)
    # Plot distance at each point of the lowest-cost path
    ax = plt.subplot2grid((4, 3), (2, 0), rowspan=2)
    plt.plot([similarity_matrix[p_v, q_v] for p_v, q_v in zip(p, q)])
    plt.title('Distance at each point on lowest-cost path')
    # Plot similarity matrix and best path through it
    ax = plt.subplot2grid((4, 3), (2, 1), rowspan=2)
    plt.imshow(similarity_matrix.T, aspect='auto', interpolation='nearest',
               cmap=plt.cm.gray)
    tight = plt.axis()
    plt.plot(p, q, 'r.', ms=.2)
    plt.axis(tight)
    plt.title('Similarity matrix and lowest-cost path, cost={}'.format(score))
    # Adjust MIDI timing
    m_aligned = align_midi.adjust_midi(
        m, librosa.frames_to_time(midi_beats)[p],
        librosa.frames_to_time(audio_beats)[q])
    # Plot alignment
    ax = plt.subplot2grid((4, 3), (2, 2), rowspan=2)
    note_ons = np.array([note.start for instrument in m.instruments
                         for note in instrument.notes])
    aligned_note_ons = np.array([note.start
                                 for instrument in m_aligned.instruments
                                 for note in instrument.notes])
    plt.plot(note_ons, aligned_note_ons - note_ons, '.')
    plt.xlabel('Original note location (s)')
    plt.ylabel('Shift (s)')
    plt.title('Corrected offset')
    # Write out the aligned file
    if output_midi_filename is not None:
        m_aligned.write(output_midi_filename)
    if output_diagnostics:
        # Save the figures
        plt.savefig(output_midi_filename.replace('.mid', '.pdf'))
        if write_mp3:
            # Load in the audio data (needed for writing out)
            audio, fs = librosa.load(mp3_filename, sr=None)
            # Synthesize the aligned MIDI
            midi_audio_aligned = m_aligned.fluidsynth(fs=fs,
                                                      sf2_path=SF2_PATH)
            # Trim or zero-pad to the same length as the audio
            if midi_audio_aligned.shape[0] > audio.shape[0]:
                midi_audio_aligned = midi_audio_aligned[:audio.shape[0]]
            else:
                midi_audio_aligned = np.append(
                    midi_audio_aligned,
                    np.zeros(audio.shape[0] - midi_audio_aligned.shape[0]))
            # Write out to temporary .wav file
            librosa.output.write_wav(
                output_midi_filename.replace('.mid', '.wav'),
                np.vstack([midi_audio_aligned, audio]).T, fs)
            # Convert to .mp3
            subprocess.check_output([
                'ffmpeg', '-i',
                output_midi_filename.replace('.mid', '.wav'),
                '-ab', '128k', '-y',
                output_midi_filename.replace('.mid', '.mp3')])
            # Remove temporary .wav file
            os.remove(output_midi_filename.replace('.mid', '.wav'))
        # Save a .mat of the results
        scipy.io.savemat(output_midi_filename.replace('.mid', '.mat'),
                         {'similarity_matrix': similarity_matrix,
                          'p': p, 'q': q, 'score': score})
    # If we aren't outputting a .pdf, show the plot
    else:
        plt.show()
    plt.close()
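# Hedged driver sketch showing how align_one_file might be invoked over a
# directory of (mp3, mid) pairs. The 'data/...' layout and glob pattern are
# illustrative assumptions, not paths from the original project:
def _example_batch_align():
    import glob
    for midi_filename in glob.glob('data/midi/*.mid'):
        base = os.path.splitext(os.path.split(midi_filename)[1])[0]
        mp3_filename = os.path.join('data', 'mp3', base + '.mp3')
        output_midi_filename = os.path.join('data', 'aligned', base + '.mid')
        # Skip MIDI files with no matching audio
        if not os.path.exists(mp3_filename):
            continue
        align_one_file(mp3_filename, midi_filename, output_midi_filename,
                       output_diagnostics=True)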