def evaluate_clustering(x, clusters):
    """
    Returns the root mean square of the differences between array points
    and their closest clusters.

    `x` is converted with `_np.asarray`; every element obtained by iterating
    over it is matched to a cluster value through `_mh.find_nearest_value`.
    The accumulated squared differences are divided by `x.size` before the
    square root is taken.
    """
    points = _np.asarray(x)
    # One nearest-value lookup per point; fine as long as there are few clusters.
    total_squared_error = sum(
        (value - _mh.find_nearest_value(clusters, value)) ** 2
        for value in points
    )
    return _np.sqrt(total_squared_error / points.size)
def evaluate_clustering(x, clusters):
    """
    Returns the root mean square of the differences between array points
    and their closest clusters.

    Each element produced by iterating over `x` (after `_np.asarray`) is
    compared against the cluster value chosen by `_mh.find_nearest_value`;
    the squared differences are summed, divided by `x.size` and square-rooted.
    """
    samples = _np.asarray(x)
    # A plain Python pass over the points: acceptable while clusters are few.
    squared_residuals = [
        (sample - _mh.find_nearest_value(clusters, sample)) ** 2
        for sample in samples
    ]
    mean_squared_error = sum(squared_residuals) / samples.size
    return _np.sqrt(mean_squared_error)
def plot_tracking(audiopath, title="", binsize=1470, tune=False, plotpath=None, repetitions=10):
    """
    Plots the HPS tracking of an audio file.

    The samples returned by `_sf.readfile(audiopath)` are cut into consecutive
    bins of `binsize` samples (a trailing partial bin is ignored) and
    `_hps.hps` is run on each bin. When `tune` is true each detection is
    replaced by the nearest value in `_mt.notes`. Every detection is drawn
    `repetitions` times in a row so that the curve looks like a staircase.

    The figure is saved to `plotpath` when it is given, otherwise it is shown
    interactively. The current figure is cleared afterwards in both cases.
    """
    samplerate, samples = _sf.readfile(audiopath)
    detections = samples.size // binsize

    # One slot per bin. The original never stored the detected value, which
    # left the plotted curve at zero everywhere.
    pitches = _np.zeros(detections)
    for i in range(detections):
        f = _hps.hps(samples[i * binsize:(i + 1) * binsize])
        if tune:
            f = _mh.find_nearest_value(_mt.notes, f)
        pitches[i] = f
    p = _np.repeat(pitches, repetitions)

    _pl.plot(p)
    _pl.title(title)

    # The x axis is in "repeated detection" units, so it spans
    # repetitions * detections points (previously a hard-coded 10).
    xlocs = _np.linspace(0, repetitions * detections, 5)
    _pl.xlabel("Time (s)")
    _pl.xlim([0, _np.max(xlocs)])
    # Tick position -> seconds: divide by repetitions to get the bin index,
    # then scale by binsize / samplerate.
    _pl.xticks(xlocs, [
        "%.2f" % l
        for l in _np.multiply(xlocs, binsize / (repetitions * samplerate))
    ])

    _pl.ylabel("Fundamental Frequency (Hz)")
    _pl.ylim((0.9 * _np.min(p), 1.05 * _np.max(p)))

    if plotpath:
        _pl.savefig(plotpath, bbox_inches="tight")
    else:
        _pl.show()
    _pl.clf()
def finalize(self):
    """
    Finalizes the transcriber.

    Will close resources, perform calculations and normalizations based on
    the whole transcription (e.g.: normalize note duration) then writes the
    transcription to the desired outputs.

    Steps, in order:
      1. `self.close()` is called.
      2. The note still being tracked is appended to `self.notes` when it
         lasted more than 2 ticks.
      3. The log2 durations are clustered with
         `clst.equidistant_clusterize` and every note is snapped to its
         nearest cluster (duration becomes 2 ** cluster value).
      4. The most frequent corrected duration is used to derive a tempo,
         which is doubled/halved until it lies within [80, 220].
      5. If WRITE_MIDI or WRITE_XML is set, a music21 stream is built
         (tempo, 4/4 time signature, notes, slurs, analyzed key) and
         exported.
      6. If WRITE_OUT is set, `self.out` is written to OUT_FILENAME.
    """
    self.close()

    # Extract the last note.
    if self.current_ticks > 2:
        self.notes.append({
            "name": self.current_note,
            "duration": np.log2(self.current_ticks),
            "ticks": self.current_ticks,
            "slur": "stop" if self.currently_slurring else False
        })
        if DEBUG_NOTE:
            print("%s\t %d\t %.3fs" %
                  (self.current_note, self.current_ticks,
                   self.current_ticks / self.blocks_per_sec))

    print("\n\n###### Detected notes:")
    for note in self.notes:
        print(note)

    # NOTE(review): nothing below guards against an empty self.notes;
    # confirm how clst.equidistant_clusterize behaves in that case.
    durations = np.array([n["duration"] for n in self.notes])
    clusters = clst.equidistant_clusterize(durations)
    corrected_notes = [{
        "name": n["name"],
        "duration": 2**mh.find_nearest_value(clusters, n["duration"]),
        "slur": n["slur"]
    } for n in self.notes]

    print("\n\n###### Corrected notes:")
    for note in corrected_notes:
        print(note)

    # Most frequent corrected duration. np.unique returns sorted values, and
    # argmax picks the first maximum, so ties resolve to the smallest value
    # (the same tie-break scipy.stats.mode uses). Indexing the scipy result
    # with [0][0] breaks on SciPy releases that return a scalar mode.
    unique_durations, counts = np.unique(
        [n["duration"] for n in corrected_notes], return_counts=True)
    most_common = unique_durations[np.argmax(counts)]

    tempo = int(round(60 * self.blocks_per_sec / most_common, 0))
    # A tempo that rounds down to 0 would never leave the doubling loop.
    if tempo < 1:
        tempo = 1
    while tempo < 80:
        tempo *= 2
        most_common /= 2
    while tempo > 220:
        tempo /= 2
        most_common *= 2

    if WRITE_MIDI or WRITE_XML:
        s = music21.stream.Stream()
        s.append(music21.tempo.MetronomeMark(number=tempo))
        s.append(music21.meter.TimeSignature('4/4'))

        for note in corrected_notes:
            n = music21.note.Note()
            n.pitch.name = note["name"]
            # Durations are expressed relative to the most common one.
            n.duration.quarterLength = note["duration"] / most_common
            s.append(n)
            note["music21"] = n

        # Group consecutive notes from a "start" up to and including the
        # matching "stop" under a single slur spanner.
        slurring = False
        slur = music21.spanner.Slur()
        for note in corrected_notes:
            if note["slur"] == "start":
                slurring = True
            if slurring:
                slur.addSpannedElements([note["music21"]])
            if note["slur"] == "stop":
                slurring = False
                s.insert(0, slur)
                slur = music21.spanner.Slur()
        # A slur that was started but never stopped is still emitted.
        if slurring:
            s.insert(0, slur)

        s.insert(0, s.analyze('key'))

        if WRITE_MIDI:
            sf.write_m21stream_to_midi(s, MIDI_FILENAME)
        if WRITE_XML:
            s.show('musicxml')

    if WRITE_OUT:
        print("### Writing processed output file")
        # The context manager closes the handle even if the write fails.
        with open(OUT_FILENAME, 'w') as f:
            f.write(self.out)
            f.flush()
            os.fsync(f.fileno())
def update(self):
    """
    Performs a transcription iteration update, i.e. wait for microphone data
    to be available, then update the transcriber state (detect pitch, note
    duration, etc).

    Steps, in order:
      1. Read new samples from `self.mic.listen()` and compute their RMS.
      2. Feed the new samples to the tonguing detector; on tonguing, the
         current note is committed (if longer than 2 ticks) and the tick
         counter is reset.
      3. Push the new samples into the analysis block `self.block`.
      4. Unless DEBUG_NOISE is set, stop here when the RMS is below
         `self.noise_threshold`.
      5. Detect the pitch with `pda.hps.hps`, snap it to the nearest value
         in `mt.notes` and update the note/tick state machine.

    Raises AssertionError when `self.tong` is falsy.
    """
    if not self.tong:
        raise AssertionError(
            "Please initialize the tonguing detector first. (missing a call to detect_noise()?)"
        )

    self.total_ticks += 1
    new_samples = self.mic.listen()

    if DEBUG_PERF:
        rms_start_time = time.time()
    rms = np.sqrt(np.mean(np.square(new_samples)))
    if DEBUG_PERF:
        self.rms_time = time.time() - rms_start_time
        tong_start_time = time.time()

    # Feed the new samples to the Tonguing Detector.
    # Beware we shouldn't send repeated samples, so we send the new_samples and not the entire block.
    tongued = self.tong.feed(new_samples)
    if DEBUG_PERF:
        self.tong_time = time.time() - tong_start_time

    if tongued:
        if self.current_ticks > 2:
            # We detected tonguing, so split the current note.
            # TODO: if 'previous_note' is considered noisy, account for it in the duration.
            # TODO: Consider whether we should increment the current tick partially (proportionally to the audible portion?).
            self.notes.append({
                "name": self.current_note,
                "duration": np.log2(self.current_ticks),
                "ticks": self.current_ticks,
                "slur": "stop" if self.currently_slurring else False
            })
            self.currently_slurring = False
            if DEBUG_NOTE:
                print("%s\t %d\t %.3fs" %
                      (self.current_note, self.current_ticks,
                       self.current_ticks / self.blocks_per_sec))
        if DEBUG_TONG:
            print("TONG")
        if WRITE_OUT:
            self.out += "%d\t: TONG\n" % self.total_ticks
        self.current_ticks = 0

    # Add the new_samples at the beginning of the block, so they replace the oldest values.
    self.block[0:self.samples_per_read] = new_samples
    # Now roll the block back so that it is in chronological order.
    # Not strictly necessary as we're discarding phase, but it makes replacing old values easier and also ensures
    # usual windowing will smooth discontinuities at the borders.
    self.block = np.roll(self.block, -self.samples_per_read)

    # No need to proceed if we're to discard the pitch due to insufficient RMS power in the block.
    if not DEBUG_NOISE and rms < self.noise_threshold:
        # Only accumulate the log when it will actually be written out, like
        # every other write to self.out; otherwise it grows without bound.
        if WRITE_OUT:
            self.out += "%d\t: 'rms < self.noise_threshold'\n" % self.total_ticks
        return

    # We want pitch, so pass the block to the PDA
    if DEBUG_PERF:
        hps_start_time = time.time()
    perceived_f = pda.hps.hps(self.block,
                              fs=self.rate,
                              harmonics=3,
                              precision=2)
    if DEBUG_PERF:
        self.hps_time = time.time() - hps_start_time

    # and tune the pitch down to a known note.
    tuned_f = mh.find_nearest_value(mt.notes, perceived_f)
    note = mt.note_name[tuned_f]

    # TODO: rough error percentage estimate
    error = perceived_f - tuned_f
    percentage = np.sign(error) * 2 * error / (
        tuned_f * (1 + mt.semitone) if error > 0 else tuned_f *
        (1 - mt.semitone))

    if note == self.current_note:
        if self.current_note == self.previous_note:
            # We're receiving a new sample of the current note.
            self.current_ticks += 1
        else:
            # Our 'previous_note' measurement was probably noisy.
            # Pretend it was a measurement of 'current_note', and account ticks for both.
            self.current_ticks += 2
    elif note == self.previous_note:
        # Keep in mind that all notes are 'tentative' until their tick count is > n, so:
        # - C5 C5 C5 D5 D5 means we successfully identified a C5 and the beginning of a D5, assuming n is 1.
        if self.current_ticks > 2:
            # The note changed without tonguing, so it is part of a slur.
            self.notes.append({
                "name": self.current_note,
                "duration": np.log2(self.current_ticks),
                "ticks": self.current_ticks,
                "slur": "continue" if self.currently_slurring else "start"
            })
            self.currently_slurring = True
            if DEBUG_NOTE:
                print("%s\t %d\t %.3fs" %
                      (self.current_note, self.current_ticks,
                       self.current_ticks / self.blocks_per_sec))
        # Two identical detections in a row: the new note starts with 2 ticks.
        self.current_note = note
        self.current_ticks = 2
    elif self.previous_note != self.current_note:
        # Experimentally, it's pretty rare to have 2 noisy detections in a row, so if we find 2 different measurements
        # we can assume the old note has ended.
        # - C5 C5 C5 D5 E5 means we identified a C5 end, but we don't know the next note yet.
        if self.current_ticks > 2:
            self.notes.append({
                "name": self.current_note,
                "duration": np.log2(self.current_ticks),
                "ticks": self.current_ticks,
                "slur": "continue" if self.currently_slurring else False
            })
            if DEBUG_NOTE:
                print("%s\t %d\t %.3fs" %
                      (self.current_note, self.current_ticks,
                       self.current_ticks / self.blocks_per_sec))
        # We currently have no idea of the note being played, so assign an error string to it.
        # When we have k identical detections in a row (with k defined in the elifs above) we will successfully
        # assign the current note.
        self.current_note = "NOISE_ERR"
        self.current_ticks = 0

    self.previous_note = note

    if DEBUG_TICK:
        print("%s\t (%.3f)\t@ %.2f" % (note, percentage, rms))
    if WRITE_OUT:
        self.out += "%d\t: %s\t (%.3f)\t@ %.2f\r\n" % (
            self.total_ticks, note, percentage, rms)
def finalize(self):
    """
    Finalizes the transcriber.

    Will close resources, perform calculations and normalizations based on
    the whole transcription (e.g.: normalize note duration) then writes the
    transcription to the desired outputs.

    Steps, in order:
      1. `self.close()` is called.
      2. The note still being tracked is appended to `self.notes` when it
         lasted more than 2 ticks.
      3. The log2 durations are clustered with
         `clst.equidistant_clusterize` and every note is snapped to its
         nearest cluster (duration becomes 2 ** cluster value).
      4. The most frequent corrected duration is used to derive a tempo,
         which is doubled/halved until it lies within [80, 220].
      5. If WRITE_MIDI or WRITE_XML is set, a music21 stream is built
         (tempo, 4/4 time signature, notes, slurs, analyzed key) and
         exported.
      6. If WRITE_OUT is set, `self.out` is written to OUT_FILENAME.
    """
    self.close()

    # Extract the last note.
    if self.current_ticks > 2:
        self.notes.append({"name": self.current_note,
                           "duration": np.log2(self.current_ticks),
                           "ticks": self.current_ticks,
                           "slur": "stop" if self.currently_slurring else False})
        if DEBUG_NOTE:
            print("%s\t %d\t %.3fs" % (self.current_note, self.current_ticks,
                                       self.current_ticks / self.blocks_per_sec))

    print("\n\n###### Detected notes:")
    for note in self.notes:
        print(note)

    # NOTE(review): nothing below guards against an empty self.notes;
    # confirm how clst.equidistant_clusterize behaves in that case.
    durations = np.array([n["duration"] for n in self.notes])
    clusters = clst.equidistant_clusterize(durations)
    corrected_notes = [{"name": n["name"],
                        "duration": 2**mh.find_nearest_value(clusters, n["duration"]),
                        "slur": n["slur"]}
                       for n in self.notes]

    print("\n\n###### Corrected notes:")
    for note in corrected_notes:
        print(note)

    # Most frequent corrected duration. np.unique returns sorted values, and
    # argmax picks the first maximum, so ties resolve to the smallest value
    # (the same tie-break scipy.stats.mode uses). Indexing the scipy result
    # with [0][0] breaks on SciPy releases that return a scalar mode.
    unique_durations, counts = np.unique(
        [n["duration"] for n in corrected_notes], return_counts=True)
    most_common = unique_durations[np.argmax(counts)]

    tempo = int(round(60 * self.blocks_per_sec / most_common, 0))
    # A tempo that rounds down to 0 would never leave the doubling loop.
    if tempo < 1:
        tempo = 1
    while tempo < 80:
        tempo *= 2
        most_common /= 2
    while tempo > 220:
        tempo /= 2
        most_common *= 2

    if WRITE_MIDI or WRITE_XML:
        s = music21.stream.Stream()
        s.append(music21.tempo.MetronomeMark(number=tempo))
        s.append(music21.meter.TimeSignature('4/4'))

        for note in corrected_notes:
            n = music21.note.Note()
            n.pitch.name = note["name"]
            # Durations are expressed relative to the most common one.
            n.duration.quarterLength = note["duration"] / most_common
            s.append(n)
            note["music21"] = n

        # Group consecutive notes from a "start" up to and including the
        # matching "stop" under a single slur spanner.
        slurring = False
        slur = music21.spanner.Slur()
        for note in corrected_notes:
            if note["slur"] == "start":
                slurring = True
            if slurring:
                slur.addSpannedElements([note["music21"]])
            if note["slur"] == "stop":
                slurring = False
                s.insert(0, slur)
                slur = music21.spanner.Slur()
        # A slur that was started but never stopped is still emitted.
        if slurring:
            s.insert(0, slur)

        s.insert(0, s.analyze('key'))

        if WRITE_MIDI:
            sf.write_m21stream_to_midi(s, MIDI_FILENAME)
        if WRITE_XML:
            s.show('musicxml')

    if WRITE_OUT:
        print("### Writing processed output file")
        # The context manager closes the handle even if the write fails.
        with open(OUT_FILENAME, 'w') as f:
            f.write(self.out)
            f.flush()
            os.fsync(f.fileno())
def update(self):
    """
    Performs a transcription iteration update, i.e. wait for microphone data
    to be available, then update the transcriber state (detect pitch, note
    duration, etc).

    Steps, in order:
      1. Read new samples from `self.mic.listen()` and compute their RMS.
      2. Feed the new samples to the tonguing detector; on tonguing, the
         current note is committed (if longer than 2 ticks) and the tick
         counter is reset.
      3. Push the new samples into the analysis block `self.block`.
      4. Unless DEBUG_NOISE is set, stop here when the RMS is below
         `self.noise_threshold`.
      5. Detect the pitch with `pda.hps.hps`, snap it to the nearest value
         in `mt.notes` and update the note/tick state machine.

    Raises AssertionError when `self.tong` is falsy.
    """
    if not self.tong:
        raise AssertionError("Please initialize the tonguing detector first. (missing a call to detect_noise()?)")

    self.total_ticks += 1
    new_samples = self.mic.listen()

    if DEBUG_PERF:
        rms_start_time = time.time()
    rms = np.sqrt(np.mean(np.square(new_samples)))
    if DEBUG_PERF:
        self.rms_time = time.time() - rms_start_time
        tong_start_time = time.time()

    # Feed the new samples to the Tonguing Detector.
    # Beware we shouldn't send repeated samples, so we send the new_samples and not the entire block.
    tongued = self.tong.feed(new_samples)
    if DEBUG_PERF:
        self.tong_time = time.time() - tong_start_time

    if tongued:
        if self.current_ticks > 2:
            # We detected tonguing, so split the current note.
            # TODO: if 'previous_note' is considered noisy, account for it in the duration.
            # TODO: Consider whether we should increment the current tick partially (proportionally to the audible portion?).
            self.notes.append({"name": self.current_note,
                               "duration": np.log2(self.current_ticks),
                               "ticks": self.current_ticks,
                               "slur": "stop" if self.currently_slurring else False})
            self.currently_slurring = False
            if DEBUG_NOTE:
                print("%s\t %d\t %.3fs" % (self.current_note, self.current_ticks,
                                           self.current_ticks / self.blocks_per_sec))
        if DEBUG_TONG:
            print("TONG")
        if WRITE_OUT:
            self.out += "%d\t: TONG\n" % self.total_ticks
        self.current_ticks = 0

    # Add the new_samples at the beginning of the block, so they replace the oldest values.
    self.block[0:self.samples_per_read] = new_samples
    # Now roll the block back so that it is in chronological order.
    # Not strictly necessary as we're discarding phase, but it makes replacing old values easier and also ensures
    # usual windowing will smooth discontinuities at the borders.
    self.block = np.roll(self.block, -self.samples_per_read)

    # No need to proceed if we're to discard the pitch due to insufficient RMS power in the block.
    if not DEBUG_NOISE and rms < self.noise_threshold:
        # Only accumulate the log when it will actually be written out, like
        # every other write to self.out; otherwise it grows without bound.
        if WRITE_OUT:
            self.out += "%d\t: 'rms < self.noise_threshold'\n" % self.total_ticks
        return

    # We want pitch, so pass the block to the PDA
    if DEBUG_PERF:
        hps_start_time = time.time()
    perceived_f = pda.hps.hps(self.block, fs=self.rate, harmonics=3, precision=2)
    if DEBUG_PERF:
        self.hps_time = time.time() - hps_start_time

    # and tune the pitch down to a known note.
    tuned_f = mh.find_nearest_value(mt.notes, perceived_f)
    note = mt.note_name[tuned_f]

    # TODO: rough error percentage estimate
    error = perceived_f - tuned_f
    percentage = np.sign(error) * 2 * error/(tuned_f*(1 + mt.semitone) if error > 0 else tuned_f*(1 - mt.semitone))

    if note == self.current_note:
        if self.current_note == self.previous_note:
            # We're receiving a new sample of the current note.
            self.current_ticks += 1
        else:
            # Our 'previous_note' measurement was probably noisy.
            # Pretend it was a measurement of 'current_note', and account ticks for both.
            self.current_ticks += 2
    elif note == self.previous_note:
        # Keep in mind that all notes are 'tentative' until their tick count is > n, so:
        # - C5 C5 C5 D5 D5 means we successfully identified a C5 and the beginning of a D5, assuming n is 1.
        if self.current_ticks > 2:
            # The note changed without tonguing, so it is part of a slur.
            self.notes.append({"name": self.current_note,
                               "duration": np.log2(self.current_ticks),
                               "ticks": self.current_ticks,
                               "slur": "continue" if self.currently_slurring else "start"})
            self.currently_slurring = True
            if DEBUG_NOTE:
                print("%s\t %d\t %.3fs" % (self.current_note, self.current_ticks,
                                           self.current_ticks / self.blocks_per_sec))
        # Two identical detections in a row: the new note starts with 2 ticks.
        self.current_note = note
        self.current_ticks = 2
    elif self.previous_note != self.current_note:
        # Experimentally, it's pretty rare to have 2 noisy detections in a row, so if we find 2 different measurements
        # we can assume the old note has ended.
        # - C5 C5 C5 D5 E5 means we identified a C5 end, but we don't know the next note yet.
        if self.current_ticks > 2:
            self.notes.append({"name": self.current_note,
                               "duration": np.log2(self.current_ticks),
                               "ticks": self.current_ticks,
                               "slur": "continue" if self.currently_slurring else False})
            if DEBUG_NOTE:
                print("%s\t %d\t %.3fs" % (self.current_note, self.current_ticks,
                                           self.current_ticks / self.blocks_per_sec))
        # We currently have no idea of the note being played, so assign an error string to it.
        # When we have k identical detections in a row (with k defined in the elifs above) we will successfully
        # assign the current note.
        self.current_note = "NOISE_ERR"
        self.current_ticks = 0

    self.previous_note = note

    if DEBUG_TICK:
        print("%s\t (%.3f)\t@ %.2f" % (note, percentage, rms))
    if WRITE_OUT:
        self.out += "%d\t: %s\t (%.3f)\t@ %.2f\r\n" % (self.total_ticks, note, percentage, rms)