def extract_cfp(filename, down_fs=44100, **kwargs):
    """CFP feature extraction function.

    Given the audio path, returns the CFP feature. Will automatically process
    the feature in parallel to accelerate the computation.

    Parameters
    ----------
    filename: Path
        Path to the audio.
    hop: float
        Hop size in seconds, with regard to the sampling rate.
    win_size: int
        Window size.
    fr: float
        Frequency resolution.
    fc: float
        Lowest start frequency.
    tc: float
        Inverse number of the highest frequency bound.
    g: list[float]
        Power factor of the output STFT results.
    bin_per_octave: int
        Number of bins in each octave.
    down_fs: int
        Resample to this sampling rate if the loaded audio has a different value.
    max_sample: int
        Maximum number of frames to be processed for each computation.
        Adjust to a smaller number if your RAM is not enough.

    Returns
    -------
    Z
        Multiplication of spectrum and cepstrum.
    tfrL0
        Spectrum of the audio.
    tfrLF
        Generalized Cepstrum of Spectrum (GCoS).
    tfrLQ
        Cepstrum of the audio.
    cen_freq
        Central frequencies of each feature bin.

    References
    ----------
    The CFP approach was first proposed in [1]_

    .. [1] L. Su and Y. Yang, "Combining Spectral and Temporal Representations for
       Multipitch Estimation of Polyphonic Music," in IEEE/ACM Transactions on Audio,
       Speech, and Language Processing, 2015.
    """
    logger.debug("Loading audio: %s", filename)
    x, fs = load_audio(filename, sampling_rate=down_fs)
    return _extract_cfp(x, fs, down_fs=fs, **kwargs)
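# A minimal usage sketch for ``extract_cfp``. The audio path reuses the test
# resource bundled with this repository, and the unpacking order below simply
# follows the "Returns" section of the docstring (an assumption about what
# ``_extract_cfp`` actually returns).
def _demo_extract_cfp():
    Z, tfrL0, tfrLF, tfrLQ, cen_freq = extract_cfp("./tests/resource/sample.wav", down_fs=44100)
    # Z is the element-wise product of spectrum and cepstrum; the remaining
    # arrays are the intermediate representations described above.
    logger.info("CFP shape: %s, %d center frequencies", Z.shape, len(cen_freq))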
def extract_cqt(audio_path, sampling_rate=44100, lowest_note=16, note_num=120, a_hop=256, pad_sec=1):
    """Compute the constant-Q spectrogram of the audio, then normalize and log-scale it.

    Parameters
    ----------
    audio_path: Path
        Path to the input audio.
    sampling_rate: int
        Sampling rate the audio data is sampled at, should be ``DOWN_SAMPLE_TO_SAPMLING_RATE``.
    lowest_note: int
        Lowest MIDI note number.
    note_num: int
        Number of total notes. The highest note number would thus be ``lowest_note`` + ``note_num``.
    a_hop: int
        Hop size for computing the CQT.
    pad_sec: float
        Length of padding, in seconds, added to the beginning and the end of the raw audio data.

    Returns
    -------
    midi_gram: np.ndarray
        Log-magnitude, L2-normalized constant-Q spectrogram of the input audio.
    """
    logger.debug("Loading audio: %s", audio_path)
    audio_data, _ = load_audio(audio_path, sampling_rate=sampling_rate)

    # Pad both ends of the audio with silence.
    zeros = np.zeros(int(pad_sec * sampling_rate))
    padded_audio = np.concatenate([zeros, audio_data, zeros])

    # Compute CQT of the padded audio data.
    logger.debug("Extracting CQT feature with librosa")
    audio_gram = librosa.cqt(
        padded_audio,
        sr=sampling_rate,
        hop_length=a_hop,
        fmin=librosa.midi_to_hz(lowest_note),
        n_bins=note_num
    )

    # L2-normalize and log-scale the magnitude.
    logger.debug("Post-processing CQT feature...")
    return post_process_cqt(audio_gram)
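# Usage sketch for ``extract_cqt``, reusing the bundled test resource. The
# frame-rate arithmetic below assumes the default hop of 256 samples at
# 44100 Hz and only illustrates how frame indices map back to time.
def _demo_extract_cqt():
    midi_gram = extract_cqt("./tests/resource/sample.wav", sampling_rate=44100, a_hop=256, pad_sec=1)
    # Each frame spans a_hop / sampling_rate seconds; remember to subtract the
    # pad_sec padding when mapping a frame index back to the original audio.
    frame_sec = 256 / 44100
    logger.info("CQT shape: %s (~%.1f frames per second)", midi_gram.shape, 1 / frame_sec)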
def extract_beat_with_madmom(audio_path, sampling_rate=44100):
    """Extract beat positions (in seconds) of the audio.

    Beats are extracted with a mixture of beat tracking techniques using madmom.

    Parameters
    ----------
    audio_path: Path
        Path to the target audio.
    sampling_rate: int
        Sampling rate to resample the audio to.

    Returns
    -------
    beat_arr: 1D numpy array
        Contains beat positions in seconds.
    audio_len_sec: float
        Total length of the audio in seconds.
    """
    logger.debug("Loading audio: %s", audio_path)
    audio_data, _ = load_audio(audio_path, sampling_rate=sampling_rate)
    logger.debug("Running beat tracking...")
    return MadmomBeatTracking().process(audio_data), len(audio_data) / sampling_rate
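# Usage sketch for ``extract_beat_with_madmom``: derive a rough global tempo
# from the median inter-beat interval. Assumes numpy is imported as ``np`` in
# this module; the audio path reuses the bundled test resource.
def _demo_beat_tracking():
    beat_arr, audio_len_sec = extract_beat_with_madmom("./tests/resource/sample.wav")
    if len(beat_arr) > 1:
        tempo_bpm = 60 / np.median(np.diff(beat_arr))
        logger.info("%d beats over %.1f seconds (~%.1f BPM)", len(beat_arr), audio_len_sec, tempo_bpm)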
def transcribe(self, input_audio, model_path=None, output="./"):
    """Transcribe vocal notes in the audio.

    This function transcribes onset, offset, and pitch of the vocal in the audio.
    This module is responsible for predicting the onset and offset time of each
    note, while pitches are estimated by the `vocal-contour` submodule.

    Parameters
    ----------
    input_audio: Path
        Path to the raw audio file (.wav).
    model_path: Path
        Path to the trained model or the supported transcription mode.
    output: Path (optional)
        Path for writing out the transcribed MIDI file. Defaults to the current path.

    Returns
    -------
    midi: pretty_midi.PrettyMIDI
        The transcribed vocal notes.

    Outputs
    -------
    This function will output three files as listed below:

    - <song>.mid: the MIDI file with complete transcription results in piano soundfont.
    - <song>_f0.csv: pitch contour information of the vocal.
    - <song>_trans.wav: the rendered pitch contour audio.

    See Also
    --------
    omnizart.cli.vocal.transcribe: CLI entry point of this function.
    omnizart.vocal_contour.transcribe: Pitch estimation function.
    """
    logger.info("Separating vocal track from the audio...")
    separator = Separator('spleeter:2stems')

    # Tricky way to avoid the annoying tensorflow graph being finalized issue.
    separator._params["stft_backend"] = "librosa"  # pylint: disable=protected-access
    wav, fs = load_audio(input_audio, mono=False)
    pred = separator.separate(wav)

    logger.info("Loading model...")
    model, model_settings = self._load_model(model_path)

    logger.info("Extracting feature...")
    wav = librosa.to_mono(pred["vocals"].squeeze().T)
    feature = _extract_vocal_cfp(
        wav,
        fs,
        down_fs=model_settings.feature.sampling_rate,
        hop=model_settings.feature.hop_size,
        fr=model_settings.feature.frequency_resolution,
        fc=model_settings.feature.frequency_center,
        tc=model_settings.feature.time_center,
        g=model_settings.feature.gamma,
        bin_per_octave=model_settings.feature.bins_per_octave)

    logger.info("Predicting...")
    pred = predict(feature, model)

    logger.info("Inferring notes...")
    interval = infer_interval(
        pred,
        ctx_len=model_settings.inference.context_length,
        threshold=model_settings.inference.threshold,
        min_dura=model_settings.inference.min_duration,
        t_unit=model_settings.feature.hop_size)

    logger.info("Extracting pitch contour")
    agg_f0 = vcapp.app.transcribe(
        input_audio, model_path=model_settings.inference.pitch_model, output=output)

    logger.info("Inferring MIDI...")
    midi = infer_midi(interval, agg_f0, t_unit=model_settings.feature.hop_size)

    self._output_midi(output=output, input_audio=input_audio, midi=midi)
    logger.info("Transcription finished")
    return midi
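# Usage sketch for the ``transcribe`` method above. It takes an already
# constructed application instance so it stays independent of how this module
# names or instantiates that class; the audio path and output directory are
# placeholders.
def _demo_transcribe(vocal_app_instance):
    midi = vocal_app_instance.transcribe("path/to/song.wav", output="./out")
    # Besides the returned pretty_midi.PrettyMIDI object, the call also writes
    # <song>.mid, <song>_f0.csv, and <song>_trans.wav into the output directory.
    midi.write("./out/song_from_api.mid")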
def test_load_audio():
    audio = "./tests/resource/sample.wav"
    data, fs = io.load_audio(audio, sampling_rate=44100, mono=False)

    assert fs == 44100
    assert data.shape == (2065124, 2)
def extract_vocal_cfp(filename, down_fs=16000, **kwargs):
    """Specialized CFP feature extraction for the vocal submodule."""
    logger.debug("Loading audio: %s", filename)
    x, fs = load_audio(filename, sampling_rate=down_fs)
    logger.debug("Extracting vocal feature")
    return _extract_vocal_cfp(x, fs, **kwargs)
def transcribe(self, input_audio, model_path=None, output="./"):
    """Transcribe vocal notes in the audio.

    This function transcribes onset, offset, and pitch of the vocal in the audio.
    This module is responsible for predicting the onset and offset time of each
    note, while pitches are estimated by the `vocal-contour` submodule.

    Parameters
    ----------
    input_audio: Path
        Path to the raw audio file (.wav).
    model_path: Path
        Path to the trained model or the supported transcription mode.
    output: Path (optional)
        Path for writing out the transcribed MIDI file. Defaults to the current path.

    Returns
    -------
    midi: pretty_midi.PrettyMIDI
        The transcribed vocal notes.

    Outputs
    -------
    This function will output three files as listed below:

    - <song>.mid: the MIDI file with complete transcription results in piano soundfont.
    - <song>_f0.csv: pitch contour information of the vocal.
    - <song>_trans.wav: the rendered pitch contour audio.

    See Also
    --------
    omnizart.cli.vocal.transcribe: CLI entry point of this function.
    omnizart.vocal_contour.transcribe: Pitch estimation function.
    """
    logger.info("Separating vocal track from the audio...")
    command = ["spleeter", "separate", input_audio, "-o", "./"]
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    _, error = process.communicate()
    if process.returncode != 0:
        raise SpleeterError(error.decode("utf-8"))

    # Resolve the path of the separated output files.
    folder_path = jpath("./", get_filename(input_audio))
    vocal_wav_path = jpath(folder_path, "vocals.wav")
    wav, fs = load_audio(vocal_wav_path)

    # Clean up the separated output files.
    shutil.rmtree(folder_path)

    logger.info("Loading model...")
    model, model_settings = self._load_model(model_path)

    logger.info("Extracting feature...")
    feature = _extract_vocal_cfp(
        wav,
        fs,
        down_fs=model_settings.feature.sampling_rate,
        hop=model_settings.feature.hop_size,
        fr=model_settings.feature.frequency_resolution,
        fc=model_settings.feature.frequency_center,
        tc=model_settings.feature.time_center,
        g=model_settings.feature.gamma,
        bin_per_octave=model_settings.feature.bins_per_octave
    )

    logger.info("Predicting...")
    pred = predict(feature, model)

    logger.info("Inferring notes...")
    interval = infer_interval(
        pred,
        ctx_len=model_settings.inference.context_length,
        threshold=model_settings.inference.threshold,
        min_dura=model_settings.inference.min_duration,
        t_unit=model_settings.feature.hop_size
    )

    logger.info("Extracting pitch contour")
    agg_f0 = vcapp.app.transcribe(
        input_audio, model_path=model_settings.inference.pitch_model, output=output)

    logger.info("Inferring MIDI...")
    midi = infer_midi(interval, agg_f0, t_unit=model_settings.feature.hop_size)

    self._output_midi(output=output, input_audio=input_audio, midi=midi)
    logger.info("Transcription finished")
    return midi
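# The variant above shells out to the ``spleeter`` command-line tool instead of
# calling its Python API. A small defensive sketch (using only the stdlib
# ``shutil.which`` and the ``SpleeterError`` already referenced above) to fail
# early with a clear message when the executable is not installed:
def _check_spleeter_available():
    if shutil.which("spleeter") is None:
        raise SpleeterError("The 'spleeter' executable is not on PATH; "
                            "install it, e.g. with 'pip install spleeter'.")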