def __init__(self, fs, hopsize_t):
        import numpy as np
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor
        # EPSILON (e.g. np.spacing(1), as in madmom) is assumed to be defined
        # at module level.

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        frames = FramedSignalProcessor(frame_size=2048,
                                       hop_size=int(fs * hopsize_t))
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                            num_bands=80,
                                            fmin=27.5,
                                            fmax=16000,
                                            norm_filters=True,
                                            unique_filters=False)
        spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

        single = SequentialProcessor([frames, stft, filt, spec])

        pre_processor = SequentialProcessor([sig, single])

        super(MadmomMelbankProcessor, self).__init__([pre_processor])
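
Because the chain is wrapped in a SequentialProcessor, an instance should be callable directly on an audio file. A minimal usage sketch (the file name, sample rate and hop are assumptions):

    proc = MadmomMelbankProcessor(fs=44100, hopsize_t=0.01)
    mel_bands = proc('audio.wav')  # (num_frames, 80) log-Mel features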
Example 2
def CreateProcesser(fps=100):
    import numpy as np
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor,
                                          SpectrogramDifferenceProcessor)
    from madmom.processors import SequentialProcessor, ParallelProcessor

    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    # process the multi-resolution spec & diff in parallel
    multi = ParallelProcessor([])
    frame_sizes = [1024, 2048, 4096]
    band_depths = [3, 6, 12]
    for frame_size, num_bands in zip(frame_sizes, band_depths):
        frames = FramedSignalProcessor(frame_size=frame_size, fps=fps)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=num_bands,
                                            fmin=30,
                                            fmax=17000,
                                            norm_filters=True)
        spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=np.hstack)
        # process each frame size with spec and diff sequentially
        multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))

    # stack the features and process everything sequentially
    pre_processor = SequentialProcessor((sig, multi, np.hstack))
    return pre_processor
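
The returned processor is itself callable; a minimal usage sketch (the file name is an assumption):

    pre_processor = CreateProcesser(fps=100)
    features = pre_processor('audio.wav')  # stacked log specs and diffs per frame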
Example 3
    def __init__(self,
                 spectrogram_path=None,
                 version=1,
                 test=False,
                 dump=False,
                 preprocessing=True,
                 sample_rate=32000,
                 silence_threshold=40):
        if version not in (1, 2):
            raise ValueError("version must be 1 or 2")
        self.version = version
        self.spectrogram_path = spectrogram_path
        self.sample_rate = sample_rate
        self.preprocessing = preprocessing
        self.test = test
        self.dump = dump
        self.silence_threshold = silence_threshold

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=self.sample_rate,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024,
                                          hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline)
Example 4
    def __init__(self, **kwargs):
        import numpy as np
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor,
                                              SpectrogramDifferenceProcessor)
        from madmom.processors import SequentialProcessor, ParallelProcessor

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=44100)
        # process the multi-resolution spec & diff in parallel
        multi = ParallelProcessor([])
        for frame_size in [4096]:
            frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
            stft = ShortTimeFourierTransformProcessor(
                window=np.hamming(frame_size))  # caching FFT window
            filt = FilteredSpectrogramProcessor(num_bands=12,
                                                fmin=30,
                                                fmax=16000,
                                                norm_filters=True)
            spec = LogarithmicSpectrogramProcessor(mul=5, add=1)
            #diff = SpectrogramDifferenceProcessor(diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
            # process each frame size with spec and diff sequentially
            multi.append(SequentialProcessor((frames, stft, filt, spec)))
            #multi.append(SequentialProcessor((frames, stft, filt)))

        # stack the features and process everything sequentially
        pre_processor = SequentialProcessor((sig, multi, np.hstack))
        super(PianoNoteProcessor, self).__init__(pre_processor)
Example 5
    def __init__(self, fs, hopsize_t):
        import numpy as np
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor
        # EPSILON (e.g. np.spacing(1), as in madmom) is assumed to be defined
        # at module level.
        # from madmom.features.onsets import _cnn_onset_processor_pad

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        # process the multi-resolution spec in parallel
        frames = FramedSignalProcessor(frame_size=2048,
                                       hop_size=int(fs * hopsize_t))
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                            num_bands=80,
                                            fmin=27.5,
                                            fmax=16000,
                                            norm_filters=True,
                                            unique_filters=False)
        spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

        # process each frame size with spec and diff sequentially
        single = SequentialProcessor([frames, stft, filt, spec])

        # pre-processes everything sequentially
        pre_processor = SequentialProcessor([sig, single])

        # instantiate a SequentialProcessor
        super(MadmomMelbankProcessor, self).__init__([pre_processor])
Example 6
def create_feature_extraction_pipeline(sr=44100,
                                       frame_sizes=(1024, 2048, 4096),
                                       fps_hz=100.):
    audio_loading = Pipeline([
        ("load_audio", FeatureExtractor(librosa.load, sr=sr, mono=True)),
        ("normalize", FeatureExtractor(librosa.util.normalize, norm=np.inf))
    ])

    sig = SignalProcessor(num_channels=1, sample_rate=sr)
    multi = ParallelProcessor([])
    for frame_size in frame_sizes:
        frames = FramedSignalProcessor(frame_size=frame_size, fps=fps_hz)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                            num_bands=12,
                                            fmin=30,
                                            fmax=17000,
                                            norm_filters=True,
                                            unique_filters=True)
        spec = LogarithmicSpectrogramProcessor(log=np.log10, mul=5, add=1)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=np.hstack)
        # process each frame size with spec and diff sequentially
        multi.append(SequentialProcessor([frames, stft, filt, spec, diff]))
    feature_extractor = FeatureExtractor(
        SequentialProcessor([sig, multi, np.hstack]))

    feature_extraction_pipeline = Pipeline([("audio_loading", audio_loading),
                                            ("feature_extractor",
                                             feature_extractor)])
    return feature_extraction_pipeline
Example 7
    def spec_from_midi(midi_file):

        sig_proc = SignalProcessor(num_channels=1, sample_rate=spec_params["sample_rate"])
        fsig_proc = FramedSignalProcessor(frame_size=spec_params["frame_size"], fps=spec_params["fps"])
        spec_proc = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank, num_bands=12, fmin=60, fmax=6000,
                                                 norm_filters=True, unique_filters=False)
        log_proc = LogarithmicSpectrogramProcessor()
        processor = SequentialProcessor([sig_proc, fsig_proc, spec_proc, log_proc])

        # print(midi_file)
        if not os.path.isfile(midi_file.replace('.mid', '.wav')):
            # render audio file from midi
            render_audio(midi_file, sound_font=SOUND_FONT_PATH)

        # compute spectrogram
        audio_path = midi_file.replace('.mid', '.wav')

        # if the spectrogram doesn't exist it will be computed and stored
        if not os.path.isfile(midi_file.replace('.mid', '.spec.npy')):
            spec = processor.process(audio_path).T
            np.save(midi_file.replace('.mid', '.spec'), spec)
        else:
            spec = np.load(midi_file.replace('.mid', '.spec.npy'))

        return spec
Example 8
    def __init__(self, **kwargs):
        # pylint: disable=unused-argument
        from ..audio.signal import SignalProcessor, FramedSignalProcessor
        from ..audio.stft import ShortTimeFourierTransformProcessor
        from ..audio.spectrogram import (FilteredSpectrogramProcessor,
                                         LogarithmicSpectrogramProcessor,
                                         SpectrogramDifferenceProcessor)
        from ..models import NOTES_BRNN
        from ..ml.nn import NeuralNetwork
        from ..processors import SequentialProcessor, ParallelProcessor

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=44100)
        # process the multi-resolution spec & diff in parallel
        multi = ParallelProcessor([])
        for frame_size in [1024, 2048, 4096]:
            frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
            stft = ShortTimeFourierTransformProcessor()  # caching FFT window
            filt = FilteredSpectrogramProcessor(num_bands=12,
                                                fmin=30,
                                                fmax=17000,
                                                norm_filters=True)
            spec = LogarithmicSpectrogramProcessor(mul=5, add=1)
            diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                                  positive_diffs=True,
                                                  stack_diffs=np.hstack)
            # process each frame size with spec and diff sequentially
            multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
        # stack the features and process everything sequentially
        pre_processor = SequentialProcessor((sig, multi, np.hstack))

        # process the pre-processed signal with a NN
        nn = NeuralNetwork.load(NOTES_BRNN[0])

        # instantiate a SequentialProcessor
        super(RNNPianoNoteProcessor, self).__init__((pre_processor, nn))
Example 9
    def __init__(self, **kwargs):
        # pylint: disable=unused-argument
        from ..audio.signal import SignalProcessor, FramedSignalProcessor
        from ..audio.filters import MelFilterbank
        from ..audio.spectrogram import (FilteredSpectrogramProcessor,
                                         LogarithmicSpectrogramProcessor)
        from ..models import ONSETS_CNN
        from ..ml.nn import NeuralNetwork
        from ..processors import SequentialProcessor, ParallelProcessor

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=44100)
        # process the multi-resolution spec in parallel
        multi = ParallelProcessor([])
        for frame_size in [2048, 1024, 4096]:
            frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
            filt = FilteredSpectrogramProcessor(
                filterbank=MelFilterbank, num_bands=80, fmin=27.5, fmax=16000,
                norm_filters=True, unique_filters=False)
            spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)
            # process each frame size with spec and diff sequentially
            multi.append(SequentialProcessor((frames, filt, spec)))
        # stack the features (in depth) and pad at beginning and end
        stack = np.dstack
        pad = _cnn_onset_processor_pad
        # pre-processes everything sequentially
        pre_processor = SequentialProcessor((sig, multi, stack, pad))

        # process the pre-processed signal with a NN ensemble
        nn = NeuralNetwork.load(ONSETS_CNN[0])

        # instantiate a SequentialProcessor
        super(CNNOnsetProcessor, self).__init__((pre_processor, nn))
Example 10
    def __init__(self, fs, hopsize_t):
        import numpy as np
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor, ParallelProcessor
        # EPSILON (e.g. np.spacing(1), as in madmom) is assumed to be defined
        # at module level.
        # from madmom.features.onsets import _cnn_onset_processor_pad

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        # process the multi-resolution spec in parallel
        multi = ParallelProcessor([])
        for frame_size in [2048, 1024, 4096]:
            frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
            stft = ShortTimeFourierTransformProcessor()  # caching FFT window
            filt = FilteredSpectrogramProcessor(
                filterbank=MelFilterbank, num_bands=80, fmin=27.5, fmax=16000,
                norm_filters=True, unique_filters=False)
            spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)
            # process each frame size with spec and diff sequentially
            multi.append(SequentialProcessor([frames, stft, filt, spec]))
        # stack the features (in depth) and pad at beginning and end
        stack = np.dstack
        # pad = _cnn_onset_processor_pad
        # pre-processes everything sequentially
        pre_processor = SequentialProcessor([sig, multi, stack])
        # instantiate a SequentialProcessor
        super(MadmomMelbank3ChannelsProcessor, self).__init__([pre_processor])
Example 11
def extract(yt_id):
    from madmom.features.beats import (RNNBeatProcessor,
                                       DBNBeatTrackingProcessor)
    from madmom.features.chords import (CNNChordFeatureProcessor,
                                        CRFChordRecognitionProcessor)
    from madmom.processors import SequentialProcessor, ParallelProcessor

    beats = SequentialProcessor(
        [RNNBeatProcessor(),
         DBNBeatTrackingProcessor(fps=100)])
    chordrec = SequentialProcessor(
        [CNNChordFeatureProcessor(),
         CRFChordRecognitionProcessor()])
    processMulti = ParallelProcessor([])
    processMulti.append(beats)
    processMulti.append(chordrec)
    # printTime and arrange are project-specific helpers defined elsewhere
    beatSync = SequentialProcessor(
        [printTime, processMulti, printTime, arrange, printTime])
    return beatSync('tmp/' + yt_id + '.wav')
Example 12
    def __init__(self, online=False, **kwargs):
        # pylint: disable=unused-argument
        from ..audio.signal import SignalProcessor, FramedSignalProcessor
        from ..audio.stft import ShortTimeFourierTransformProcessor
        from ..audio.spectrogram import (FilteredSpectrogramProcessor,
                                         LogarithmicSpectrogramProcessor,
                                         SpectrogramDifferenceProcessor)
        from ..models import ONSETS_RNN, ONSETS_BRNN
        from ..ml.nn import NeuralNetworkEnsemble
        from ..processors import SequentialProcessor, ParallelProcessor

        # choose the appropriate models and set frame sizes accordingly
        if online:
            origin = 'online'
            nn_files = ONSETS_RNN
            frame_sizes = [512, 1024, 2048]
        else:
            origin = 'offline'
            nn_files = ONSETS_BRNN
            frame_sizes = [1024, 2048, 4096]

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=44100)
        # process the multi-resolution spec & diff in parallel
        multi = ParallelProcessor([])
        for frame_size in frame_sizes:
            frames = FramedSignalProcessor(frame_size=frame_size,
                                           fps=100,
                                           origin=origin)
            stft = ShortTimeFourierTransformProcessor()  # caching FFT window
            filt = FilteredSpectrogramProcessor(num_bands=6,
                                                fmin=30,
                                                fmax=17000,
                                                norm_filters=True)
            spec = LogarithmicSpectrogramProcessor(mul=5, add=1)
            diff = SpectrogramDifferenceProcessor(diff_ratio=0.25,
                                                  positive_diffs=True,
                                                  stack_diffs=np.hstack)
            # process each frame size with spec and diff sequentially
            multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))
        # stack the features and process everything sequentially
        pre_processor = SequentialProcessor((sig, multi, np.hstack))

        # process the pre-processed signal with a NN ensemble
        nn = NeuralNetworkEnsemble.load(nn_files, **kwargs)

        # instantiate a SequentialProcessor
        super(RNNOnsetProcessor, self).__init__((pre_processor, nn))
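
Like the other madmom processors, an instance can be applied directly to an audio file; the result is a frame-wise onset activation function (one value per frame at fps=100). A short sketch (the file name is an assumption):

    act = RNNOnsetProcessor()('audio.wav')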
Example 13
def _make_preprocessor(settings, pad):
    from madmom.audio.spectrogram import (
        LogarithmicFilteredSpectrogramProcessor,
        SpectrogramDifferenceProcessor)
    from madmom.audio.filters import LogarithmicFilterbank
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.processors import SequentialProcessor

    sig = SignalProcessor(num_channels=1, sample_rate=settings['sample_rate'])
    frames = FramedSignalProcessor(frame_size=settings['frame_size'],
                                   fps=settings['fps'])
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    spec = LogarithmicFilteredSpectrogramProcessor(
        num_channels=1,
        sample_rate=settings['sample_rate'],
        filterbank=LogarithmicFilterbank,
        frame_size=settings['frame_size'],
        fps=settings['fps'],
        num_bands=settings['num_bands'],
        fmin=settings['fmin'],
        fmax=settings['fmax'],
        norm_filters=settings['norm_filters'])
    if settings['diff']:
        if settings.get('pad'):
            stack = _crnn_drum_processor_stack
        else:
            stack = np.hstack
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=stack)
        # process input data
        if pad > 0:
            pre_processor = SequentialProcessor(
                (sig, frames, stft, spec, diff, PadProcessor(pad)))
        else:
            pre_processor = SequentialProcessor(
                (sig, frames, stft, spec, diff))

    else:
        if pad > 0:
            pre_processor = SequentialProcessor(
                (sig, frames, stft, spec, PadProcessor(pad)))
        else:
            pre_processor = SequentialProcessor((sig, frames, stft, spec))

    return pre_processor
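
A hypothetical settings dict for the helper above; the keys mirror those read in the function body, and the values are purely illustrative:

    settings = {'sample_rate': 44100, 'frame_size': 2048, 'fps': 100,
                'num_bands': 12, 'fmin': 30, 'fmax': 15000,
                'norm_filters': True, 'diff': True}
    pre_processor = _make_preprocessor(settings, pad=0)
    features = pre_processor('audio.wav')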
Example 14
    def __init__(self, hparams, dataset: FreeSoundAudioDataset):
        super(MadmomFeatureIteratorV2, self).__init__(hparams, dataset)

        if not isinstance(dataset, FreeSoundAudioDataset):
            raise TypeError("dataset should be a FreeSoundAudioDataset")

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=32000,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024,
                                          hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline2)
Example 15
def get_chords_processor():
    from datetime import datetime
    print('START CHORDS PROCESSOR >> ', str(datetime.now()))
    from madmom.features.chords import CNNChordFeatureProcessor, CRFChordRecognitionProcessor
    from madmom.processors import SequentialProcessor
    print('CHORDS PROCESSOR       >> ', str(datetime.now()))
    return SequentialProcessor(
        [CNNChordFeatureProcessor(),
         CRFChordRecognitionProcessor()])
Example 16
def get_beat_processor():
    from datetime import datetime
    print('START BEAT PROCESSOR   >> ', str(datetime.now()))
    from madmom.features.beats import RNNBeatProcessor, DBNBeatTrackingProcessor
    from madmom.processors import SequentialProcessor
    print('BEAT PROCESSOR         >> ', str(datetime.now()))
    return SequentialProcessor(
        [RNNBeatProcessor(),
         DBNBeatTrackingProcessor(fps=100)])
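
Both factories return callable processors; a hedged sketch of applying them (the file name is an assumption):

    chords = get_chords_processor()('audio.wav')  # (start, end, chord_label) segments
    beats = get_beat_processor()('audio.wav')     # beat times in seconds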
Example 17
class MadmomFeatureIteratorV2(FreeSoundDataIteratorBase):
    """
    Custom feature extraction using the madmom library pipeline.
    Reference: https://github.com/CPJKU/dcase_task2/blob/master/dcase_task2/prepare_spectrograms.py
    """
    def __init__(self, hparams, dataset: FreeSoundAudioDataset):
        super(MadmomFeatureIteratorV2, self).__init__(hparams, dataset)

        if not isinstance(dataset, FreeSoundAudioDataset):
            raise TypeError("dataset should be a FreeSoundAudioDataset")

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=32000,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024,
                                          hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline2)

    @overrides
    def _user_map_func(self, file_path, label):
        """
        Function that maps an audio file to features and its label to a one-hot vector
        :param file_path:
        :param label:
        :return:
        """
        data = self.processor_version2.process(file_path)

        label = self._dataset.get_one_hot_encoded(label)
        return data, label

    @overrides
    def _user_resize_func(self, data, label):
        """
        Function that sets up the tensor shapes after the ``tf.py_func`` call executes
        :param data:
        :param label:
        :return:
        """
        data = tf.reshape(data, shape=[128, 33])
        label = tf.reshape(label, shape=[42])
        return data, label
Example 18
def spectrogram_processor(spec_params):
    """Helper function for our spectrogram extraction."""
    sig_proc = SignalProcessor(num_channels=1,
                               sample_rate=spec_params['sample_rate'])
    fsig_proc = FramedSignalProcessor(frame_size=spec_params['frame_size'],
                                      fps=spec_params['fps'])

    spec_proc = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                             num_bands=12,
                                             fmin=60,
                                             fmax=6000,
                                             norm_filters=True,
                                             unique_filters=False)
    log_proc = LogarithmicSpectrogramProcessor()

    processor = SequentialProcessor([sig_proc, fsig_proc, spec_proc, log_proc])

    return processor
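
A hypothetical spec_params dict matching the keys the helper reads (the values are illustrative only):

    spec_params = {'sample_rate': 22050, 'frame_size': 2048, 'fps': 20}
    processor = spectrogram_processor(spec_params)
    spec = processor('audio.wav')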
Example 19
def build_cnn(madmom_processor_filename):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    from madmom.processors import SequentialProcessor
    from madmom.ml.nn import NeuralNetworkEnsemble
    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    frames = FramedSignalProcessor(frame_size=4096, hop_size=441 * 2)
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    filt = FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000)

    # this is the money param! it was not whitelisted in 'canonicalize_audio_options'!
    spec = LogarithmicSpectrogramProcessor(add=1)
    # pre-processes everything sequentially
    pre_processor = SequentialProcessor([
        sig, frames, stft, filt, spec, _cnn_pad
    ])
    # process the pre-processed signal with a NN
    nn = NeuralNetworkEnsemble.load([madmom_processor_filename])
    return SequentialProcessor([pre_processor, nn])
Example 20
    def __init__(self, sr=44100, **kwargs):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.ml.nn import NeuralNetworkEnsemble
        sr_ratio = 44100 / sr
        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=sr)
        frames = FramedSignalProcessor(frame_size=4096 // sr_ratio,
                                       fps=50 // sr_ratio)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000)
        spec = LogarithmicSpectrogramProcessor(add=1)
        # pre-processes everything sequentially
        pre_processor = SequentialProcessor(
            (sig, frames, stft, filt, spec, _cnn_pad))
        # process the pre-processed signal with a NN
        nn = NeuralNetworkEnsemble.load(VIENNA_MODEL_PATH)
        # instantiate a SequentialProcessor
        super().__init__((pre_processor, nn))

        self.adsr = ADSRMaestro()
Example 21
class DcasePredictorProvider(PredictorContract):
    """
    Implementation of a PredictorContract. This class
    takes spectrograms as input and runs a convolutional
    neural network on them to produce class probabilities.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with stft based on framed signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        creates pipeline of elements of type madmom.Processor
    classes : list of str
        class list
    device : str
        indicates the processor to be used for neural network prediction
    prediction_model : baseline_net.Net
        holds a reference to the CNN architecture
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    slidingWindowThread:
        reference pointing to the sliding window thread
    predictionThread:
        reference pointing to the prediction thread

    Methods
    -------
    start()
       starts all necessary sub tasks of this predictor.
    stop()
       stops all necessary sub tasks of this predictor.
    computeSpectrogram()
       compute a spectrogram based on the most current audio chunk.
    predict()
       CNN prediction based on current spectrogram input.
    """
    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor([sig_proc, fsig_proc, spec_proc, filt_proc])

    classes = ["Acoustic_guitar", "Applause", "Bark", "Bass_drum", "Burping_or_eructation", "Bus", "Cello", "Chime",
               "Clarinet", "Computer_keyboard", "Cough", "Cowbell", "Double_bass", "Drawer_open_or_close",
               "Electric_piano",
               "Fart", "Finger_snapping", "Fireworks", "Flute", "Glockenspiel", "Gong", "Gunshot_or_gunfire",
               "Harmonica",
               "Hi-hat", "Keys_jangling", "Knock", "Laughter", "Meow", "Microwave_oven", "Oboe", "Saxophone",
               "Scissors",
               "Shatter", "Snare_drum", "Squeak", "Tambourine", "Tearing", "Telephone", "Trumpet",
               "Violin_or_fiddle",
               "Writing"]

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def __init__(self, condition):
        """
        Attributes
        ----------
        prediction_model : baseline_net.Net
           holds a reference to the CNN architecture
        sliding_window : 2d numpy array
           cache for previously calculated spectrograms
        lastProceededGroundTruth : int
           variable to keep track of the last processed audio chunk
        """
        # load model with its tuned weight parameters
        self.prediction_model = Net()
        self.prediction_model.load_state_dict(
            torch.load(os.path.join(PROJECT_ROOT,
                                    'server/consumer/predictors/dcase_predictor_provider/baseline_net.pt'),
                       map_location=lambda storage, location: storage))
        self.prediction_model.to(self.device)
        self.prediction_model.eval()

        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Start all sub tasks necessary for continuous prediction.
        """
        self.slidingWindowThread = SlidingWindowThread(self)
        self.predictionThread = PredictionThread(self)
        self.slidingWindowThread.start()
        self.predictionThread.start()

    def stop(self):
        """Stops all sub tasks
        """
        self.slidingWindowThread.join()
        self.predictionThread.join()

    def computeSpectrogram(self):
        """This methods first access the global time variable ``tGroundTruth``
        and reads audio chunk the time variable points to. Afterwards, the defined
        madmom pipeline is processed to get the spectrogram representation of the
        single chunk. Finally, the sliding window is updated with the new audio chunk.
        """

        t = self.manager.tGroundTruth
        # if thread faster than producer, do not consume same chunk multiple times
        if t != self.lastProceededGroundTruth:
            frame = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]   # modulo avoids index under/overflow
            frame = np.frombuffer(frame, np.int16)  # np.fromstring is deprecated
            spectrogram = self.processorPipeline.process(frame)

            frame = spectrogram[0]
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)

            # update sliding window
            self.sliding_window[:, 0:-1] = self.sliding_window[:, 1:]
            self.sliding_window[:, -1] = frame

            self.lastProceededGroundTruth = t

    def predict(self):
        """ This method executes the actual prediction task based on the
        currently available slinding window. The sliding window is sent
        into the CNN model and the correpsonding softmax output for the
        respecive classes are returned

        Returns
        -------
        probs : array of list objects
            an array of number of classes entries where each entry consists of
            the class name, its predicted probability and a position index.
            Example:
            ``[["class1", 0.0006955251446925104, 0], ["class2", 0.0032770668622106314, 1], ...]``
        """

        model_input = self.sliding_window[np.newaxis, np.newaxis]  # avoid shadowing the builtin 'input'
        cuda_torch_input = torch.from_numpy(model_input).to(self.device)
        model_output = self.prediction_model(cuda_torch_input)  # prediction by model
        softmax = nn.Softmax(dim=1)
        softmax_output = softmax(model_output)
        predicts = softmax_output.cpu().detach().numpy().flatten()
        probs = [[elem, predicts[index].item(), index] for index, elem in enumerate(self.classes)]
        return probs
Example 22
            # keep spectrogram
            spectrograms.append(np.asarray(spectrogram))

        spectrograms = np.asarray(spectrograms)

        return spectrograms

processor_version1 = LibrosaProcessor()

sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
spec_proc = SpectrogramProcessor(frame_size=1024)
filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
processor_version2 = SequentialProcessor(processor_pipeline2)


if __name__ == "__main__":
    """ main """

    # add argument parser
    parser = argparse.ArgumentParser(description='Pre-compute spectrograms for training and testing.')
    parser.add_argument('--audio_path', help='path to audio files.')
    parser.add_argument('--spec_path', help='path where to store spectrograms.')
    parser.add_argument('--show', help='show spectrogram plots.', type=int, default=None)
    parser.add_argument('--dump', help='dump spectrograms.', action='store_true')
    parser.add_argument('--spec_version', help='spectrogram version to compute (1 or 2).', type=int, default=1)
    parser.add_argument('--no_preprocessing', help='compute spectrogram for original audios.', action='store_true')
    args = parser.parse_args()
Example 23
class MadmomSpectrogramProvider(VisualisationContract):
    """
    Implementation of a VisualisationContract. This class
    computes new spectrograms based on the most current
    audio chunk, which is indicated via ``tGroundTruth``.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with stft based on framed signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        creates pipeline of elements of type madmom.Processor
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    visThread:
        reference pointing to the sliding window thread

    Methods
    -------
    start()
       starts all necessary sub tasks of this visualizer.
    stop()
       stops all necessary sub tasks of this visualizer.
    computeSpectrogram()
       compute a spectrogram based on the most current audio chunk.
    """

    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor([sig_proc, fsig_proc, spec_proc, filt_proc])

    def __init__(self, condition):
        """
        Attributes
        ----------
        sliding_window : 2d numpy array
           cache for previously calculated spectrograms
        lastProceededGroundTruth : int
           variable to keep track of the last processed audio chunk
        """

        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Start all sub tasks necessary for continuous spectrograms.
        """
        self.visThread = VisualisationThread(self)
        self.visThread.start()

    def stop(self):
        """Stops all sub tasks
        """
        self.visThread.join()

    def computeSpectrogram(self):
        """This methods first access the global time variable ``tGroundTruth``
        and reads audio chunk the time variable points to. Afterwards, the defined
        madmom pipeline is processed to get the spectrogram representation of the
        single chunk. Finally, the sliding window is updated with the new audio chunk
        and a copy of the sliding window is returned to the calling thread.

        Returns
        -------
        sliding_window : 2d numpy array of float values
            returns a copy of the current sliding window spectrogram
        """
        # if thread faster than producer, do not consume same chunk multiple times
        t = self.manager.tGroundTruth
        if t != self.lastProceededGroundTruth:
            frame = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]   # modulo avoids index under/overflow
            frame = np.frombuffer(frame, np.int16)  # np.fromstring is deprecated
            spectrogram = self.processorPipeline.process(frame)

            frame = spectrogram[0]
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)

            # update sliding window
            self.sliding_window[:, 0:-1] = self.sliding_window[:, 1:]
            self.sliding_window[:, -1] = frame

            self.lastProceededGroundTruth = t

        return self.sliding_window.copy()
Example 24
import numpy as np
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.filters import LogarithmicFilterbank
from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                      LogarithmicSpectrogramProcessor)
from madmom.processors import SequentialProcessor

# init signal processing
SAMPLE_RATE = 22050
FRAME_SIZE = 2048
FPS = 20

sig_proc = SignalProcessor(num_channels=1, sample_rate=SAMPLE_RATE)
fsig_proc = FramedSignalProcessor(frame_size=FRAME_SIZE,
                                  fps=FPS,
                                  origin='future')
spec_proc = FilteredSpectrogramProcessor(
    LogarithmicFilterbank, num_bands=16, fmin=30,
    fmax=6000)  # num_bands=24, fmin=30, fmax=8000
log_spec_proc = LogarithmicSpectrogramProcessor()
processor = SequentialProcessor(
    [sig_proc, fsig_proc, spec_proc, log_spec_proc])

colors = ['c', 'm', 'y']


def notes_to_onsets(notes, dt):
    """ Convert sequence of keys to onset frames """

    onsets = []
    for n in notes:
        onset = int(np.ceil(n[0] / dt))
        onsets.append(onset)

    return np.sort(np.asarray(onsets)).astype(np.float32)
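
A small usage sketch for notes_to_onsets, assuming each note is a (start_time, pitch) tuple and dt is the frame period (1.0 / FPS):

    notes = [(0.52, 60), (1.07, 64), (1.55, 67)]
    onset_frames = notes_to_onsets(notes, dt=1.0 / FPS)  # frame indices as float32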

class Preprocessor:
    def __init__(self,
                 spectrogram_path=None,
                 version=1,
                 test=False,
                 dump=False,
                 preprocessing=True,
                 sample_rate=32000,
                 silence_threshold=40):
        if version not in (1, 2):
            raise ValueError("version must be 1 or 2")
        self.version = version
        self.spectrogram_path = spectrogram_path
        self.sample_rate = sample_rate
        self.preprocessing = preprocessing
        self.test = test
        self.dump = dump
        self.silence_threshold = silence_threshold

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=self.sample_rate,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024,
                                          hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline)

    def __spectrogram_V1(self, signal, fft_window_size, hop_length,
                         log_spectrogram, n_mels, fmax):
        # compute stft
        stft = librosa.stft(signal,
                            n_fft=fft_window_size,
                            hop_length=hop_length,
                            win_length=None,
                            window='hann',
                            center=True,
                            pad_mode='reflect')
        # keep only magnitude
        stft = np.abs(stft)
        # spectrogram weighting
        if log_spectrogram:
            stft = np.log10(stft + 1)
        else:
            freqs = librosa.core.fft_frequencies(sr=self.sample_rate,
                                                 n_fft=fft_window_size)
            stft = librosa.perceptual_weighting(stft**2,
                                                freqs,
                                                ref=1.0,
                                                amin=1e-10,
                                                top_db=99.0)
        # apply mel filterbank
        spectrogram = librosa.feature.melspectrogram(S=stft,
                                                     sr=self.sample_rate,
                                                     n_mels=n_mels,
                                                     fmax=fmax)

        spectrogram = np.asarray(spectrogram)
        return spectrogram

    def __spectrogram_V2(self, signal):
        spectrogram = self.processor_version2.process(signal)
        return spectrogram

    def normalize_and_trim_silence(self, signal):
        # trim silence at beginning and end and normalize to -0.1
        signal_normalized = librosa.util.normalize(signal, norm=100)
        signal_normalized, index = librosa.effects.trim(
            signal_normalized, top_db=self.silence_threshold)
        return signal_normalized

    def compute_spectrogram(self, signal, file_name=None):
        if self.dump and file_name is None:
            raise ValueError("A file_name must be specified")

        if self.preprocessing:
            signal = self.normalize_and_trim_silence(signal)

        if self.version == 1:
            spectrogram = self.__spectrogram_V1(signal,
                                                fft_window_size=1024,
                                                hop_length=192,
                                                log_spectrogram=False,
                                                n_mels=128,
                                                fmax=None)
        else:
            spectrogram = self.__spectrogram_V2(signal)
            spectrogram = np.swapaxes(spectrogram, 0, 1)

        # plot spectrogram
        if self.test:
            print("Spectrogram Shape:", spectrogram.shape)
            plt.figure("General-Purpose ")
            plt.clf()
            plt.subplots_adjust(right=0.98, left=0.1, bottom=0.1, top=0.99)
            plt.imshow(spectrogram,
                       origin="lower",
                       interpolation="nearest",
                       cmap="viridis")
            plt.xlabel("%d frames" % spectrogram.shape[1])
            plt.ylabel("%d bins" % spectrogram.shape[0])
            plt.colorbar()
            plt.show(block=True)

        if self.dump:
            # save spectrograms
            if not os.path.exists(self.spectrogram_path):
                os.makedirs(self.spectrogram_path)
            spec_file = os.path.join(self.spectrogram_path, file_name)
            np.save(spec_file, spectrogram)

        return spectrogram

    def set_version(self, version):
        self.version = version

    def set_spectrogram_path(self, spectrogram_path):
        self.spectrogram_path = spectrogram_path
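
A minimal usage sketch for the Preprocessor above (the file name is an assumption; version=2 selects the madmom pipeline built in __init__):

    import librosa
    signal, _ = librosa.load('audio.wav', sr=32000, mono=True)
    prep = Preprocessor(version=2, sample_rate=32000)
    spec = prep.compute_spectrogram(signal)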