def __init__(self, fs, hopsize_t):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import ParallelProcessor, SequentialProcessor
        # np (numpy) and EPSILON are module-level names in the source
        # from madmom.features.onsets import _cnn_onset_processor_pad

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        # process the multi-resolution spec in parallel
        multi = ParallelProcessor([])
        for frame_size in [2048, 1024, 4096]:
            frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
            stft = ShortTimeFourierTransformProcessor()  # caching FFT window
            filt = FilteredSpectrogramProcessor(
                filterbank=MelFilterbank, num_bands=80, fmin=27.5, fmax=16000,
                norm_filters=True, unique_filters=False)
            spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)
            # process each frame size with the spectrogram chain sequentially
            multi.append(SequentialProcessor([frames, stft, filt, spec]))
        # stack the features (in depth) and pad at beginning and end
        stack = np.dstack
        # pad = _cnn_onset_processor_pad
        # pre-processes everything sequentially
        pre_processor = SequentialProcessor([sig, multi, stack])
        # instantiate a SequentialProcessor
        super(MadmomMelbank3ChannelsProcessor, self).__init__([pre_processor])
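
A minimal usage sketch (not from the original repository) for the processor above: madmom processors are callable on file paths, and because the three parallel branches are stacked with np.dstack, the result should be a three-channel feature cube. The file name and constructor arguments are placeholders.

proc = MadmomMelbank3ChannelsProcessor(fs=44100, hopsize_t=0.01)
feats = proc('audio.wav')  # placeholder path; any audio file madmom can read
print(feats.shape)         # expected: (num_frames, 80, 3)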
Example #2
    def __init__(self, fs, hopsize_t):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor
        # from madmom.features.onsets import _cnn_onset_processor_pad

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        # process the multi-resolution spec in parallel
        frames = FramedSignalProcessor(frame_size=2048,
                                       hop_size=int(fs * hopsize_t))
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                            num_bands=80,
                                            fmin=27.5,
                                            fmax=16000,
                                            norm_filters=True,
                                            unique_filters=False)
        spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

        # chain the frame-wise processors sequentially
        single = SequentialProcessor([frames, stft, filt, spec])

        # pre-processes everything sequentially
        pre_processor = SequentialProcessor([sig, single])

        # instantiate a SequentialProcessor
        super(MadmomMelbankProcessor, self).__init__([pre_processor])
Example #3

    def __init__(self, fs, hopsize_t):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.filters import MelFilterbank
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.processors import SequentialProcessor

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=fs)
        frames = FramedSignalProcessor(frame_size=2048,
                                       hop_size=int(fs * hopsize_t))
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(filterbank=MelFilterbank,
                                            num_bands=80,
                                            fmin=27.5,
                                            fmax=16000,
                                            norm_filters=True,
                                            unique_filters=False)
        spec = LogarithmicSpectrogramProcessor(log=np.log, add=EPSILON)

        single = SequentialProcessor([frames, stft, filt, spec])

        pre_processor = SequentialProcessor([sig, single])

        super(MadmomMelbankProcessor, self).__init__([pre_processor])
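
The two constructors above differ from the three-channel variant only in using a single 2048-sample frame with an explicit hop size. A hedged usage sketch with placeholder values:

# hopsize_t is the hop duration in seconds: 0.01 s at fs=44100 gives
# int(44100 * 0.01) = 441 samples, i.e. 100 frames per second
proc = MadmomMelbankProcessor(fs=44100, hopsize_t=0.01)
melbank = proc('audio.wav')  # placeholder path; expected shape (num_frames, 80)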
Example #4
import numpy as np

from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.stft import ShortTimeFourierTransformProcessor
from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                      LogarithmicSpectrogramProcessor,
                                      SpectrogramDifferenceProcessor)
from madmom.processors import ParallelProcessor, SequentialProcessor


def CreateProcesser(fps=100):
    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    # process the multi-resolution spec & diff in parallel
    multi = ParallelProcessor([])
    frame_sizes = [1024, 2048, 4096]
    band_counts = [3, 6, 12]
    for frame_size, num_bands in zip(frame_sizes, band_counts):
        frames = FramedSignalProcessor(frame_size=frame_size, fps=fps)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=num_bands,
                                            fmin=30,
                                            fmax=17000,
                                            norm_filters=True)
        spec = LogarithmicSpectrogramProcessor(mul=1, add=1)
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=np.hstack)
        # process each frame size with spec and diff sequentially
        multi.append(SequentialProcessor((frames, stft, filt, spec, diff)))

    # stack the features and process everything sequentially
    pre_processor = SequentialProcessor((sig, multi, np.hstack))
    return pre_processor
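
A hedged usage sketch for CreateProcesser: each branch stacks the log spectrogram and its positive difference with np.hstack, and the three resolutions are concatenated again with np.hstack, so the output should be a single 2-D feature matrix. The file name is a placeholder.

pre_processor = CreateProcesser(fps=100)
features = pre_processor('audio.wav')  # 2-D: num_frames x (bands + diffs over all resolutions)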
Example #5
    def __init__(self, **kwargs):
        import numpy as np

        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor,
                                              SpectrogramDifferenceProcessor)
        from madmom.processors import SequentialProcessor, ParallelProcessor

        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=44100)
        # process the multi-resolution spec & diff in parallel
        multi = ParallelProcessor([])
        for frame_size in [4096]:
            frames = FramedSignalProcessor(frame_size=frame_size, fps=100)
            stft = ShortTimeFourierTransformProcessor(
                window=np.hamming(frame_size))  # caching FFT window
            filt = FilteredSpectrogramProcessor(num_bands=12,
                                                fmin=30,
                                                fmax=16000,
                                                norm_filters=True)
            spec = LogarithmicSpectrogramProcessor(mul=5, add=1)
            #diff = SpectrogramDifferenceProcessor(diff_ratio=0.5, positive_diffs=True, stack_diffs=np.hstack)
            # process each frame size with the spectrogram chain (diff disabled above)
            multi.append(SequentialProcessor((frames, stft, filt, spec)))
            #multi.append(SequentialProcessor((frames, stft, filt)))

        # stack the features and process everything sequentially
        pre_processor = SequentialProcessor((sig, multi, np.hstack))
        super(PianoNoteProcessor, self).__init__(pre_processor)
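
A usage sketch under the assumption that PianoNoteProcessor derives from madmom's SequentialProcessor (as the super() call suggests); the file name is a placeholder.

proc = PianoNoteProcessor()
spec = proc('piano.wav')  # placeholder path; log-scaled filtered spectrogram at 100 fps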
Example #6
    def spec_from_midi(midi_file):

        sig_proc = SignalProcessor(num_channels=1, sample_rate=spec_params["sample_rate"])
        fsig_proc = FramedSignalProcessor(frame_size=spec_params["frame_size"], fps=spec_params["fps"])
        spec_proc = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank, num_bands=12, fmin=60, fmax=6000,
                                                 norm_filters=True, unique_filters=False)
        log_proc = LogarithmicSpectrogramProcessor()
        processor = SequentialProcessor([sig_proc, fsig_proc, spec_proc, log_proc])

        # print(midi_file)
        if not os.path.isfile(midi_file.replace('.mid', '.wav')):
            # render audio file from midi
            render_audio(midi_file, sound_font=SOUND_FONT_PATH)

        # compute spectrogram
        audio_path = midi_file.replace('.mid', '.wav')

        # if the spectrogram doesn't exist it will be computed and stored
        if not os.path.isfile(midi_file.replace('.mid', '.spec.npy')):
            spec = processor.process(audio_path).T
            np.save(midi_file.replace('.mid', '.spec'), spec)
        else:
            spec = np.load(midi_file.replace('.mid', '.spec.npy'))

        return spec
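
A hedged usage sketch for spec_from_midi, assuming the function and the module-level spec_params, render_audio and SOUND_FONT_PATH it relies on are in scope; the MIDI path is a placeholder. The helper caches the result next to the MIDI file as a .spec.npy.

spec = spec_from_midi('example.mid')  # renders example.wav on the first call
print(spec.shape)                     # transposed: (num_bins, num_frames)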
Example #7
 def __init__(self,
              sample_rate=44100,
              filter_length=8192,
              hop_length=8820,
              win_length=None,
              num_bands=24,
              fmin=65,
              fmax=2100,
              unique_filters=True):
     super(LMLFSpectrogram, self).__init__()
     self.stft = STFT(filter_length, hop_length, win_length)
     # filterbank from madmom
     fname = 'lmlf.wav'
     sf.write(fname, np.random.uniform(-1, 1, 100000), sample_rate)
     _sig = SignalProcessor(num_channels=1, sample_rate=sample_rate)
     _frames = FramedSignalProcessor(frame_size=filter_length,
                                     fps=sample_rate / hop_length)
     _stft = ShortTimeFourierTransformProcessor()  # caching FFT window
     _spec = LogarithmicFilteredSpectrogramProcessor(
         num_bands=num_bands,
         fmin=fmin,
         fmax=fmax,
         unique_filters=unique_filters)
     _spec(_stft(_frames(_sig(fname))))  # run once so the processor builds its filterbank
     os.remove(fname)
     self.filterbank = torch.FloatTensor(np.asarray(_spec.filterbank))
Example #8

    def __init__(self,
                 spectrogram_path=None,
                 version=1,
                 test=False,
                 dump=False,
                 preprocessing=True,
                 sample_rate=32000,
                 silence_threshold=40):
        if version not in (1, 2):
            raise ValueError("version must be 1 or 2")
        self.version = version
        self.spectrogram_path = spectrogram_path
        self.sample_rate = sample_rate
        self.preprocessing = preprocessing
        self.test = test
        self.dump = dump
        self.silence_threshold = silence_threshold

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=self.sample_rate,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024,
                                          hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline)
Example #9
import numpy as np

from madmom.audio.signal import FramedSignalProcessor


def frame(wav):
    proc = FramedSignalProcessor(frame_size=4096, fps=100)
    frames = proc(wav)
    # at 100 fps every frame advances by 0.01 s
    frame_time = np.arange(frames.num_frames) * 0.01
    return frame_time
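
A usage sketch, assuming madmom is installed; the WAV path is a placeholder. FramedSignalProcessor accepts a madmom Signal (or a plain file path) directly.

from madmom.audio.signal import Signal

wav = Signal('audio.wav', num_channels=1)  # placeholder path
times = frame(wav)
print(times[:5])  # [0.   0.01 0.02 0.03 0.04]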
Example #10
def _make_preprocessor(settings, pad):
    import numpy as np

    from madmom.audio.spectrogram import (
        LogarithmicFilteredSpectrogramProcessor,
        SpectrogramDifferenceProcessor)
    from madmom.audio.filters import LogarithmicFilterbank
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.processors import SequentialProcessor

    sig = SignalProcessor(num_channels=1, sample_rate=settings['sample_rate'])
    frames = FramedSignalProcessor(frame_size=settings['frame_size'],
                                   fps=settings['fps'])
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    spec = LogarithmicFilteredSpectrogramProcessor(
        num_channels=1,
        sample_rate=settings['sample_rate'],
        filterbank=LogarithmicFilterbank,
        frame_size=settings['frame_size'],
        fps=settings['fps'],
        num_bands=settings['num_bands'],
        fmin=settings['fmin'],
        fmax=settings['fmax'],
        norm_filters=settings['norm_filters'])
    processors = [sig, frames, stft, spec]
    if settings['diff']:
        # stack the positive differences next to the spectrogram
        if settings.get('pad'):
            stack = _crnn_drum_processor_stack
        else:
            stack = np.hstack
        diff = SpectrogramDifferenceProcessor(diff_ratio=0.5,
                                              positive_diffs=True,
                                              stack_diffs=stack)
        processors.append(diff)
    # optionally pad the frames at both ends
    if pad > 0:
        processors.append(PadProcessor(pad))
    pre_processor = SequentialProcessor(processors)

    return pre_processor
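
A hypothetical settings sketch for the helper above; the keys mirror the ones _make_preprocessor reads, but the values are placeholders, not the original repository's configuration. With 'diff' False and pad 0, the chain avoids the module-level _crnn_drum_processor_stack and PadProcessor dependencies.

settings = {'sample_rate': 44100, 'frame_size': 2048, 'fps': 100,
            'num_bands': 12, 'fmin': 30, 'fmax': 17000,
            'norm_filters': True, 'diff': False}
pre_processor = _make_preprocessor(settings, pad=0)
features = pre_processor('audio.wav')  # placeholder path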
Example #11
    def __init__(self, hparams, dataset: FreeSoundAudioDataset):
        super(MadmomFeatureIteratorV2, self).__init__(hparams, dataset)

        if not isinstance(dataset, FreeSoundAudioDataset):
            raise TypeError("dataset should be a FreeSoundAudioDataset")

        sig_proc = SignalProcessor(num_channels=1,
                                   sample_rate=32000,
                                   norm=True)
        fsig_proc = FramedSignalProcessor(frame_size=1024,
                                          hop_size=128,
                                          origin='future')
        spec_proc = SpectrogramProcessor(frame_size=1024)
        filt_proc = LogarithmicFilteredSpectrogramProcessor(
            filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
        processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
        self.processor_version2 = SequentialProcessor(processor_pipeline2)
Example #12
from madmom.audio.filters import LogarithmicFilterbank
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                      LogarithmicSpectrogramProcessor)
from madmom.processors import SequentialProcessor


def spectrogram_processor(spec_params):
    """Helper function for our spectrogram extraction."""
    sig_proc = SignalProcessor(num_channels=1,
                               sample_rate=spec_params['sample_rate'])
    fsig_proc = FramedSignalProcessor(frame_size=spec_params['frame_size'],
                                      fps=spec_params['fps'])

    spec_proc = FilteredSpectrogramProcessor(filterbank=LogarithmicFilterbank,
                                             num_bands=12,
                                             fmin=60,
                                             fmax=6000,
                                             norm_filters=True,
                                             unique_filters=False)
    log_proc = LogarithmicSpectrogramProcessor()

    processor = SequentialProcessor([sig_proc, fsig_proc, spec_proc, log_proc])

    return processor
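
A short usage sketch with placeholder parameter values; the keys match the ones spectrogram_processor reads.

spec_params = {'sample_rate': 22050, 'frame_size': 2048, 'fps': 20}
processor = spectrogram_processor(spec_params)
spectrogram = processor('audio.wav')  # placeholder path; (num_frames, num_bins)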
Example #13
def build_cnn(madmom_processor_filename):
    from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
    from madmom.audio.stft import ShortTimeFourierTransformProcessor
    from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                          LogarithmicSpectrogramProcessor)
    from madmom.ml.nn import NeuralNetworkEnsemble
    from madmom.processors import SequentialProcessor
    # define pre-processing chain
    sig = SignalProcessor(num_channels=1, sample_rate=44100)
    frames = FramedSignalProcessor(frame_size=4096, hop_size=441 * 2)
    stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    filt = FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000)

    # this is the money param! it was not whitelisted in 'canonicalize_audio_options'!
    spec = LogarithmicSpectrogramProcessor(add=1)
    # pre-processes everything sequentially
    pre_processor = SequentialProcessor([
        sig, frames, stft, filt, spec, _cnn_pad
    ])
    # process the pre-processed signal with a NN
    nn = NeuralNetworkEnsemble.load([madmom_processor_filename])
    return SequentialProcessor([pre_processor, nn])
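
A hedged usage sketch for build_cnn; the ensemble file name is a placeholder and _cnn_pad must be defined at module level in the source repository.

cnn = build_cnn('onset_model.pkl')  # placeholder model file
activations = cnn('audio.wav')      # placeholder path; per-frame network output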
Example #14

    def __init__(self, sr=44100, **kwargs):
        from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
        from madmom.audio.stft import ShortTimeFourierTransformProcessor
        from madmom.audio.spectrogram import (FilteredSpectrogramProcessor,
                                              LogarithmicSpectrogramProcessor)
        from madmom.ml.nn import NeuralNetworkEnsemble
        from madmom.processors import SequentialProcessor
        sr_ratio = 44100 / sr
        # define pre-processing chain
        sig = SignalProcessor(num_channels=1, sample_rate=sr)
        frames = FramedSignalProcessor(frame_size=4096 // sr_ratio,
                                       fps=50 // sr_ratio)
        stft = ShortTimeFourierTransformProcessor()  # caching FFT window
        filt = FilteredSpectrogramProcessor(num_bands=24, fmin=30, fmax=10000)
        spec = LogarithmicSpectrogramProcessor(add=1)
        # pre-processes everything sequentially
        pre_processor = SequentialProcessor(
            (sig, frames, stft, filt, spec, _cnn_pad))
        # process the pre-processed signal with a NN
        nn = NeuralNetworkEnsemble.load(VIENNA_MODEL_PATH)
        # instantiate a SequentialProcessor
        super().__init__((pre_processor, nn))

        self.adsr = ADSRMaestro()
Example #15

class DcasePredictorProvider(PredictorContract):
    """
    Implementation of a PredictorContract. This class
    makes predictions where spectrograms are considered
    as inputs and a convolutional neural network produces
    class probabilities.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with stft based on framed signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        creates pipeline of elements of type madmom.Processor
    classes : list of str
        class list
    device : str
        indicates the processor to be used for neural network prediction
    prediction_model : baseline_net.Net
        holds a reference to the CNN architecture
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    slidingWindowThread:
        reference pointing to the sliding window thread
    predictionThread:
        reference pointing to the prediction thread

    Methods
    -------
    start()
       starts all necessary sub tasks of this predictor.
    stop()
       stops all necessary sub tasks of this predictor.
    computeSpectrogram()
       compute a spectrogram based on the most current audio chunk.
    predict()
       CNN prediction based on current spectrogram input.
    """
    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor([sig_proc, fsig_proc, spec_proc, filt_proc])

    classes = ["Acoustic_guitar", "Applause", "Bark", "Bass_drum", "Burping_or_eructation", "Bus", "Cello", "Chime",
               "Clarinet", "Computer_keyboard", "Cough", "Cowbell", "Double_bass", "Drawer_open_or_close",
               "Electric_piano",
               "Fart", "Finger_snapping", "Fireworks", "Flute", "Glockenspiel", "Gong", "Gunshot_or_gunfire",
               "Harmonica",
               "Hi-hat", "Keys_jangling", "Knock", "Laughter", "Meow", "Microwave_oven", "Oboe", "Saxophone",
               "Scissors",
               "Shatter", "Snare_drum", "Squeak", "Tambourine", "Tearing", "Telephone", "Trumpet",
               "Violin_or_fiddle",
               "Writing"]

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def __init__(self, condition):
        """
        Parameters
        ----------
        prediction_model : baseline_net.Net
           holds a reference to the CNN architecture
        sliding_window : 2d numpy array
           cache for previously calculated spectrograms
        lastProceededGroundTruth : int
           variable to keep track of the last processed audio chunk
        """
        # load model with its tuned weight parameters
        self.prediction_model = Net()
        self.prediction_model.load_state_dict(
            torch.load(os.path.join(PROJECT_ROOT,
                                    'server/consumer/predictors/dcase_predictor_provider/baseline_net.pt'),
                       map_location=lambda storage, location: storage))
        self.prediction_model.to(self.device)
        self.prediction_model.eval()

        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Start all sub tasks necessary for continuous prediction.
        """
        self.slidingWindowThread = SlidingWindowThread(self)
        self.predictionThread = PredictionThread(self)
        self.slidingWindowThread.start()
        self.predictionThread.start()

    def stop(self):
        """Stops all sub tasks
        """
        self.slidingWindowThread.join()
        self.predictionThread.join()

    def computeSpectrogram(self):
        """This methods first access the global time variable ``tGroundTruth``
        and reads audio chunk the time variable points to. Afterwards, the defined
        madmom pipeline is processed to get the spectrogram representation of the
        single chunk. Finally, the sliding window is updated with the new audio chunk.
        """

        t = self.manager.tGroundTruth
        # if thread faster than producer, do not consume same chunk multiple times
        if t != self.lastProceededGroundTruth:
            frame = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]   # modulo avoids index under/overflow
            frame = np.frombuffer(frame, dtype=np.int16)  # np.fromstring is deprecated
            spectrogram = self.processorPipeline.process(frame)

            frame = spectrogram[0]
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)

            # update sliding window
            self.sliding_window[:, :-1] = self.sliding_window[:, 1:]
            self.sliding_window[:, -1] = frame

            self.lastProceededGroundTruth = t

    def predict(self):
        """ This method executes the actual prediction task based on the
        currently available slinding window. The sliding window is sent
        into the CNN model and the correpsonding softmax output for the
        respecive classes are returned

        Returns
        -------
        probs : array of list objects
            an array of number of classes entries where each entry consists of
            the class name, its predicted probability and a position index.
            Example:
            ``[["class1", 0.0006955251446925104, 0], ["class2", 0.0032770668622106314, 1], ...]``
        """

        net_input = self.sliding_window[np.newaxis, np.newaxis]  # avoid shadowing builtin 'input'
        cuda_torch_input = torch.from_numpy(net_input).to(self.device)
        model_output = self.prediction_model(cuda_torch_input)  # prediction by model
        softmax = nn.Softmax(dim=1)
        softmax_output = softmax(model_output)
        predicts = softmax_output.cpu().detach().numpy().flatten()
        probs = [[elem, predicts[index].item(), index] for index, elem in enumerate(self.classes)]
        return probs
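
A standalone sketch of the sliding-window update performed in computeSpectrogram above, with a random column standing in for a freshly computed spectrogram frame; the (128, 256) shape mirrors the cache initialised in __init__.

import numpy as np

window = np.zeros((128, 256), dtype=np.float32)     # same shape as sliding_window
new_frame = np.random.rand(128).astype(np.float32)  # stand-in for spectrogram[0]
window[:, :-1] = window[:, 1:]                      # drop the oldest column
window[:, -1] = new_frame                           # append the newest frame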
Example #16
                stft = librosa.perceptual_weighting(stft**2, freqs, ref=1.0, amin=1e-10, top_db=99.0)

            # apply mel filterbank
            spectrogram = librosa.feature.melspectrogram(S=stft, sr=sr, n_mels=n_mels, fmax=fmax)

            # keep spectrogram
            spectrograms.append(np.asarray(spectrogram))

        spectrograms = np.asarray(spectrograms)

        return spectrograms

processor_version1 = LibrosaProcessor()

sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
spec_proc = SpectrogramProcessor(frame_size=1024)
filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
processor_pipeline2 = [sig_proc, fsig_proc, spec_proc, filt_proc]
processor_version2 = SequentialProcessor(processor_pipeline2)


if __name__ == "__main__":
    """ main """

    # add argument parser
    parser = argparse.ArgumentParser(description='Pre-compute spectrograms for training and testing.')
    parser.add_argument('--audio_path', help='path to audio files.')
    parser.add_argument('--spec_path', help='path where to store spectrograms.')
    parser.add_argument('--show', help='show spectrogram plots.', type=int, default=None)
    parser.add_argument('--dump', help='dump spectrograms.', action='store_true')
Example #17

class MadmomSpectrogramProvider(VisualisationContract):
    """
    Implementation of a VisualisationContract. This class
    computes new spectrograms based on the most current
    audio chunks which is indicated via ``tGroundTruth``.

    Attributes
    ----------
    sig_proc : madmom.Processor
        processor which outputs sampled audio signals
    fsig_proc : madmom.Processor
        processor which produces overlapping frames based on sampled signals
    spec_proc : madmom.Processor
        processor which computes a spectrogram with stft based on framed signals
    filt_proc : madmom.Processor
        processor which filters and scales a spectrogram
    processorPipeline : SequentialProcessor
        creates pipeline of elements of type madmom.Processor
    sliding_window : 2d numpy array
        cache for previously calculated spectrograms
    lastProceededGroundTruth : int
        variable to keep track of the last processed audio chunk
    visThread:
        reference pointing to the sliding window thread

    Methods
    -------
    start()
       starts all necessary sub tasks of this visualizer.
    stop()
       stops all necessary sub tasks of this visualizer.
    computeSpectrogram()
       compute a spectrogram based on the most current audio chunk.
    """

    # madmom pipeline for spectrogram calculation
    sig_proc = SignalProcessor(num_channels=1, sample_rate=32000, norm=True)
    fsig_proc = FramedSignalProcessor(frame_size=1024, hop_size=128, origin='future')
    spec_proc = SpectrogramProcessor(frame_size=1024)
    filt_proc = LogarithmicFilteredSpectrogramProcessor(filterbank=LogFilterbank, num_bands=26, fmin=20, fmax=14000)
    processorPipeline = SequentialProcessor([sig_proc, fsig_proc, spec_proc, filt_proc])

    def __init__(self, condition):
        """
        Parameters
        ----------
        sliding_window : 2d numpy array
           cache for previously calculated spectrograms
        lastProceededGroundTruth : int
           variable to keep track of the last processed audio chunk
        """

        # sliding window as cache
        self.sliding_window = np.zeros((128, 256), dtype=np.float32)
        self.lastProceededGroundTruth = None
        self.condition = condition

    def start(self):
        """Start all sub tasks necessary for continuous spectrograms.
        """
        self.visThread = VisualisationThread(self)
        self.visThread.start()

    def stop(self):
        """Stops all sub tasks
        """
        self.visThread.join()

    def computeSpectrogram(self):
        """This methods first access the global time variable ``tGroundTruth``
        and reads audio chunk the time variable points to. Afterwards, the defined
        madmom pipeline is processed to get the spectrogram representation of the
        single chunk. Finally, the sliding window is updated with the new audio chunk
        and a copy of the sliding window is returned to the calling thread.

        Returns
        -------
        sliding_window : 2d numpy array of float values
            returns a copy of the current sliding window spectrogram
        """
        # if thread faster than producer, do not consume same chunk multiple times
        t = self.manager.tGroundTruth
        if t != self.lastProceededGroundTruth:
            frame = self.manager.sharedMemory[(t - 1) % BUFFER_SIZE]   # modulo avoids index under/overflow
            frame = np.frombuffer(frame, dtype=np.int16)  # np.fromstring is deprecated
            spectrogram = self.processorPipeline.process(frame)

            frame = spectrogram[0]
            if np.any(np.isnan(frame)):
                frame = np.zeros_like(frame, dtype=np.float32)

            # update sliding window
            self.sliding_window[:, :-1] = self.sliding_window[:, 1:]
            self.sliding_window[:, -1] = frame

            self.lastProceededGroundTruth = t

        return self.sliding_window.copy()
Example #18
import numpy as np

import madmom.utils.midi as mm_midi
# import madmom.utils.midi_old as mm_midi
from madmom.audio.signal import SignalProcessor, FramedSignalProcessor
from madmom.audio.filters import LogarithmicFilterbank
from madmom.audio.spectrogram import FilteredSpectrogramProcessor, LogarithmicSpectrogramProcessor
from madmom.processors import SequentialProcessor

# init signal processing
SAMPLE_RATE = 22050
FRAME_SIZE = 2048
FPS = 20

sig_proc = SignalProcessor(num_channels=1, sample_rate=SAMPLE_RATE)
fsig_proc = FramedSignalProcessor(frame_size=FRAME_SIZE,
                                  fps=FPS,
                                  origin='future')
spec_proc = FilteredSpectrogramProcessor(
    LogarithmicFilterbank, num_bands=16, fmin=30,
    fmax=6000)  # num_bands=24, fmin=30, fmax=8000
log_spec_proc = LogarithmicSpectrogramProcessor()
processor = SequentialProcessor(
    [sig_proc, fsig_proc, spec_proc, log_spec_proc])

colors = ['c', 'm', 'y']


def notes_to_onsets(notes, dt):
    """ Convert sequence of keys to onset frames """

    # assumption: each note is a sequence whose first entry is the onset time
    # in seconds; quantise it to a frame index at resolution dt
    onsets = []
    for note in notes:
        onsets.append(int(np.round(note[0] / dt)))
    return np.unique(onsets)
Example #19
    num_bands = 24
    fmin = 65
    fmax = 2100

    # torch
    torch_lmlf = LMLFSpectrogram(sample_rate=sr,
                                 filter_length=filter_length,
                                 hop_length=hop_length,
                                 num_bands=num_bands,
                                 fmin=fmin,
                                 fmax=fmax)
    lmlf = torch_lmlf(real_wave.unsqueeze(0))

    # madmom
    _sig = SignalProcessor(num_channels=1, sample_rate=sr)
    _frames = FramedSignalProcessor(frame_size=filter_length,
                                    fps=sr / hop_length)
    _stft = ShortTimeFourierTransformProcessor()  # caching FFT window
    _spec = LogarithmicFilteredSpectrogramProcessor(num_bands=num_bands,
                                                    fmin=fmin,
                                                    fmax=fmax)
    sig = _sig(librosa.util.example_audio_file())
    frames = _frames(sig)
    stft = _stft(frames)
    spec = _spec(stft)

    diff = np.mean(np.abs(lmlf.squeeze(0).numpy() - spec))
    print('===== log-magnitude log-frequency spectrogram =====')
    print('mean difference between outputs from torch and madmom : ', diff)
    print('shape : ', lmlf.shape)

    # ! Mel-spectrogram