示例#1
0
def spectrogram(samples, sample_rate, frame_len, fps, batch=50):
    """
    Compute a magnitude spectrogram of a vector of samples.

    Frames of `frame_len` samples are taken every `sample_rate // fps`
    samples, multiplied with a Hann window and transformed with a real FFT;
    the magnitudes of the resulting bins form one row per frame.

    Parameters
    ----------
    samples : numpy array
        The signal, indexed along its first axis.
    sample_rate : number
        Sample rate in Hz.
    frame_len : int
        Frame length in samples.
    fps : number
        Frame rate in Hz.
    batch : int, optional
        Number of frames to transform at once for improved performance
        (default 50, more is not always better). The batched code path is
        only used for C-contiguous input.

    Returns
    -------
    numpy array
        One row per frame. If `samples` is shorter than `frame_len`, an
        empty array of shape (0, frame_len // 2 + 1) is returned.
    """
    if len(samples) < frame_len:
        return np.empty((0, frame_len // 2 + 1), dtype=samples.dtype)
    win = np.hanning(frame_len)
    # Cast once: a float sample rate yields a float hop size, which is not
    # usable for strides, range() or slicing further down.
    hopsize = int(sample_rate // fps)
    num_frames = max(0, (len(samples) - frame_len) // hopsize + 1)
    batch = min(batch, num_frames)
    if batch <= 1 or not samples.flags.c_contiguous:
        # Transform frame by frame.
        rfft = rfft_builder(samples[:frame_len], n=frame_len)
        spect = [np.abs(rfft(samples[pos:pos + frame_len] * win))
                 for pos in range(0, len(samples) - frame_len + 1, hopsize)]
    else:
        # Transform `batch` frames per FFT call on a strided view of the
        # signal in which row i starts at sample i * hopsize.
        rfft = rfft_builder(np.empty((batch, frame_len), samples.dtype),
                            n=frame_len, threads=1)
        frames = np.lib.stride_tricks.as_strided(
                samples, shape=(num_frames, frame_len),
                strides=(samples.strides[0] * hopsize, samples.strides[0]))
        spect = [np.abs(rfft(frames[pos:pos + batch] * win))
                 for pos in range(0, num_frames - batch + 1, batch)]
        batched_frames = num_frames // batch * batch
        if batched_frames < num_frames:
            # The frames that do not fill a complete batch are transformed
            # one at a time.
            spect.append(spectrogram(
                    samples[batched_frames * hopsize:],
                    sample_rate, frame_len, fps, batch=1))
    # np.vstack needs a real sequence; a generator is rejected by current
    # NumPy releases.
    return np.vstack(spect)
示例#2
0
def spectrogram_plans(frame_len, batch=48, dtype=np.float32):
    """
    Prepare reusable transform plans and a window for spectrogram().

    A buffer of shape (batch, frame_len) and the given dtype is allocated
    with empty_aligned(); one plan is built from its first row (single
    spectrum) and one from the whole buffer (batch of spectra).

    Returns the tuple (single plan, batch plan, window), where the window is
    a Hann window of `frame_len` points cast to `dtype`.
    """
    buffer = empty_aligned((batch, frame_len), dtype=dtype)
    window = np.hanning(frame_len).astype(dtype)
    single_plan = rfft_builder(buffer[0])
    batch_plan = rfft_builder(buffer)
    return single_plan, batch_plan, window
示例#3
0
文件: stft.py 项目: Alewep/REETM
    def __new__(cls, frames, window=np.hanning, fft_size=None,
                circular_shift=False, include_nyquist=False, fft_window=None,
                fftw=None, **kwargs):
        """
        Compute the STFT of `frames` and return it as an array viewed as
        `cls`.

        Parameters
        ----------
        frames : ShortTimeFourierTransform, FramedSignal or other
            If a ShortTimeFourierTransform is given, its `frames` attribute
            is reused. Anything that is not a FramedSignal is wrapped with
            ``FramedSignal(frames, **kwargs)``.
        window : callable, array or None, optional
            Window (function). A callable is called with the frame size to
            create the window. Only consulted if `fft_window` is None.
        fft_size : int, optional
            Passed on to rfft_builder() and stft(). If falsy, the frame size
            is stored as `fft_size` on the returned object.
        circular_shift : bool, optional
            Passed on to stft() unchanged.
        include_nyquist : bool, optional
            Passed on to stft() unchanged.
        fft_window : array, optional
            Window passed to stft(). If None, it is derived from `window`
            (see the inline comments below).
        fftw : object, optional
            Passed on to stft().
            NOTE(review): this argument is overwritten whenever the
            rfft_builder() call below succeeds; it is only kept if that call
            raises AttributeError -- confirm this is intended.
        kwargs : dict, optional
            Only used when a FramedSignal has to be created from `frames`.

        Returns
        -------
        Instance of `cls` wrapping the data returned by stft(), with the
        attributes `frames`, `window`, `fft_window`, `fft_size`,
        `circular_shift` and `include_nyquist` set.
        """
        # pylint: disable=unused-argument
        if isinstance(frames, ShortTimeFourierTransform):
            # already a STFT, use the frames thereof
            frames = frames.frames
        # instantiate a FramedSignal if needed
        if not isinstance(frames, FramedSignal):
            frames = FramedSignal(frames, **kwargs)

        # size of the frames
        frame_size = frames.shape[1]

        if fft_window is None:
            # if a callable window function is given, use the frame size to
            # create a window of this size
            if hasattr(window, '__call__'):
                window = window(frame_size)
            # window used for FFT
            try:
                # if the signal is not scaled, scale the window accordingly;
                # np.iinfo raising ValueError (presumably because the signal
                # dtype is not an integer type) selects the unscaled branch
                max_range = float(np.iinfo(frames.signal.dtype).max)
                try:
                    # scale the window by the max_range
                    fft_window = window / max_range
                except TypeError:
                    # if the window is None we can't scale it, thus create a
                    # uniform window and scale it accordingly
                    fft_window = np.ones(frame_size) / max_range
            except ValueError:
                # no scaling needed, use the window as is (can also be None)
                fft_window = window

        # use FFTW to speed up STFT
        try:
            # Note: use fft_window instead of a frame because it has already
            #       the correct dtype (frames are multiplied with this window)
            fftw = rfft_builder(fft_window, fft_size, axis=0)
        except AttributeError:
            # building the plan failed (presumably because fft_window is
            # None); keep whatever was passed in as `fftw`
            pass
        # calculate the STFT
        data = stft(frames, fft_window, fft_size=fft_size,
                    circular_shift=circular_shift,
                    include_nyquist=include_nyquist, fftw=fftw)

        # cast as ShortTimeFourierTransform
        obj = np.asarray(data).view(cls)
        # save the other parameters
        obj.frames = frames
        obj.window = window
        obj.fft_window = fft_window
        # fall back to the frame size if no (or a falsy) FFT size was given
        obj.fft_size = fft_size if fft_size else frame_size
        obj.circular_shift = circular_shift
        obj.include_nyquist = include_nyquist
        # return the object
        return obj
示例#4
0
def filtered_stft(samples, frame_len, hop_size, filterbank):
    """
    Compute a filtered magnitude STFT, one frame at a time.

    Each frame of `frame_len` samples (taken every `hop_size` samples) is
    multiplied with a Hann window and transformed with a real FFT. Its
    magnitudes are cut to the first ``len(filterbank)`` bins and multiplied
    with `filterbank` right away, so the unfiltered spectrogram is never held
    in memory as a whole.

    Parameters
    ----------
    samples : numpy array
        The signal, indexed along its first axis.
    frame_len : int
        Frame length in samples.
    hop_size : int
        Distance between the starts of consecutive frames, in samples.
    filterbank : array-like
        Matrix applied to the magnitudes via np.dot(); its first dimension
        determines how many bins of each spectrum are used.

    Returns
    -------
    numpy array
        One row per frame. If `samples` is shorter than `frame_len`, an
        empty array with zero rows (and one column per filterbank column) is
        returned instead of failing on an empty stack.
    """
    if len(samples) < frame_len:
        bank_shape = np.shape(filterbank)
        num_bands = bank_shape[1] if len(bank_shape) > 1 else 1
        return np.empty((0, num_bands))
    window = np.hanning(frame_len)
    rfft = rfft_builder(samples[:frame_len], n=frame_len)
    num_bins = len(filterbank)
    # np.vstack needs a real sequence; a generator is rejected by current
    # NumPy releases.
    rows = [
        np.dot(np.abs(rfft(samples[pos:pos + frame_len] * window))[:num_bins],
               filterbank)
        for pos in range(0, len(samples) - frame_len + 1, hop_size)
    ]
    return np.vstack(rows)
示例#5
0
def spectrogram_partial(samples,
                        sample_rate,
                        frame_len,
                        fps,
                        save_input,
                        dump_path,
                        batch=50):
    """
    Compute a magnitude spectrogram and optionally dump magnitudes and phases.

    Frames of `frame_len` samples are taken every `sample_rate // fps`
    samples, multiplied with a Hann window and transformed with a real FFT.

    Parameters
    ----------
    samples : numpy array
        The signal, indexed along its first axis.
    sample_rate : number
        Sample rate in Hz.
    frame_len : int
        Frame length in samples.
    fps : number
        Frame rate in Hz.
    save_input : bool
        If true, the magnitudes are written to ``<dump_path>/amp.npz`` (key
        'amp') and the phases to ``<dump_path>/phases.npz`` (key 'phases'),
        both with one row per frame, so that the input can be reconstructed
        later.
    dump_path : str
        Directory the two files are written to.
    batch : int, optional
        Number of frames to transform at once for improved performance
        (default 50, more is not always better).

    Returns
    -------
    numpy array
        Magnitudes, one row per frame. If `samples` is shorter than
        `frame_len`, an empty array of shape (0, frame_len // 2 + 1) is
        returned and nothing is written.
    """
    if len(samples) < frame_len:
        return np.empty((0, frame_len // 2 + 1), dtype=samples.dtype)
    # Cast once: a float sample rate yields a float hop size, which is not
    # usable for strides, range() or slicing.
    hopsize = int(sample_rate // fps)
    spect = _complex_spectrogram(samples, frame_len, hopsize, batch)

    # Always return real-valued magnitudes, whichever code path produced the
    # complex spectra and whether or not anything is saved.
    magnitudes = np.abs(spect)
    if save_input:
        # magphase() returns magnitude and unit-modulus phase factors such
        # that spect = magnitudes * phases (element-wise).
        _, phases = librosa.core.magphase(spect.T)
        # Save magnitudes and phases to be used while reconstructing from
        # saliency maps; both are stored as (frames, bins).
        np.savez(os.path.join(dump_path, 'amp'), amp=magnitudes)
        np.savez(os.path.join(dump_path, 'phases'), phases=phases.T)
    return magnitudes


def _complex_spectrogram(samples, frame_len, hopsize, batch):
    """Return the complex Hann-windowed STFT of `samples`, one row per frame."""
    win = np.hanning(frame_len)
    num_frames = max(0, (len(samples) - frame_len) // hopsize + 1)
    batch = min(batch, num_frames)
    # Each transform result is copied with np.array(): an FFT plan may hand
    # back its internal output buffer, which the next call would overwrite
    # (NOTE(review): true for pyFFTW builder objects -- confirm that is what
    # rfft_builder is).
    if batch <= 1 or not samples.flags.c_contiguous:
        # Transform frame by frame.
        rfft = rfft_builder(samples[:frame_len], n=frame_len)
        return np.vstack(
            [np.array(rfft(samples[pos:pos + frame_len] * win))
             for pos in range(0, len(samples) - frame_len + 1, hopsize)])

    # Transform `batch` frames per FFT call on a strided view of the signal
    # in which row i starts at sample i * hopsize.
    rfft = rfft_builder(np.empty((batch, frame_len), samples.dtype),
                        n=frame_len,
                        threads=1)
    frames = np.lib.stride_tricks.as_strided(
        samples,
        shape=(num_frames, frame_len),
        strides=(samples.strides[0] * hopsize, samples.strides[0]))
    chunks = [np.array(rfft(frames[pos:pos + batch] * win))
              for pos in range(0, num_frames - batch + 1, batch)]
    batched_frames = num_frames // batch * batch
    if batched_frames < num_frames:
        # Frames that do not fill a complete batch are transformed one at a
        # time; they stay complex so that their phase is not lost.
        chunks.append(
            _complex_spectrogram(samples[batched_frames * hopsize:],
                                 frame_len, hopsize, 1))
    return np.vstack(chunks)