def spectrogram(samples, sample_rate, frame_len, fps, batch=50):
    """
    Computes a magnitude spectrogram for a given vector of samples at a given
    sample rate (in Hz), frame length (in samples) and frame rate (in Hz).
    Allows to transform multiple frames at once for improved performance (with
    a default value of 50, more is not always better). Returns a numpy array.

    Each frame is multiplied with a Hann window of `frame_len` samples before
    the transform; consecutive frames start `sample_rate // fps` samples
    apart. The result has one row per frame. If `samples` is shorter than a
    single frame, an empty array of shape (0, frame_len // 2 + 1) with the
    dtype of `samples` is returned.

    Frames are transformed `batch` at a time only if `samples` is
    C-contiguous and more than one frame fits a batch; otherwise every frame
    is transformed on its own.
    """
    if len(samples) < frame_len:
        return np.empty((0, frame_len // 2 + 1), dtype=samples.dtype)
    win = np.hanning(frame_len)
    # force an integer hop size: it is used as a range step and as a stride
    # multiplier below, neither of which accepts a float (e.g. for float fps)
    hopsize = int(sample_rate // fps)
    num_frames = max(0, (len(samples) - frame_len) // hopsize + 1)
    batch = min(batch, num_frames)
    if batch <= 1 or not samples.flags.c_contiguous:
        # one transform per frame
        rfft = rfft_builder(samples[:frame_len], n=frame_len)
        # np.vstack needs a real sequence; generators are not accepted by
        # current NumPy versions, so build a list
        spect = np.vstack([
            np.abs(rfft(samples[pos:pos + frame_len] * win))
            for pos in range(0, len(samples) - frame_len + 1, hopsize)])
    else:
        rfft = rfft_builder(np.empty((batch, frame_len), samples.dtype),
                            n=frame_len, threads=1)
        # zero-copy view of all (overlapping) frames, one frame per row
        frames = np.lib.stride_tricks.as_strided(
            samples, shape=(num_frames, frame_len),
            strides=(samples.strides[0] * hopsize, samples.strides[0]))
        spect = [np.abs(rfft(frames[pos:pos + batch] * win))
                 for pos in range(0, num_frames - batch + 1, batch)]
        if num_frames % batch:
            # frames that do not fill a complete batch are transformed one
            # by one, starting at the first sample not covered yet
            spect.extend(spectrogram(
                samples[(num_frames // batch * batch) * hopsize:],
                sample_rate, frame_len, fps, batch=1))
        spect = np.vstack(spect)
    return spect
def spectrogram_plans(frame_len, batch=48, dtype=np.float32):
    """
    Prepares everything spectrogram() needs ahead of time for a given frame
    length, batch size and dtype.

    Returns a 3-tuple: the plan for transforming a single frame, the plan for
    transforming a whole batch of frames, and a Hann window of `frame_len`
    samples cast to `dtype`.
    """
    # a single aligned buffer serves as the template for both plans: its
    # first row for the single-frame plan, the full buffer for the batch plan
    buffer = empty_aligned((batch, frame_len), dtype=dtype)
    window = np.hanning(frame_len).astype(dtype)
    single_plan = rfft_builder(buffer[0])
    batch_plan = rfft_builder(buffer)
    return (single_plan, batch_plan, window)
def __new__(cls, frames, window=np.hanning, fft_size=None,
            circular_shift=False, include_nyquist=False, fft_window=None,
            fftw=None, **kwargs):
    """
    Create a new instance holding the STFT of the given frames.

    `frames` may be an existing ShortTimeFourierTransform (its `frames`
    attribute is re-used), a FramedSignal, or anything FramedSignal accepts;
    in the latter case `**kwargs` are forwarded to FramedSignal.

    `window` is either a callable, which is invoked with the frame size to
    create the window, or a ready-made window (may be None). Unless
    `fft_window` is given explicitly, the window actually used for the FFT is
    derived from `window`: if `np.iinfo` accepts the dtype of the underlying
    signal, the window is divided by the maximum value of that dtype,
    otherwise it is used unchanged.

    `fft_size`, `circular_shift` and `include_nyquist` are passed on to
    `stft`. Note that a given `fftw` object is replaced whenever building a
    plan from the FFT window succeeds; it is only used as passed if that
    attempt raises an AttributeError.

    The returned array is a view of the STFT data as `cls`, with `frames`,
    `window`, `fft_window`, `fft_size`, `circular_shift` and
    `include_nyquist` set as attributes.
    """
    # pylint: disable=unused-argument
    if isinstance(frames, ShortTimeFourierTransform):
        # already a STFT, use the frames thereof
        frames = frames.frames
    # instantiate a FramedSignal if needed
    if not isinstance(frames, FramedSignal):
        frames = FramedSignal(frames, **kwargs)
    # size of the frames
    frame_size = frames.shape[1]
    if fft_window is None:
        # if a callable window function is given, use the frame size to
        # create a window of this size
        if hasattr(window, '__call__'):
            window = window(frame_size)
        # window used for FFT
        try:
            # if the signal is not scaled, scale the window accordingly
            # (np.iinfo raises a ValueError for non-integer dtypes, which
            # is handled by the outer except clause)
            max_range = float(np.iinfo(frames.signal.dtype).max)
            try:
                # scale the window by the max_range
                fft_window = window / max_range
            except TypeError:
                # if the window is None we can't scale it, thus create a
                # uniform window and scale it accordingly
                fft_window = np.ones(frame_size) / max_range
        except ValueError:
            # no scaling needed, use the window as is (can also be None)
            fft_window = window
    # use FFTW to speed up STFT
    try:
        # Note: use fft_window instead of a frame because it has already
        #       the correct dtype (frames are multiplied with this window)
        fftw = rfft_builder(fft_window, fft_size, axis=0)
    except AttributeError:
        # NOTE(review): presumably raised when fft_window is None; the
        # `fftw` argument is then used as given -- confirm
        pass
    # calculate the STFT
    data = stft(frames, fft_window, fft_size=fft_size,
                circular_shift=circular_shift,
                include_nyquist=include_nyquist, fftw=fftw)
    # cast as ShortTimeFourierTransform
    obj = np.asarray(data).view(cls)
    # save the other parameters
    obj.frames = frames
    obj.window = window
    obj.fft_window = fft_window
    # fall back to the frame size if no (or a zero) FFT size was given
    obj.fft_size = fft_size if fft_size else frame_size
    obj.circular_shift = circular_shift
    obj.include_nyquist = include_nyquist
    # return the object
    return obj
def filtered_stft(samples, frame_len, hop_size, filterbank):
    """
    Computes an STFT, applying a filterbank on the way to minimize memory use.

    Each frame of `frame_len` samples (consecutive frames start `hop_size`
    samples apart) is multiplied with a Hann window and transformed. The
    magnitude spectrum is cut to its first `len(filterbank)` bins and
    multiplied with `filterbank` right away, so the unfiltered spectrogram is
    never held in memory as a whole. Returns a numpy array with one row per
    frame.

    Raises a ValueError (from np.vstack) if `samples` does not contain a
    single complete frame.
    """
    window = np.hanning(frame_len)
    rfft = rfft_builder(samples[:frame_len], n=frame_len)
    num_bins = len(filterbank)
    # np.vstack needs a real sequence; generators are not accepted by
    # current NumPy versions, so build a list
    spect = np.vstack([
        np.dot(np.abs(rfft(samples[pos:pos + frame_len] * window))[:num_bins],
               filterbank)
        for pos in range(0, len(samples) - frame_len + 1, hop_size)])
    return spect
def spectrogram_partial(samples, sample_rate, frame_len, fps, save_input,
                        dump_path, batch=50):
    """
    Computes a spectrogram for a given vector of samples at a given sample
    rate (in Hz), frame length (in samples) and frame rate (in Hz). Allows to
    transform multiple frames at once for improved performance (with a default
    value of 50, more is not always better). Returns a numpy array.

    Each frame is multiplied with a Hann window of `frame_len` samples before
    the transform; consecutive frames start `sample_rate // fps` samples
    apart. The result has one row per frame.

    If `save_input` is true, the complex spectrogram is split into magnitudes
    and phases (via librosa.core.magphase), both are stored with np.savez
    under the names 'amp' and 'phases' in the directory `dump_path`, and the
    magnitudes are returned. Otherwise the complex spectrogram is returned as
    is and `dump_path` is not used.

    If `samples` is shorter than a single frame, an empty array of shape
    (0, frame_len // 2 + 1) with the dtype of `samples` is returned and
    nothing is saved.
    """
    if len(samples) < frame_len:
        return np.empty((0, frame_len // 2 + 1), dtype=samples.dtype)
    win = np.hanning(frame_len)
    # force an integer hop size: it is used as a range step and as a stride
    # multiplier below, neither of which accepts a float (e.g. for float fps)
    hopsize = int(sample_rate // fps)
    num_frames = max(0, (len(samples) - frame_len) // hopsize + 1)
    batch = min(batch, num_frames)
    # Note: the complex transform results are copied with np.array() before
    # being collected, because an FFT plan object may hand out its internal
    # output buffer, which would be overwritten by the next call (pyFFTW
    # documents this for its FFTW objects).
    if batch <= 1 or not samples.flags.c_contiguous:
        # one transform per frame
        rfft = rfft_builder(samples[:frame_len], n=frame_len)
        # np.vstack needs a real sequence; generators are not accepted by
        # current NumPy versions, so build a list
        spect = np.vstack([
            np.array(rfft(samples[pos:pos + frame_len] * win))
            for pos in range(0, len(samples) - frame_len + 1, hopsize)])
    else:
        rfft = rfft_builder(np.empty((batch, frame_len), samples.dtype),
                            n=frame_len, threads=1)
        # zero-copy view of all (overlapping) frames, one frame per row
        frames = np.lib.stride_tricks.as_strided(
            samples, shape=(num_frames, frame_len),
            strides=(samples.strides[0] * hopsize, samples.strides[0]))
        spect = [np.array(rfft(frames[pos:pos + batch] * win))
                 for pos in range(0, num_frames - batch + 1, batch)]
        # frames that do not fill a complete batch are transformed one by
        # one; this must yield complex spectra as well (not magnitudes),
        # otherwise the phase of these frames would be lost
        num_done = num_frames // batch * batch
        if num_done < num_frames:
            rfft_single = rfft_builder(samples[:frame_len], n=frame_len)
            spect.extend(np.array(rfft_single(frames[idx] * win))
                         for idx in range(num_done, num_frames))
        spect = np.vstack(spect)
    if save_input:
        # extract magnitude and phase from the input audio.
        # returns magnitude and phase arrays in polar form, so
        # spect = magnitudes * phases (element-wise); the phase alone can
        # also be obtained as np.exp(1j * np.angle(spect)).
        magnitudes, phases = librosa.core.magphase(spect.T)
        # saving all the phase information to be used while reconstructing
        # from saliency maps.
        # phases.shape: (d, t), hence transposed back before saving
        np.savez(os.path.join(dump_path, 'amp'), **{'amp': magnitudes.T})
        np.savez(os.path.join(dump_path, 'phases'), **{'phases': phases.T})
        # return the real-valued magnitudes instead of the complex spectra
        # to avoid a datatype mismatch on the caller's side
        spect = magnitudes.T
    return spect