def resample(y, orig_sr, target_sr, res_type='kaiser_best', fix=True, scale=False, **kwargs):
    """Resample an audio buffer from ``orig_sr`` to ``target_sr``.

    Parameters
    ----------
    y : np.ndarray
        Audio buffer accepted by ``util.valid_audio(y, mono=False)``.
        Resampling is applied along the last axis.
    orig_sr : number
        Sampling rate of ``y``.
    target_sr : number
        Sampling rate of the output.
    res_type : str
        ``'scipy'`` selects `scipy.signal.resample`.  Any other value is
        handed to `resampy.resample` as its ``filter`` argument.
    fix : bool
        When true, the output is passed through `util.fix_length` so that it
        holds exactly ``ceil(y.shape[-1] * target_sr / orig_sr)`` samples.
    scale : bool
        When true, the output is divided by ``sqrt(target_sr / orig_sr)``.
    kwargs
        Forwarded to `util.fix_length`; only used when ``fix`` is true.

    Returns
    -------
    y_hat : np.ndarray
        C-contiguous array with the dtype of ``y``.  If the two rates compare
        equal, ``y`` itself is returned without copying.
    """
    util.valid_audio(y, mono=False)

    # Nothing to do when the rates already agree
    if orig_sr == target_sr:
        return y

    ratio = float(target_sr) / orig_sr
    n_samples = int(np.ceil(y.shape[-1] * ratio))

    if res_type != 'scipy':
        y_hat = resampy.resample(y, orig_sr, target_sr,
                                 filter=res_type, axis=-1)
    else:
        y_hat = scipy.signal.resample(y, n_samples, axis=-1)

    if fix:
        y_hat = util.fix_length(y_hat, n_samples, **kwargs)

    if scale:
        y_hat /= np.sqrt(ratio)

    return np.ascontiguousarray(y_hat, dtype=y.dtype)
def to_mono(y):
    """Average a multi-channel audio buffer down to a single channel.

    Parameters
    ----------
    y : np.ndarray
        Audio buffer accepted by ``util.valid_audio(y, mono=False)``.

    Returns
    -------
    y_mono : np.ndarray
        ``y`` unchanged when it has at most one dimension, otherwise the mean
        of ``y`` taken over axis 0.
    """
    # Multi-channel input is acceptable here, hence mono=False
    util.valid_audio(y, mono=False)

    if y.ndim <= 1:
        return y

    return np.mean(y, axis=0)
def get_features(self, y, sample_rate):
    """Down-mix, resample, augment and validate a waveform.

    Parameters
    ----------
    y : np.ndarray
        Waveform indexed as ``(sample, channel)``: channels are averaged over
        axis 1 and ``y[:, 0]`` is treated as a mono signal.
    sample_rate : number
        Sampling rate of ``y``.

    Returns
    -------
    y : np.ndarray
        The processed waveform, still indexed as ``(sample, channel)``.

    Raises
    ------
    ValueError
        If the first channel of the processed waveform is rejected by
        ``valid_audio``.  The underlying ``ParameterError`` is attached as
        ``__cause__``.
    """
    # convert to mono
    if self.mono:
        y = np.mean(y, axis=1, keepdims=True)

    # resample if sample rates mismatch
    target_sr = self.sample_rate
    if target_sr is not None and target_sr != sample_rate:
        # Sampling rates are passed by keyword: recent librosa releases make
        # them keyword-only, and older releases accept the same names.
        if y.shape[1] == 1:
            # librosa expects mono audio to be of shape (n,), but we have (n, 1).
            y = librosa.core.resample(
                y[:, 0], orig_sr=sample_rate, target_sr=target_sr)[:, None]
        else:
            y = librosa.core.resample(
                y.T, orig_sr=sample_rate, target_sr=target_sr).T
        sample_rate = target_sr

    # augment data
    if self.augmentation is not None:
        y = self.augmentation(y, sample_rate)

    # TODO: how time consuming is this thing (needs profiling...)
    try:
        valid_audio(y[:, 0], mono=True)
    except ParameterError as e:
        msg = "Something went wrong when augmenting waveform."
        # Chain the cause so the actual validation failure stays visible
        raise ValueError(msg) from e

    return y
def libstft(y, fs, n_fft=2048, hop_length=None, win_length=None, window='hann',
            center=None, dtype=np.complex64, pad_mode='reflect'):
    """Short-time Fourier transform that also returns the bin frequencies.

    Parameters
    ----------
    y : np.ndarray
        Input signal; checked with ``util.valid_audio``.
    fs : number
        Sampling rate of ``y``.  Only used to scale the returned frequency
        axis.
    n_fft : int > 0
        FFT size.  Every frame holds ``n_fft`` samples.
    hop_length : int > 0 or None
        Samples between successive frames.  Defaults to ``win_length // 4``.
    win_length : int <= n_fft or None
        Length of the analysis window before it is centre-padded to
        ``n_fft``.  Defaults to ``n_fft``.
    window
        Window specification handed to ``get_window``.
    center : bool or None
        If truthy, ``y`` is padded with ``n_fft // 2`` samples using
        ``pad_mode`` before framing.  The default (``None``) does not pad.
    dtype : numeric type
        dtype of the returned STFT matrix.
    pad_mode : str
        ``np.pad`` mode used when ``center`` is truthy.

    Returns
    -------
    stft_matrix : np.ndarray [shape=(1 + n_fft // 2, t), dtype=dtype]
        STFT of ``y``.
    f : np.ndarray [shape=(1 + n_fft // 2,)]
        Evenly spaced frequencies from 0 to ``fs / 2`` inclusive, one per row
        of ``stft_matrix``.
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    fft_window = get_window(window, win_length, fftbins=True)

    # Pad the window out to n_fft size, then make it broadcastable over frames
    fft_window = util.pad_center(fft_window, n_fft)
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Frames must be n_fft long to match the padded window; framing with
    # win_length made the product below fail whenever win_length != n_fft.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    n_bins = int(1 + n_fft // 2)
    stft_matrix = np.empty((n_bins, y_frames.shape[1]),
                           dtype=dtype, order='F')

    # How many columns fit within MAX_MEM_BLOCK?  Always process at least
    # one, otherwise range() below is given a zero step for very large n_fft.
    n_columns = int(util.MAX_MEM_BLOCK / (n_bins * stft_matrix.itemsize))
    n_columns = max(n_columns, 1)

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
        stft_matrix[:, bl_s:bl_t] = fft.fft(
            fft_window * y_frames[:, bl_s:bl_t], axis=0)[:n_bins]

    f = np.linspace(0, np.pi, n_bins, endpoint=True) * fs / np.pi / 2

    return stft_matrix, f
def zero_crossing_rate(y, frame_length=2048, hop_length=512, center=True, **kwargs):
    """Compute the per-frame mean of ``zero_crossings`` over an audio signal.

    Parameters
    ----------
    y : np.ndarray
        Audio time series; checked with ``util.valid_audio``.
    frame_length : int > 0
        Number of samples per frame.
    hop_length : int > 0
        Number of samples to advance between frames.
    center : bool
        If true, ``y`` is edge-padded with ``frame_length // 2`` samples
        before framing.
    kwargs
        Extra keyword arguments for ``zero_crossings``.  ``axis`` is always
        overridden with 0, and ``pad`` defaults to ``False`` unless supplied.

    Returns
    -------
    zcr : np.ndarray
        Mean of the ``zero_crossings`` output over axis 0, with that axis
        kept as a singleton dimension.

    Notes
    -----
    The raw ``zero_crossings`` output of the most recent call is also stored
    in the module-level name ``CROSSING``.
    """
    global CROSSING

    util.valid_audio(y)

    if center:
        y = np.pad(y, int(frame_length // 2), mode='edge')

    y_framed = util.frame(y, frame_length, hop_length)

    # Crossings are always counted along the within-frame axis
    kwargs['axis'] = 0
    kwargs.setdefault('pad', False)

    crossings = zero_crossings(y_framed, **kwargs)

    # NOTE(review): this global looks like a debugging hook.  It is kept
    # because other code may read it; the accompanying debug print of the
    # whole array was removed.
    CROSSING = crossings

    return np.mean(crossings, axis=0, keepdims=True)
def get_mfcc(self, sig_frm):
    """Compute MFCCs for one frame and record its 512-point dB spectrum.

    Parameters
    ----------
    sig_frm : np.ndarray
        One frame of samples.  It is divided by 32768.0 before processing
        (presumably int16-range PCM -- confirm against the caller).

    Returns
    -------
    mfcc : np.ndarray
        Output of ``librosa.feature.mfcc`` with ``n_mfcc=13``, computed from
        a 40-band mel spectrogram of the Hamming-windowed frame.

    Notes
    -----
    As a side effect, a second power spectrum (``n_fft=512``, frame
    reflect-padded by 256 samples) is converted to dB and handed to
    ``self.spec_tape_add``.
    """
    sig_frm = sig_frm / 32768.0

    window = 'hamming'
    win_length = sig_frm.shape[0]
    hop_length = win_length

    # The same analysis window feeds both spectra; only the padding differs
    base_window = get_window(window, win_length, fftbins=True)

    # --- Spectrum 1: FFT size equal to the frame length, used for the MFCCs
    n_fft = win_length
    fft_window = util.pad_center(base_window, n_fft).reshape((-1, 1))

    util.valid_audio(sig_frm)
    sig_frm = sig_frm[:, None]

    n_bins = int(1 + n_fft // 2)
    stft = fft.fft(fft_window * sig_frm, axis=0)[:n_bins].conj()
    powspec = np.abs(stft) ** 2

    melspec = librosa.feature.melspectrogram(S=powspec, hop_length=hop_length,
                                             n_fft=n_fft, n_mels=40)
    # power_to_db replaces the removed librosa.logamplitude; this function
    # already depends on it for the spectrum recorded below.
    mfcc = librosa.feature.mfcc(S=librosa.power_to_db(melspec), n_mfcc=13)

    # --- Spectrum 2: fixed 512-point FFT of the reflect-padded frame
    # NOTE(review): pad_center presumably requires win_length <= 512 -- confirm.
    n_fft = 512
    fft_window = util.pad_center(base_window, n_fft).reshape((-1, 1))

    y = np.pad(sig_frm[:, 0], int(n_fft // 2), mode='reflect')
    # Only the first frame is kept
    pad_frame = librosa.util.frame(y, frame_length=n_fft,
                                   hop_length=win_length * 2)[:, 0][:, None]

    n_bins = int(1 + n_fft // 2)
    stft = fft.fft(fft_window * pad_frame, axis=0)[:n_bins].conj()
    powspec = np.abs(stft) ** 2

    spec = librosa.power_to_db(powspec)
    self.spec_tape_add(spec)

    return mfcc
def lpc(y, order):
    """Linear Prediction Coefficients via Burg's method

    This function applies Burg's method to estimate coefficients of a linear
    filter on `y` of order `order`.  Burg's method is an extension to the
    Yule-Walker approach, which are both sometimes referred to as LPC parameter
    estimation by autocorrelation.

    It follows the description and implementation approach described in the
    introduction in [1]_.  N.B. This paper describes a different method, which
    is not implemented here, but has been chosen for its clear explanation of
    Burg's technique in its introduction.

    .. [1] Larry Marple
           A New Autoregressive Spectrum Analysis Algorithm
           IEEE Transactions on Acoustics, Speech, and Signal Processing
           vol 28, no. 4, 1980

    Parameters
    ----------
    y : np.ndarray
        Time series to fit

    order : int > 0
        Order of the linear filter.  Python and NumPy integers are accepted.

    Returns
    -------
    a : np.ndarray of length order + 1
        LP prediction error coefficients, i.e. filter denominator polynomial

    Raises
    ------
    ParameterError
        - If y is not valid audio as per `util.valid_audio`
        - If order < 1 or not integer

    FloatingPointError
        - If y is ill-conditioned

    See also
    --------
    scipy.signal.lfilter

    Examples
    --------
    Compute LP coefficients of y at order 16 on entire series

    >>> y, sr = librosa.load(librosa.util.example_audio_file(), offset=30,
    ...                      duration=10)
    >>> librosa.lpc(y, 16)

    Compute LP coefficients, and plot LP estimate of original series

    >>> import matplotlib.pyplot as plt
    >>> import scipy
    >>> y, sr = librosa.load(librosa.util.example_audio_file(), offset=30,
    ...                      duration=0.020)
    >>> a = librosa.lpc(y, 2)
    >>> b = np.hstack([[0], -1 * a[1:]])
    >>> y_hat = scipy.signal.lfilter(b, [1], y)
    >>> plt.figure()
    >>> plt.plot(y)
    >>> plt.plot(y_hat, linestyle='--')
    >>> plt.legend(['y', 'y_hat'])
    >>> plt.title('LP Model Forward Prediction')
    >>> plt.show()
    """
    # Accept NumPy integer scalars too (e.g. an order taken from an array or
    # from np.arange), not only built-in ints.
    if not isinstance(order, (int, np.integer)) or order < 1:
        raise ParameterError("order must be an integer > 0")

    util.valid_audio(y, mono=True)

    return __lpc(y, order)
def waveplot(
    y,
    sr=22050,
    max_points=5e4,
    x_axis="time",
    offset=0.0,
    max_sr=1000,
    ax=None,
    **kwargs,
):
    """Plot the amplitude envelope of a waveform.

    If ``y`` is monophonic, a filled curve is drawn between ``[-abs(y), abs(y)]``.

    If ``y`` is stereo, the curve is drawn between ``[-abs(y[1]), abs(y[0])]``,
    so that the left and right channels are drawn above and below the axis,
    respectively.

    Long signals (more than ``max_points`` samples) are down-sampled to at
    most ``max_sr`` before plotting.

    .. warning::
        This function is deprecated in librosa 0.8.1 and will be removed in 0.9.0.
        Its functionality is replaced and extended by `waveshow`.

    Parameters
    ----------
    y : np.ndarray [shape=(n,) or (2,n)]
        audio time series (mono or stereo)

    sr : number > 0 [scalar]
        sampling rate of ``y``

    max_points : positive number or None
        Maximum number of time-points to plot: if the number of samples in
        ``y`` exceeds ``max_points``, then ``y`` is downsampled.

        If `None`, no downsampling is performed.

    x_axis : str or None
        Display of the x-axis ticks and tick markers. Accepted values are:

        - 'time' : markers are shown as milliseconds, seconds, minutes, or hours.
                    Values are plotted in units of seconds.

        - 's' : markers are shown as seconds.

        - 'ms' : markers are shown as milliseconds.

        - 'lag' : like time, but past the halfway point counts as negative values.

        - 'lag_s' : same as lag, but in seconds.

        - 'lag_ms' : same as lag, but in milliseconds.

        - `None`, 'none', or 'off': ticks and tick markers are hidden.

    ax : matplotlib.axes.Axes or None
        Axes to plot on instead of the default `plt.gca()`.

    offset : float
        Horizontal offset (in seconds) to start the waveform plot

    max_sr : int > 0 [scalar]
        Maximum sampling rate for the visualization

    kwargs
        Additional keyword arguments to `matplotlib.pyplot.fill_between`

    Returns
    -------
    pc : matplotlib.collections.PolyCollection
        The PolyCollection created by `fill_between`.

    Raises
    ------
    ParameterError
        If ``max_sr`` is not a positive integer, or if ``max_points`` is
        given and is not strictly positive.

    See also
    --------
    waveshow
    librosa.resample
    matplotlib.pyplot.fill_between

    Examples
    --------
    Plot a monophonic waveform

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> librosa.display.waveplot(y, sr=sr, ax=ax[0])
    >>> ax[0].set(title='Monophonic')
    >>> ax[0].label_outer()

    Or a stereo waveform

    >>> y, sr = librosa.load(librosa.ex('choice', hq=True), mono=False, duration=10)
    >>> librosa.display.waveplot(y, sr=sr, ax=ax[1])
    >>> ax[1].set(title='Stereo')
    >>> ax[1].label_outer()

    Or harmonic and percussive components with transparency

    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> y_harm, y_perc = librosa.effects.hpss(y)
    >>> librosa.display.waveplot(y_harm, sr=sr, alpha=0.25, ax=ax[2])
    >>> librosa.display.waveplot(y_perc, sr=sr, color='r', alpha=0.5, ax=ax[2])
    >>> ax[2].set(title='Harmonic + Percussive')
    """
    util.valid_audio(y, mono=False)

    if not (isinstance(max_sr, (int, np.integer)) and max_sr > 0):
        raise ParameterError("max_sr must be a positive integer")

    target_sr = sr
    hop_length = 1

    # Pad an extra channel dimension, if necessary
    if y.ndim == 1:
        y = y[np.newaxis, :]

    if max_points is not None:
        if max_points <= 0:
            raise ParameterError("max_points must be strictly positive")

        if max_points < y.shape[-1]:
            target_sr = min(max_sr, (sr * y.shape[-1]) // max_points)

        # The quotient can be 0 (when sr < target_sr) or a float (when sr or
        # max_points is a float); the envelope needs an integer hop >= 1.
        hop_length = max(1, int(sr // target_sr))

    # Reduce by envelope calculation
    y = __envelope(y, hop_length)

    y_top = y[0]
    y_bottom = -y[-1]

    axes = __check_axes(ax)

    # Only draw from the axes' colour cycle when the caller did not pick a
    # colour; otherwise an explicit colour would still advance the cycle.
    if "color" not in kwargs:
        kwargs["color"] = next(axes._get_lines.prop_cycler)["color"]

    locs = offset + core.times_like(y_top, sr=sr, hop_length=hop_length)

    out = axes.fill_between(locs, y_bottom, y_top, **kwargs)

    axes.set_xlim([locs.min(), locs.max()])

    # Construct tickers and locators
    __decorate_axis(axes.xaxis, x_axis)

    return out
def waveshow(
    y,
    sr=22050,
    max_points=11025,
    x_axis="time",
    offset=0.0,
    marker="",
    where="post",
    label=None,
    ax=None,
    **kwargs,
):
    """Draw a waveform that adapts between sample and envelope views.

    Two artists are created on the target axes: a step plot of the raw
    samples (`matplotlib.pyplot.step`) and a filled amplitude envelope
    (`matplotlib.pyplot.fill_between`).  An `AdaptiveWaveplot` object is
    attached to the axes' ``xlim_changed`` callback so the plot can switch
    between the two as the viewport changes: when the visible interval is
    shorter than ``max_points / sr`` seconds (1/2 second by default) the
    samples are shown, otherwise the down-sampled envelope is shown.

    .. note:: For stereo input, the upper edge of the envelope comes from
        the first channel and the lower edge from the last channel, so the
        plot may be vertically asymmetric.  The sample view only shows the
        first channel; plot each channel separately to see both at the
        sample level.

    Parameters
    ----------
    y : np.ndarray [shape=(n,) or (2,n)]
        audio time series (mono or stereo)

    sr : number > 0 [scalar]
        sampling rate of ``y`` (samples per second)

    max_points : positive integer
        Upper bound on the number of samples drawn in the sample view.  The
        envelope is computed over non-overlapping windows whose length is
        chosen so that it has at most ``max_points`` values.

    x_axis : str or None
        Display of the x-axis ticks and tick markers. Accepted values are:

        - 'time' : markers are shown as milliseconds, seconds, minutes, or hours.
                    Values are plotted in units of seconds.

        - 's' : markers are shown as seconds.

        - 'ms' : markers are shown as milliseconds.

        - 'lag' : like time, but past the halfway point counts as negative values.

        - 'lag_s' : same as lag, but in seconds.

        - 'lag_ms' : same as lag, but in milliseconds.

        - `None`, 'none', or 'off': ticks and tick markers are hidden.

    offset : float
        Horizontal offset (in seconds) to start the waveform plot

    marker : string
        Marker symbol for sample values (default: no markers).
        See also: `matplotlib.markers`.

    where : string, {'pre', 'mid', 'post'}
        Interpolation between observations, applied to both the sample and
        envelope plots.  See `matplotlib.pyplot.step` for details.

    label : string [optional]
        The label string applied to this plot.

    ax : matplotlib.axes.Axes or None
        Axes to plot on instead of the default `plt.gca()`.

    kwargs
        Additional keyword arguments passed to both
        `matplotlib.pyplot.fill_between` and `matplotlib.pyplot.step`, so
        only arguments common to the two are supported.

    Returns
    -------
    librosa.display.AdaptiveWaveplot
        An object of type `librosa.display.AdaptiveWaveplot`

    Raises
    ------
    ParameterError
        If ``max_points`` is not strictly positive.

    See also
    --------
    AdaptiveWaveplot
    matplotlib.pyplot.step
    matplotlib.pyplot.fill_between
    matplotlib.markers

    Examples
    --------
    Plot a monophonic waveform with an envelope view

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> fig, ax = plt.subplots(nrows=3, sharex=True)
    >>> librosa.display.waveshow(y, sr=sr, ax=ax[0])
    >>> ax[0].set(title='Envelope view, mono')
    >>> ax[0].label_outer()

    Or a stereo waveform

    >>> y, sr = librosa.load(librosa.ex('choice', hq=True), mono=False, duration=10)
    >>> librosa.display.waveshow(y, sr=sr, ax=ax[1])
    >>> ax[1].set(title='Envelope view, stereo')
    >>> ax[1].label_outer()

    Or harmonic and percussive components with transparency

    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)
    >>> y_harm, y_perc = librosa.effects.hpss(y)
    >>> librosa.display.waveshow(y_harm, sr=sr, alpha=0.5, ax=ax[2], label='Harmonic')
    >>> librosa.display.waveshow(y_perc, sr=sr, color='r', alpha=0.5, ax=ax[2], label='Percussive')
    >>> ax[2].set(title='Multiple waveforms')
    >>> ax[2].legend()

    Zooming in on a plot to show raw sample values

    >>> fig, (ax, ax2) = plt.subplots(nrows=2, sharex=True)
    >>> ax.set(xlim=[6.0, 6.01], title='Sample view', ylim=[-0.2, 0.2])
    >>> librosa.display.waveshow(y, sr=sr, ax=ax, marker='.', label='Full signal')
    >>> librosa.display.waveshow(y_harm, sr=sr, alpha=0.5, ax=ax2, label='Harmonic')
    >>> librosa.display.waveshow(y_perc, sr=sr, color='r', alpha=0.5, ax=ax2, label='Percussive')
    >>> ax.label_outer()
    >>> ax.legend()
    >>> ax2.legend()
    """
    util.valid_audio(y, mono=False)

    # Work on a (channels, samples) array throughout
    if y.ndim == 1:
        y = y[np.newaxis, :]

    if max_points <= 0:
        raise ParameterError(
            "max_points={} must be strictly positive".format(max_points))

    axes = __check_axes(ax)

    # Take the next colour from the axes' cycle unless the caller chose one
    if "color" not in kwargs:
        kwargs["color"] = next(axes._get_lines.prop_cycler)["color"]

    # Hop chosen so that the envelope has at most max_points values
    n_samples = y.shape[-1]
    hop_length = max(1, n_samples // max_points)
    y_env = __envelope(y, hop_length)

    # Lower edge from the last channel, upper edge from the first
    y_bottom = -y_env[-1]
    y_top = y_env[0]

    times = offset + core.times_like(y, sr=sr, hop_length=1)

    # Sample view: only the first max_points samples are drawn initially
    step_artists = axes.step(
        times[:max_points], y[0, :max_points],
        marker=marker, where=where, **kwargs)
    [steps] = step_artists

    # Envelope view: one time stamp per envelope value
    env_times = times[:len(y_top) * hop_length:hop_length]
    envelope = axes.fill_between(
        env_times, y_bottom, y_top, step=where, label=label, **kwargs)

    adaptor = AdaptiveWaveplot(times, y[0], steps, envelope,
                               sr=sr, max_samples=max_points)

    axes.callbacks.connect("xlim_changed", adaptor.update)

    # Force an initial update to ensure the state is consistent
    adaptor.update(axes)

    # Construct tickers and locators
    __decorate_axis(axes.xaxis, x_axis)

    return adaptor
def stft(y, n_fft=2048, hop_length=None, win_length=None, window='hann',
         center=True, dtype=np.complex64, pad_mode='reflect'):
    """Short-time Fourier transform (STFT)

    Returns a complex-valued matrix D such that
        `np.abs(D[f, t])` is the magnitude of frequency bin `f`
        at frame `t`

        `np.angle(D[f, t])` is the phase of frequency bin `f`
        at frame `t`

    Parameters
    ----------
    y : np.ndarray [shape=(n,)], real-valued
        the input signal (audio time series)

    n_fft : int > 0 [scalar]
        FFT window size

    hop_length : int > 0 [scalar]
        number of audio samples between STFT columns.
        If unspecified, defaults to `win_length // 4`.

    win_length : int <= n_fft [scalar]
        Each frame of audio is windowed by a window of length `win_length`,
        which is then padded with zeros to match `n_fft`.

        If unspecified, defaults to ``win_length = n_fft``.

    window
        Currently ignored: the analysis window is always
        ``vorbis(win_length)``.  The parameter is kept so existing callers
        keep working.

    center : boolean
        - If `True`, the signal `y` is padded so that frame
          `D[:, t]` is centered at `y[t * hop_length]`.
        - If `False`, then `D[:, t]` begins at `y[t * hop_length]`

    dtype : numeric type
        Complex numeric type for `D`.  Default is 64-bit complex.

    pad_mode : string
        If `center=True`, the padding mode to use at the edges of the signal.
        By default, STFT uses reflection padding.

    Returns
    -------
    D : np.ndarray [shape=(1 + n_fft/2, t), dtype=dtype]
        STFT matrix

    See Also
    --------
    istft : Inverse STFT
    ifgram : Instantaneous frequency spectrogram
    np.pad : array padding

    Examples
    --------
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> D = np.abs(stft(y))

    Use left-aligned frames, instead of centered frames

    >>> D_left = np.abs(stft(y, center=False))

    Use a shorter hop length

    >>> D_short = np.abs(stft(y, hop_length=64))
    """
    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified
    if hop_length is None:
        hop_length = int(win_length // 4)

    # The `window` argument is deliberately not consulted here
    fft_window = vorbis(win_length)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Check audio is valid
    util.valid_audio(y)

    # Pad the time series so that frames are centered
    if center:
        y = np.pad(y, int(n_fft // 2), mode=pad_mode)

    # Window the time series.
    y_frames = util.frame(y, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    n_bins = int(1 + n_fft // 2)
    stft_matrix = np.empty((n_bins, y_frames.shape[1]),
                           dtype=dtype, order='F')

    # How many columns fit within MAX_MEM_BLOCK?  Always process at least
    # one, otherwise range() below is given a zero step for very large n_fft.
    n_columns = int(util.MAX_MEM_BLOCK / (n_bins * stft_matrix.itemsize))
    n_columns = max(n_columns, 1)

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        stft_matrix[:, bl_s:bl_t] = fft.fft(
            fft_window * y_frames[:, bl_s:bl_t], axis=0)[:n_bins]

    return stft_matrix
def hht(self, y, hop_length=None, win_length=None, center=True,
        dtype=np.complex64, pad_mode='reflect'):
    """Frame-wise Hilbert-Huang transform (HHT) features.

    The signal is cut into frames of ``self.n_hht`` samples, each frame is
    multiplied by ``self.window`` (centre-padded to ``self.n_hht``) and
    passed to ``get_hht``; its output is reduced to a feature column by
    ``self.hht_based_feature``.

    Parameters
    ----------
    y : np.ndarray
        the input signal (audio time series); checked with
        ``util.valid_audio``

    hop_length : int > 0 [scalar]
        number of samples between successive frames.
        If unspecified, defaults to ``int(win_length / 2)``.

    win_length : int [scalar]
        Only used to derive the default ``hop_length``.
        If unspecified, defaults to ``self.n_hht``.

    center : boolean
        If true, ``y`` is padded with ``self.n_hht - 1`` samples using
        ``pad_mode`` before framing.

    dtype : numeric type
        dtype of both returned matrices.

    pad_mode : string
        ``np.pad`` mode used when ``center`` is true.

    Returns
    -------
    hht_matrix : np.ndarray [shape=(27, t), dtype=dtype]
        One column of ``self.hht_based_feature`` output per frame.

    bjp_matrix : np.ndarray [shape=(self.n_hht - 1, t), dtype=dtype]
        One column per frame holding the third value returned by
        ``get_hht``.
    """
    n_hht = self.n_hht

    # Defaults: whole frame as window length, half of it as hop
    if win_length is None:
        win_length = n_hht
    if hop_length is None:
        hop_length = int(win_length / 2)

    # Window padded out to the frame size, kept as a column vector
    hht_window = util.pad_center(self.window, n_hht).reshape((-1, 1))

    util.valid_audio(y)

    if center:
        y = np.pad(y, n_hht - 1, mode=pad_mode)

    # One frame per row after the transpose
    frames = util.frame(y, frame_length=n_hht, hop_length=hop_length).T
    n_frames = frames.shape[0]

    hht_matrix = np.empty((27, n_frames), dtype=dtype, order='F')
    bjp_matrix = np.empty((n_hht - 1, n_frames), dtype=dtype, order='F')

    window_vec = hht_window[:, 0]
    for idx, frame in enumerate(frames):
        A, f, bjp = get_hht(window_vec * frame, self.fs)
        hht_matrix[:, idx] = self.hht_based_feature(A, f * self.fs, bjp)
        bjp_matrix[:, idx] = bjp

    return hht_matrix, bjp_matrix
def stft(y, n_fft=2048, hop_length=None, win_length=None, window=None,
         center=True, dtype=np.complex64):
    """Short-time Fourier transform of a signal, computed with NumPy/SciPy.

    Parameters
    ----------
    y
        Input signal.  It is checked with ``util.valid_audio`` and converted
        with ``convertTFtoNP`` before processing.
    n_fft : int > 0
        FFT size; every frame holds ``n_fft`` samples.
    hop_length : int > 0 or None
        Samples between successive frames.  Defaults to ``win_length // 4``.
    win_length : int <= n_fft or None
        Window length before centre-padding to ``n_fft``.  Defaults to
        ``n_fft``.
    window : None, callable, or array-like
        - ``None``: a periodic (asymmetric) Hann window of ``win_length``
        - callable: called as ``window(win_length)``
        - otherwise: converted with ``np.asarray`` and used as the window
    center : bool
        If true, the signal is reflect-padded by ``n_fft // 2`` samples
        before framing.  If false, frames start at multiples of
        ``hop_length``.
    dtype : numeric type
        dtype of the returned matrix.

    Returns
    -------
    stft_matrix : np.ndarray [shape=(1 + n_fft // 2, t), dtype=dtype]
        Complex conjugate of the non-negative-frequency FFT bins of each
        windowed frame.
    """
    # Import the submodules explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` and ``scipy.fftpack`` are loaded.
    import scipy.fftpack
    import scipy.signal
    from librosa import util

    # By default, use the entire frame
    if win_length is None:
        win_length = n_fft

    # Set the default hop, if it's not already specified.
    # (win_length is a plain int here; it has no ``.value()`` method.)
    if hop_length is None:
        hop_length = int(win_length // 4)

    if window is None:
        # Default is an asymmetric Hann window.  get_window(..., fftbins=True)
        # yields the same values as hann(win_length, sym=False) and is still
        # available in SciPy releases that removed scipy.signal.hann.
        fft_window = scipy.signal.get_window('hann', win_length, fftbins=True)
    elif callable(window):
        # User supplied a window function
        fft_window = window(win_length)
    else:
        # User supplied a window vector; make sure it's an array
        fft_window = np.asarray(window)

    # Pad the window out to n_fft size
    fft_window = util.pad_center(fft_window, n_fft)

    # Reshape so that the window can be broadcast
    fft_window = fft_window.reshape((-1, 1))

    # Validate and convert regardless of `center`; previously the converted
    # signal was only defined when center=True.
    util.valid_audio(y)
    y_ = convertTFtoNP(y)

    # Pad the time series so that frames are centered
    if center:
        y_ = np.pad(y_, int(n_fft // 2), mode='reflect')

    # Window the time series.
    y_frames = util.frame(y_, frame_length=n_fft, hop_length=hop_length)

    # Pre-allocate the STFT matrix
    n_bins = int(1 + n_fft // 2)
    stft_matrix = np.empty((n_bins, y_frames.shape[1]),
                           dtype=dtype, order='F')

    # How many columns fit within MAX_MEM_BLOCK?  Always process at least
    # one, otherwise range() below is given a zero step for very large n_fft.
    n_columns = int(util.MAX_MEM_BLOCK / (n_bins * stft_matrix.itemsize))
    n_columns = max(n_columns, 1)

    for bl_s in range(0, stft_matrix.shape[1], n_columns):
        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])

        # RFFT and Conjugate here to match phase from DPWE code
        stft_matrix[:, bl_s:bl_t] = scipy.fftpack.fft(
            fft_window * y_frames[:, bl_s:bl_t], axis=0)[:n_bins].conj()

    return stft_matrix