class AudioLoader(object):
    """Load audio files and convert them to rescaled log spectrograms.

    The analysis system is a ``GaussTruncTF`` built with ``hop_size`` as the
    hop and ``window_length`` as the number of STFT channels.
    """

    def __init__(self, sampling_rate, window_length, hop_size, dynamic_range_dB=50, normalize=True):
        """Store the analysis settings and build the STFT system.

        Args:
            sampling_rate: Rate passed to ``librosa.load`` as ``sr``.
            window_length: Number of STFT channels; signals are also trimmed
                to a multiple of this value before analysis.
            hop_size: STFT hop; signals are also trimmed to a multiple of it.
            dynamic_range_dB: Dynamic range handed to ``log_spectrogram`` and
                used to rescale its output.
            normalize: Stored on the instance; no method visible in this
                class reads it.
        """
        super().__init__()
        self._sampling_rate = sampling_rate
        self._window_length = window_length
        self._hop_size = hop_size
        self._dynamic_range_dB = dynamic_range_dB
        self._normalize = normalize
        self._anStft = GaussTruncTF(hop_size=hop_size, stft_channels=window_length)

    def hopSize(self):
        """Return the hop size given at construction."""
        return self._hop_size

    def windowLength(self):
        """Return the window length given at construction."""
        return self._window_length

    def loadSound(self, file_name):
        """Load ``file_name`` as float64 audio and run ``preprocess_signal`` on it.

        The file is loaded at the configured sampling rate; the rate returned
        by ``librosa.load`` is discarded.
        """
        loaded = librosa.load(file_name, sr=self._sampling_rate, dtype=np.float64)
        samples = loaded[0]
        return preprocess_signal(samples)

    def computeSpectrogram(self, audio):
        """Return the rescaled log spectrogram of ``audio``.

        Trailing samples are dropped so that the length is a multiple of the
        window length, and then of the hop size, before the STFT is taken.
        """
        # Trim first against the window length, then against the hop size.
        for multiple in (self._window_length, self._hop_size):
            remainder = np.mod(len(audio), multiple)
            audio = audio[:len(audio) - remainder]

        magnitudes = self._anStft.spectrogram(audio)
        log_magnitudes = log_spectrogram(magnitudes, dynamic_range_dB=self._dynamic_range_dB)

        # Divide by half the dynamic range and shift by one.
        # NOTE(review): presumably maps [-dynamic_range_dB, 0] onto [-1, 1];
        # confirm against log_spectrogram's output range.
        half_range = self._dynamic_range_dB / 2
        return log_magnitudes / half_range + 1

    def loadAsSpectrogram(self, file_name):
        """Load ``file_name`` and return its rescaled log spectrogram."""
        return self.computeSpectrogram(self.loadSound(file_name))
class SpectrogramInverter(object):
    """Invert batches of spectrograms to audio and measure projection loss.

    Uses a ``GaussTruncTF`` system built from the given FFT size and hop.
    """

    def __init__(self, fft_size, fft_hop_size):
        """Build the STFT system.

        Args:
            fft_size: Number of STFT channels.
            fft_hop_size: STFT hop size; also used to size inverted signals.
        """
        super().__init__()
        self._hop_size = fft_hop_size
        self._anStft = GaussTruncTF(hop_size=fft_hop_size, stft_channels=fft_size)

    def _magnitudeErr(self, targetSpectrogram, originalSpectrogram):
        """Return the relative Frobenius error between the two magnitudes.

        Computed as ``|| |target| - |original| ||_F / || |target| ||_F``.
        """
        target_mag = np.abs(targetSpectrogram)
        original_mag = np.abs(originalSpectrogram)
        difference_norm = np.linalg.norm(target_mag - original_mag, 'fro')
        return difference_norm / np.linalg.norm(target_mag, 'fro')

    def invertSpectrograms(self, unprocessed_spectrograms):
        """Invert every spectrogram of a batch into an audio signal.

        Args:
            unprocessed_spectrograms: Array indexed first by batch item; the
                size of its third axis times the hop size gives the length
                allocated for each output signal.

        Returns:
            A 2-D array with one reconstructed signal per row.
        """
        batch_size = unprocessed_spectrograms.shape[0]
        signal_length = self._hop_size * unprocessed_spectrograms.shape[2]
        signals = np.zeros([batch_size, signal_length])
        for row, spec in enumerate(unprocessed_spectrograms):
            signals[row] = self._invertSpectrogram(spec)
        return signals

    def projectionLoss(self, unprocessed_spectrograms):
        """Return the projection loss of each spectrogram against its own inversion.

        Every spectrogram is inverted to audio, and the loss is then computed
        between that audio and the spectrogram it came from.
        """
        signals = self.invertSpectrograms(unprocessed_spectrograms)
        return self.projectionLossBetween(unprocessed_spectrograms, signals)

    def projectionLossBetween(self, unprocessed_spectrograms, audio_signals):
        """Return the projection loss between spectrograms and given signals.

        For each signal, the un-normalized spectrogram is recomputed, its last
        row is dropped, and ``projection_loss`` is evaluated against the
        spectrogram at the same index.
        """
        losses = np.zeros([unprocessed_spectrograms.shape[0]])
        for position, signal in enumerate(audio_signals):
            recomputed = self._anStft.spectrogram(signal, normalize=False)
            target = unprocessed_spectrograms[position]
            losses[position] = projection_loss(recomputed[:-1], target)
        return losses

    def _invertSpectrogram(self, unprocessed_spectrogram):
        """Invert one spectrogram after appending a padding row.

        A single extra row, filled with the spectrogram's minimum value, is
        appended along axis 0 before calling ``invert_spectrogram``.
        """
        floor_value = unprocessed_spectrogram.min()
        # One row with the same width and dtype as the input rows.
        padding_row = np.ones_like(unprocessed_spectrogram[0:1, :]) * floor_value
        padded = np.concatenate([unprocessed_spectrogram, padding_row], axis=0)
        return self._anStft.invert_spectrogram(padded)
def pghi_stft(x, *, hop_size=256, stft_channels=512, use_truncated_window=True):
    """Return the log spectrogram of ``x`` with a leading singleton axis.

    The previous version looked the hop size and channel count up on an
    undefined instance object, which raised ``NameError`` on every call.
    They are now explicit keyword-only arguments whose defaults are the
    former fallback values, so ``pghi_stft(x)`` keeps working.

    Args:
        x: Signal passed to the STFT system's ``spectrogram`` method.
        hop_size: Hop size of the STFT system.
        stft_channels: Number of STFT channels.
        use_truncated_window: When true (the default, matching the previously
            hard-coded choice) ``GaussTruncTF`` is used, otherwise ``GaussTF``.

    Returns:
        ``log_spectrogram`` of the spectrogram, expanded with a new axis 0.
    """
    system_class = GaussTruncTF if use_truncated_window else GaussTF
    stft_system = system_class(hop_size=hop_size, stft_channels=stft_channels)
    log_Y = log_spectrogram(stft_system.spectrogram(x))
    return np.expand_dims(log_Y, axis=0)
def compute_mag_mel(y):
    """Compute the rescaled log mel and log magnitude spectrograms of a signal.

    All settings are read from the module-level ``p`` object.

    Args:
        y: Input signal.

    Returns:
        A ``(mel, mag)`` tuple of float32 arrays. According to the original
        documentation ``mel`` has shape (T, n_mels) and ``mag`` has shape
        (T, 1 + stft_channels / 2) -- TODO confirm, the shapes are decided by
        the helpers called here.
    """
    # Choose the time-frequency system.
    system_class = GaussTruncTF if p.use_truncated else GaussTF
    tfsystem = system_class(hop_size=p.hop_size, stft_channels=p.stft_channels)

    # Linear magnitude spectrogram; the mel spectrogram is derived from it
    # before it is converted to the log scale.
    mag = tfsystem.spectrogram(y, normalize=p.normalize)
    mel = mel_spectrogram(
        mag,
        stft_channels=p.stft_channels,
        n_mels=p.n_mels,
        fmin=p.fmin,
        fmax=p.fmax,
        sr=p.sr,
    )

    # Log magnitude, divided by the dynamic range and shifted by one; the
    # result is checked to lie within [0, 1].
    log_mag = log_spectrogram(mag, dynamic_range_dB=p.stft_dynamic_range_dB)
    mag = log_mag / p.stft_dynamic_range_dB + 1
    assert np.max(mag) <= 1
    assert np.min(mag) >= 0

    # Optional downsampling of the mel spectrogram along time.
    if p.reduction_rate > 1:
        mel = downsample_tf_time(mel, p.reduction_rate)

    # Same log conversion and rescaling for the mel spectrogram.
    log_mel = log_spectrogram(mel, dynamic_range_dB=p.mel_dynamic_range_dB)
    mel = log_mel / p.mel_dynamic_range_dB + 1

    return mel.astype(np.float32), mag.astype(np.float32)