def spectrogram_summary(audio, audio_gen, step, name=''):
  """Writes a summary of spectrograms for a batch of audio.

  For every example in the batch, a two-panel figure is drawn (original on
  top, synthesized below) and handed to `fig_summary` under the tag
  'spectrogram/<name>_<index>' (or 'spectrogram/<index>' when `name` is
  empty). Indices in the tag start at 1.

  Args:
    audio: Batch of original audio; the first dimension is the batch.
    audio_gen: Batch of synthesized audio, indexed in step with `audio`.
    step: Step value forwarded to `fig_summary`.
    name: Optional prefix for the summary tag.
  """
  specgram = lambda a: ddsp.spectral_ops.compute_logmag(tf_float32(a), size=768)

  # Batch spectrogram operations.
  spectrograms = specgram(audio)
  spectrograms_gen = specgram(audio_gen)

  # Build the tag prefix once. Doing this inside the loop would append an
  # extra underscore on every iteration ('x_1', 'x__2', 'x___3', ...).
  prefix = name + '_' if name else ''

  batch_size = int(audio.shape[0])
  for i in range(batch_size):
    # Manually specify exact size of fig for tensorboard.
    fig, axs = plt.subplots(2, 1, figsize=(8, 8))

    panels = (
        (axs[0], spectrograms[i], 'original'),
        (axs[1], spectrograms_gen[i], 'synthesized'),
    )
    for ax, logmag, title in panels:
      # Rotate the spectrogram before display.
      spec = np.rot90(logmag)
      ax.matshow(spec, vmin=-5, vmax=1, aspect='auto', cmap=plt.cm.magma)
      ax.set_title(title)
      ax.set_xticks([])
      ax.set_yticks([])

    # Format and save plot to image.
    tag = 'spectrogram/{}{}'.format(prefix, i + 1)
    fig_summary(tag, fig, step)
def stft(audio, frame_size=2048, overlap=0.75, pad_end=True):
  """Batched short-time Fourier transform built on `tf.signal.stft`.

  Args:
    audio: Input signal(s); converted with `tf_float32` before the transform.
    frame_size: Window length in samples; also used as the FFT length.
    overlap: Fraction of a frame shared with the next frame. The product
      `frame_size * overlap` must be an even number.
    pad_end: Forwarded unchanged to `tf.signal.stft`.

  Returns:
    The output of `tf.signal.stft` for the given framing parameters.
  """
  audio = tf_float32(audio)

  # The number of overlapping samples has to be even.
  overlapping_samples = frame_size * overlap
  assert overlapping_samples % 2.0 == 0.0

  frame_length = int(frame_size)
  hop_length = int(frame_size * (1.0 - overlap))
  return tf.signal.stft(
      signals=audio,
      frame_length=frame_length,
      frame_step=hop_length,
      fft_length=frame_length,
      pad_end=pad_end)
def setUp(self):
  """Prepares dummy inputs: zeroed controls and random audio."""
  super().setUp()

  # Dimensions shared by the fixtures below.
  self.n_batch = 4
  self.n_frames = 1001
  self.n_samples = 64000

  frame_shape = [self.n_batch, self.n_frames]
  raw_inputs = {
      'loudness_db': np.zeros(frame_shape),
      'f0_hz': np.zeros(frame_shape),
      'audio': np.random.randn(self.n_batch, self.n_samples),
  }

  # Convert every array with tf_float32 before storing it.
  converted = {}
  for key, value in raw_inputs.items():
    converted[key] = tf_float32(value)
  self.inputs = converted
def compute_mag(audio, size=2048, overlap=0.75, pad_end=True):
  """Returns the magnitude of the STFT of `audio`.

  Args:
    audio: Input signal(s), passed to `stft`.
    size: Frame size handed to `stft` as `frame_size`.
    overlap: Frame overlap fraction handed to `stft`.
    pad_end: Forwarded to `stft`.

  Returns:
    Absolute value of the STFT, converted with `tf_float32`.
  """
  spectrum = stft(audio, frame_size=size, overlap=overlap, pad_end=pad_end)
  magnitude = tf.abs(spectrum)
  return tf_float32(magnitude)
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
  """Perceptual loudness in dB, relative to white noise, amplitude=1.

  The computation runs on TensorFlow ops when `use_tf=True` (which makes it
  differentiable) and on NumPy otherwise.

  Args:
    audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
      [audio_length,].
    sample_rate: Audio sample rate in Hz.
    frame_rate: Rate of loudness frames in Hz.
    n_fft: FFT window size.
    range_db: Sets the dynamic range of loudness in decibels. The minimum
      loudness (per frequency bin) corresponds to -range_db.
    ref_db: Reference level subtracted from the A-weighted dB spectrum before
      clipping. The default value corresponds to white noise with
      amplitude=1.0 and n_fft=2048. There is a slight dependence on fft_size
      due to different granularity of perceptual weighting.
    use_tf: Make function differentiable by using tensorflow.

  Returns:
    Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
  """
  # Select the backend and its helper functions in one place.
  if use_tf:
    lib = tf
    stft_fn = stft
    mean = tf.reduce_mean
    # Make inputs tensors for tensorflow.
    audio = tf_float32(audio)

    def log10(x):
      return tf.math.log(x) / tf.math.log(10.0)
  else:
    lib = np
    stft_fn = stft_np
    mean = np.mean
    log10 = np.log10

  # Temporarily add a batch dimension for single examples.
  is_1d = (len(audio.shape) == 1)
  if is_1d:
    audio = audio[lib.newaxis, :]

  # Take STFT with a hop that yields `frame_rate` frames per second.
  hop_size = sample_rate // frame_rate
  overlap = 1 - hop_size / n_fft
  s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

  # Convert amplitude to decibels; the floor avoids log(0) instabilities.
  amin = 1e-20
  amplitude = lib.abs(s)
  power_db = log10(lib.maximum(amin, amplitude))
  power_db *= 20.0

  # Perceptual (A) weighting, broadcast over batch and time.
  frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
  a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]
  loudness = power_db + a_weighting

  # Shift by the reference level and clip to the dynamic range.
  loudness -= ref_db
  loudness = lib.maximum(loudness, -range_db)

  # Average over frequency bins.
  loudness = mean(loudness, axis=-1)

  # Remove temporary batch dimension.
  return loudness[0] if is_1d else loudness
def get_spectrogram(audio, rotate=False, size=1024):
  """Computes a log-magnitude spectrogram, optionally rotated.

  Args:
    audio: Input audio; converted with `tf_float32` before analysis.
    rotate: If True, the result is passed through `np.rot90`.
    size: Forwarded to `ddsp.spectral_ops.compute_logmag` as `size`.

  Returns:
    The log-magnitude spectrogram (rotated when `rotate` is True).
  """
  logmag = ddsp.spectral_ops.compute_logmag(tf_float32(audio), size=size)
  return np.rot90(logmag) if rotate else logmag
def call(self, target_audio, audio):
  """Computes the weighted embedding loss between target and synth audio.

  Both signals are converted with `tf_float32`, passed through
  `self.pretrained_model`, and compared with `mean_difference` using
  `self.loss_type`; the result is scaled by `self.weight`.

  Args:
    target_audio: Reference audio.
    audio: Audio to compare against the reference.

  Returns:
    The weighted loss value.
  """
  audio = tf_float32(audio)
  target_audio = tf_float32(target_audio)

  # Embed the target first, then the synthesized audio.
  embedding_target = self.pretrained_model(target_audio)
  embedding_synth = self.pretrained_model(audio)

  return self.weight * mean_difference(
      embedding_target, embedding_synth, self.loss_type)