# Module-level imports used by the functions below. Helpers such as
# tf_float32, stft, stft_np, amplitude_to_db, pad_or_trim_to_expected_length,
# mean_difference, fig_summary, and _plt_spec are assumed to be defined
# elsewhere in the module, along with the LD_RANGE and DB_RANGE constants.
import ddsp
from ddsp import core
import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf


def spectrogram_summary(audio, audio_gen, step, name='', tag='spectrogram'):
  """Writes a summary of spectrograms for a batch of images."""
  specgram = lambda a: ddsp.spectral_ops.compute_logmag(tf_float32(a), size=768)

  # Batch spectrogram operations.
  spectrograms = specgram(audio)
  spectrograms_gen = specgram(audio_gen)

  batch_size = int(audio.shape[0])
  for i in range(batch_size):
    # Manually specify exact size of fig for tensorboard.
    fig, axs = plt.subplots(2, 1, figsize=(8, 8))
    _plt_spec(spectrograms[i], axs[0], 'original')
    _plt_spec(spectrograms_gen[i], axs[1], 'synthesized')

    # Format and save plot to image.
    fig_summary(f'{tag}/{name}_{i + 1}', fig, step)

def compute_rms_energy(audio,
                       sample_rate=16000,
                       frame_rate=250,
                       frame_size=2048,
                       pad_end=True):
  """Compute root mean squared energy of audio."""
  audio = tf_float32(audio)
  hop_size = sample_rate // frame_rate
  audio_frames = tf.signal.frame(audio, frame_size, hop_size, pad_end=pad_end)
  rms_energy = tf.reduce_mean(audio_frames**2.0, axis=-1)**0.5

  if pad_end:
    n_samples = audio.shape[0] if len(audio.shape) == 1 else audio.shape[1]
    n_secs = n_samples / float(sample_rate)  # `n_secs` can have milliseconds.
    expected_len = int(n_secs * frame_rate)
    return pad_or_trim_to_expected_length(rms_energy, expected_len, use_tf=True)
  else:
    return rms_energy

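# Hedged usage sketch for compute_rms_energy(), assuming the defaults above
# (16 kHz audio, 250 Hz frame rate, so hop_size = 64 samples). The input is
# synthetic placeholder data, not an example from the original source.
audio_example = np.random.uniform(-1.0, 1.0, size=(2, 16000)).astype(np.float32)
rms = compute_rms_energy(audio_example)  # -> shape [2, 250]: 250 frames/sec.
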
def compute_mag(audio, size=2048, overlap=0.75, pad_end=True):
  """Compute linear magnitude spectrogram (absolute value of the STFT)."""
  mag = tf.abs(stft(audio, frame_size=size, overlap=overlap, pad_end=pad_end))
  return tf_float32(mag)

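# Hedged sketch of compute_mag(): with the defaults above the STFT hop is
# size * (1 - overlap) = 512 samples and the output has size // 2 + 1 = 1025
# frequency bins. The input audio is a placeholder, not from the source.
audio_example = tf.random.uniform([1, 16000], minval=-1.0, maxval=1.0)
mag = compute_mag(audio_example)  # -> [1, n_frames, 1025] linear magnitudes.
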
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
  """Perceptual loudness in dB, relative to white noise, amplitude=1.

  Function is differentiable if use_tf=True.

  Args:
    audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
      [audio_length,].
    sample_rate: Audio sample rate in Hz.
    frame_rate: Rate of loudness frames in Hz.
    n_fft: Fft window size.
    range_db: Sets the dynamic range of loudness in decibels. The minimum
      loudness (per frequency bin) corresponds to -range_db.
    ref_db: Sets the reference maximum perceptual loudness as given by
      (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default value
      corresponds to white noise with amplitude=1.0 and n_fft=2048. There is a
      slight dependence on fft_size due to different granularity of perceptual
      weighting.
    use_tf: Make function differentiable by using tensorflow.

  Returns:
    Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
  """
  if sample_rate % frame_rate != 0:
    raise ValueError(
        'frame_rate: {} must evenly divide sample_rate: {}. '
        'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz.'
        .format(frame_rate, sample_rate))

  # Pick tensorflow or numpy.
  lib = tf if use_tf else np

  # Make inputs tensors for tensorflow.
  audio = tf_float32(audio) if use_tf else audio

  # Temporarily add a batch dimension for single examples.
  is_1d = (len(audio.shape) == 1)
  audio = audio[lib.newaxis, :] if is_1d else audio

  # Take STFT.
  hop_size = sample_rate // frame_rate
  overlap = 1 - hop_size / n_fft
  stft_fn = stft if use_tf else stft_np
  s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

  # Compute power.
  amplitude = lib.abs(s)
  power_db = amplitude_to_db(amplitude, use_tf=use_tf)

  # Perceptual weighting.
  frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
  a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]
  loudness = power_db + a_weighting

  # Set dynamic range.
  loudness -= ref_db
  loudness = lib.maximum(loudness, -range_db)
  mean = tf.reduce_mean if use_tf else np.mean

  # Average over frequency bins.
  loudness = mean(loudness, axis=-1)

  # Remove temporary batch dimension.
  loudness = loudness[0] if is_1d else loudness

  # Compute expected length of loudness vector.
  n_secs = audio.shape[-1] / float(sample_rate)  # `n_secs` can have milliseconds.
  expected_len = int(n_secs * frame_rate)

  # Pad with `-range_db` noise floor or trim vector.
  loudness = pad_or_trim_to_expected_length(loudness, expected_len, -range_db,
                                            use_tf=use_tf)
  return loudness

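# Hedged usage sketch of the numpy (use_tf=False) path; the 440 Hz test tone
# is an illustrative assumption. Output is loudness in dB relative to
# amplitude-1.0 white noise, one frame per 4 ms at the default frame_rate.
t = np.linspace(0.0, 1.0, 16000, endpoint=False)
sine = np.sin(2.0 * np.pi * 440.0 * t).astype(np.float32)
ld = compute_loudness(sine, use_tf=False)  # -> shape [250], values in dB.
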
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
  """Perceptual loudness in dB, relative to white noise, amplitude=1.

  Function is differentiable if use_tf=True.

  Args:
    audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
      [audio_length,].
    sample_rate: Audio sample rate in Hz.
    frame_rate: Rate of loudness frames in Hz.
    n_fft: Fft window size.
    range_db: Sets the dynamic range of loudness in decibels. The minimum
      loudness (per frequency bin) corresponds to -range_db.
    ref_db: Sets the reference maximum perceptual loudness as given by
      (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default value
      corresponds to white noise with amplitude=1.0 and n_fft=2048. There is a
      slight dependence on fft_size due to different granularity of perceptual
      weighting.
    use_tf: Make function differentiable by using tensorflow.

  Returns:
    Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
  """
  # Pick tensorflow or numpy.
  lib = tf if use_tf else np

  # Make inputs tensors for tensorflow.
  audio = tf_float32(audio) if use_tf else audio

  # Temporarily add a batch dimension for single examples.
  is_1d = (len(audio.shape) == 1)
  audio = audio[lib.newaxis, :] if is_1d else audio

  # Take STFT.
  hop_size = sample_rate // frame_rate
  overlap = 1 - hop_size / n_fft
  stft_fn = stft if use_tf else stft_np
  s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

  # Compute power.
  amplitude = lib.abs(s)
  log10 = (lambda x: tf.math.log(x) / tf.math.log(10.0)) if use_tf else np.log10
  amin = 1e-20  # Avoid log(0) instabilities.
  power_db = log10(lib.maximum(amin, amplitude))
  power_db *= 20.0

  # Perceptual weighting.
  frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
  a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]
  loudness = power_db + a_weighting

  # Set dynamic range.
  loudness -= ref_db
  loudness = lib.maximum(loudness, -range_db)
  mean = tf.reduce_mean if use_tf else np.mean

  # Average over frequency bins.
  loudness = mean(loudness, axis=-1)

  # Remove temporary batch dimension.
  loudness = loudness[0] if is_1d else loudness
  return loudness

def get_spectrogram(audio, rotate=False, size=1024):
  """Compute logmag spectrogram."""
  mag = ddsp.spectral_ops.compute_logmag(tf_float32(audio), size=size)
  if rotate:
    mag = np.rot90(mag)
  return mag

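# Hedged plotting sketch: rotate=True turns the [n_frames, n_bins] matrix so
# time runs left-to-right under imshow. The input signal is a placeholder.
spec = get_spectrogram(np.random.randn(16000).astype(np.float32), rotate=True)
plt.imshow(spec, aspect='auto')
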
def additive_synthesis(self,
                       amplitudes,
                       frequency_shifts=None,
                       frequency_distribution=None,
                       n_samples=64000,
                       sample_rate=16000,
                       amp_resample_method='window'):
  """Generate audio from frame-wise monophonic harmonic oscillator bank.

  Args:
    amplitudes: Frame-wise oscillator peak amplitude. Shape
      [batch_size, n_frames, 1].
    frequency_shifts: Harmonic frequency variations (Hz), zero-centered. Total
      frequency of a harmonic is equal to
      (frequencies * (1 + frequency_shifts)). Shape
      [batch_size, n_frames, n_harmonics].
    frequency_distribution: Harmonic amplitude variations, ranged zero to one.
      Total amplitude of a harmonic is equal to
      (amplitudes * frequency_distribution). Shape
      [batch_size, n_frames, n_harmonics].
    n_samples: Total length of output audio. Interpolates and crops to this.
    sample_rate: Sample rate.
    amp_resample_method: Mode with which to resample amplitude envelopes.

  Returns:
    audio: Output audio. Shape [batch_size, n_samples, 1].
  """
  amplitudes = core.tf_float32(amplitudes)
  batch_size = amplitudes.shape[0]
  n_frames = amplitudes.shape[1]

  if frequency_distribution is not None:
    frequency_distribution = core.tf_float32(frequency_distribution)
    n_frequencies = int(frequency_distribution.shape[-1])
  elif frequency_shifts is not None:
    frequency_shifts = core.tf_float32(frequency_shifts)
    n_frequencies = int(frequency_shifts.shape[-1])
  else:
    n_frequencies = 1

  # Create frequencies [batch_size, n_frames, n_frequencies].
  frequencies = self.get_linear_frequencies(batch_size, n_frames,
                                            n_frequencies)
  if frequency_shifts is not None:
    frequencies *= (1.0 + frequency_shifts)

  # Create harmonic amplitudes [batch_size, n_frames, n_frequencies].
  if frequency_distribution is not None:
    frequency_amplitudes = amplitudes * frequency_distribution
  else:
    frequency_amplitudes = amplitudes

  # Create sample-wise envelopes.
  frequency_envelopes = core.resample(frequencies, n_samples)  # cycles/sec
  amplitude_envelopes = core.resample(frequency_amplitudes, n_samples,
                                      method=amp_resample_method)

  # Synthesize from harmonics [batch_size, n_samples].
  audio = core.oscillator_bank(frequency_envelopes, amplitude_envelopes,
                               sample_rate=sample_rate)
  return audio

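# Hedged usage sketch: `synth` stands in for whatever object provides
# additive_synthesis() and get_linear_frequencies(); its construction is an
# assumption, not shown in the source. Envelope values are illustrative.
amps = tf.ones([1, 250, 1]) * 0.1                     # Overall amplitude envelope.
dist = tf.nn.softmax(tf.random.normal([1, 250, 40]))  # 40 normalized harmonic weights.
audio = synth.additive_synthesis(amps, frequency_distribution=dist)  # -> 64000 samples.
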
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=512,
                     range_db=DB_RANGE,
                     ref_db=0.0,
                     use_tf=True,
                     pad_end=True):
  """Perceptual loudness (weighted power) in dB.

  Function is differentiable if use_tf=True.

  Args:
    audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
      [audio_length,].
    sample_rate: Audio sample rate in Hz.
    frame_rate: Rate of loudness frames in Hz.
    n_fft: Fft window size.
    range_db: Sets the dynamic range of loudness in decibels. The minimum
      loudness (per frequency bin) corresponds to -range_db.
    ref_db: Sets the reference maximum perceptual loudness as given by
      (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The old (<v2.0.0)
      default value corresponded to white noise with amplitude=1.0 and
      n_fft=2048. With v2.0.0 it was set to 0.0 to be more consistent with
      power calculations that have a natural scale for 0 dB being
      amplitude=1.0.
    use_tf: Make function differentiable by using tensorflow.
    pad_end: Add zero padding at end of audio (like `same` convolution).

  Returns:
    Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
  """
  if sample_rate % frame_rate != 0:
    raise ValueError(
        'frame_rate: {} must evenly divide sample_rate: {}. '
        'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz.'
        .format(frame_rate, sample_rate))

  # Pick tensorflow or numpy.
  lib = tf if use_tf else np
  reduce_mean = tf.reduce_mean if use_tf else np.mean
  stft_fn = stft if use_tf else stft_np

  # Make inputs tensors for tensorflow.
  audio = tf_float32(audio) if use_tf else audio

  # Temporarily add a batch dimension for single examples.
  is_1d = (len(audio.shape) == 1)
  audio = audio[lib.newaxis, :] if is_1d else audio

  # Take STFT.
  hop_size = sample_rate // frame_rate
  overlap = 1 - hop_size / n_fft
  s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=pad_end)

  # Compute power.
  amplitude = lib.abs(s)
  power = amplitude**2

  # Perceptual weighting.
  frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
  a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]

  # Perform weighting in linear scale, a_weighting given in decibels.
  weighting = 10**(a_weighting / 10)
  power = power * weighting

  # Average over frequencies (weighted power per bin).
  avg_power = reduce_mean(power, axis=-1)
  loudness = core.power_to_db(avg_power,
                              ref_db=ref_db,
                              range_db=range_db,
                              use_tf=use_tf)

  # Remove temporary batch dimension.
  loudness = loudness[0] if is_1d else loudness

  # Compute expected length of loudness vector.
  expected_secs = audio.shape[-1] / float(sample_rate)
  expected_len = int(expected_secs * frame_rate)

  # Pad with `-range_db` noise floor or trim vector.
  loudness = pad_or_trim_to_expected_length(loudness, expected_len, -range_db,
                                            use_tf=use_tf)
  return loudness

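# Hedged sketch of the differentiable (use_tf=True) path: gradients of the
# loudness flow back to the raw audio, which is what allows loudness terms to
# sit inside a training loss. Shapes assume the defaults above; the input is
# a made-up placeholder.
audio_var = tf.Variable(tf.random.uniform([1, 16000], -1.0, 1.0))
with tf.GradientTape() as tape:
  loudness = compute_loudness(audio_var, use_tf=True)  # -> [1, 250].
  loss = tf.reduce_mean(loudness)
grads = tape.gradient(loss, audio_var)  # Same shape as `audio_var`.
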
def call(self, audio, target_audio):
  """Embed both signals with the pretrained model and penalize their difference."""
  audio, target_audio = tf_float32(audio), tf_float32(target_audio)
  target_emb = self.pretrained_model(target_audio)
  synth_emb = self.pretrained_model(audio)
  loss = self.weight * mean_difference(target_emb, synth_emb, self.loss_type)
  return loss
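
# Hedged usage sketch: `PretrainedEmbeddingLoss` is a hypothetical name for
# the Keras-style layer that owns call() above; the real class name, the
# pretrained model, and the weighting are not shown in the source.
loss_layer = PretrainedEmbeddingLoss()       # Hypothetical constructor.
loss = loss_layer(audio_gen, audio_target)   # Scalar embedding-distance loss.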