Example No. 1
def spectrogram_summary(audio, audio_gen, step, name='', tag='spectrogram'):
    """Writes a summary of spectrograms for a batch of images."""
    specgram = lambda a: ddsp.spectral_ops.compute_logmag(tf_float32(a),
                                                          size=768)

    # Batch spectrogram operations
    spectrograms = specgram(audio)
    spectrograms_gen = specgram(audio_gen)

    batch_size = int(audio.shape[0])

    for i in range(batch_size):
        # Manually specify exact size of fig for tensorboard
        fig, axs = plt.subplots(2, 1, figsize=(8, 8))

        _plt_spec(spectrograms[i], axs[0], 'original')
        _plt_spec(spectrograms_gen[i], axs[1], 'synthesized')

        # Format and write the plot as an image summary, one per batch element.
        fig_summary('{}{}_{}'.format(name, tag, i + 1), fig, step)
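A minimal usage sketch, assuming the DDSP training helpers (`fig_summary`, `_plt_spec`) and `matplotlib.pyplot` (as `plt`) are in scope and a TensorBoard writer is active; shapes are illustrative:

import numpy as np
import tensorflow as tf

# Two one-second clips at 16 kHz: originals and their reconstructions.
audio = np.random.uniform(-1.0, 1.0, (2, 16000)).astype('float32')
audio_gen = np.random.uniform(-1.0, 1.0, (2, 16000)).astype('float32')

writer = tf.summary.create_file_writer('/tmp/logs')
with writer.as_default():
    spectrogram_summary(audio, audio_gen, step=0, name='eval')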
Example No. 2
def compute_rms_energy(audio,
                       sample_rate=16000,
                       frame_rate=250,
                       frame_size=2048,
                       pad_end=True):
    """Compute root mean squared energy of audio."""
    audio = tf_float32(audio)
    hop_size = sample_rate // frame_rate
    audio_frames = tf.signal.frame(audio,
                                   frame_size,
                                   hop_size,
                                   pad_end=pad_end)
    rms_energy = tf.reduce_mean(audio_frames**2.0, axis=-1)**0.5
    if pad_end:
        n_samples = audio.shape[0] if len(audio.shape) == 1 else audio.shape[1]
        n_secs = n_samples / float(sample_rate)  # `n_secs` can be fractional.
        expected_len = int(n_secs * frame_rate)
        return pad_or_trim_to_expected_length(rms_energy,
                                              expected_len,
                                              use_tf=True)
    else:
        return rms_energy
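A quick sanity check of the frame math, assuming the default rates: the hop is 16000 // 250 = 64 samples, so one second of audio yields 250 RMS frames.

import numpy as np

audio = np.random.uniform(-1.0, 1.0, (1, 16000)).astype('float32')
rms = compute_rms_energy(audio)  # hop_size = 16000 // 250 = 64.
print(rms.shape)  # (1, 250)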
Example No. 3
def compute_mag(audio, size=2048, overlap=0.75, pad_end=True):
    """Compute magnitude spectrogram of audio via the STFT."""
    mag = tf.abs(stft(audio, frame_size=size, overlap=overlap,
                      pad_end=pad_end))
    return tf_float32(mag)
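With the defaults, the implied hop is size * (1 - overlap) = 512 samples. A shape-level sketch, assuming `stft` follows `tf.signal.stft` conventions:

import tensorflow as tf

audio = tf.random.uniform((1, 16000), -1.0, 1.0)
mag = compute_mag(audio)
print(mag.shape)  # (1, 32, 1025): ceil(16000 / 512) frames, size // 2 + 1 bins.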
Example No. 4
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

  Function is differentiable if use_tf=True.
  Args:
    audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
      [audio_length,].
    sample_rate: Audio sample rate in Hz.
    frame_rate: Rate of loudness frames in Hz.
    n_fft: FFT window size.
    range_db: Sets the dynamic range of loudness in decibels. The minimum
      loudness (per frequency bin) corresponds to -range_db.
    ref_db: Sets the reference maximum perceptual loudness as given by
      (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default value
      corresponds to white noise with amplitude=1.0 and n_fft=2048. There is a
      slight dependence on fft_size due to the different granularity of
      perceptual weighting.
    use_tf: Make function differentiable by using tensorflow.

  Returns:
    Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
  """
    if sample_rate % frame_rate != 0:
        raise ValueError(
            'frame_rate: {} must evenly divide sample_rate: {}. '
            'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz.'
            .format(frame_rate, sample_rate))

    # Pick tensorflow or numpy.
    lib = tf if use_tf else np

    # Make inputs tensors for tensorflow.
    audio = tf_float32(audio) if use_tf else audio

    # Temporarily add a batch dimension for single examples.
    is_1d = (len(audio.shape) == 1)
    audio = audio[lib.newaxis, :] if is_1d else audio

    # Take STFT.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    stft_fn = stft if use_tf else stft_np
    s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

    # Compute power.
    amplitude = lib.abs(s)
    power_db = amplitude_to_db(amplitude, use_tf=use_tf)

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]
    loudness = power_db + a_weighting

    # Set dynamic range.
    loudness -= ref_db
    loudness = lib.maximum(loudness, -range_db)
    mean = tf.reduce_mean if use_tf else np.mean

    # Average over frequency bins.
    loudness = mean(loudness, axis=-1)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness

    # Compute expected length of loudness vector.
    n_secs = audio.shape[-1] / float(sample_rate)  # `n_secs` can be fractional.
    expected_len = int(n_secs * frame_rate)

    # Pad with `-range_db` noise floor or trim vector
    loudness = pad_or_trim_to_expected_length(loudness,
                                              expected_len,
                                              -range_db,
                                              use_tf=use_tf)
    return loudness
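A brief usage sketch on the NumPy path (use_tf=False), assuming `LD_RANGE` and the `stft_np` helper are in scope; values are illustrative:

import numpy as np

# Four seconds of quiet noise at 16 kHz.
audio = 0.1 * np.random.randn(16000 * 4).astype('float32')
loudness = compute_loudness(audio)
print(loudness.shape)  # (1000,) = 4 s * 250 frames/s.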
Example No. 5
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

  Function is differentiable if use_tf=True.
  Args:
    audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
      [audio_length,].
    sample_rate: Audio sample rate in Hz.
    frame_rate: Rate of loudness frames in Hz.
    n_fft: FFT window size.
    range_db: Sets the dynamic range of loudness in decibels. The minimum
      loudness (per frequency bin) corresponds to -range_db.
    ref_db: Sets the reference maximum perceptual loudness as given by
      (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default value
      corresponds to white noise with amplitude=1.0 and n_fft=2048. There is a
      slight dependence on fft_size due to the different granularity of
      perceptual weighting.
    use_tf: Make function differentiable by using tensorflow.

  Returns:
    Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
  """
    # Pick tensorflow or numpy.
    lib = tf if use_tf else np

    # Make inputs tensors for tensorflow.
    audio = tf_float32(audio) if use_tf else audio

    # Temporarily add a batch dimension for single examples.
    is_1d = (len(audio.shape) == 1)
    audio = audio[lib.newaxis, :] if is_1d else audio

    # Take STFT.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    stft_fn = stft if use_tf else stft_np
    s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

    # Compute power in decibels (20 * log10(amplitude) = 10 * log10(power)).
    amplitude = lib.abs(s)
    log10 = (
        lambda x: tf.math.log(x) / tf.math.log(10.0)) if use_tf else np.log10
    amin = 1e-20  # Avoid log(0) instabilities.
    power_db = log10(lib.maximum(amin, amplitude))
    power_db *= 20.0

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]
    loudness = power_db + a_weighting

    # Set dynamic range.
    loudness -= ref_db
    loudness = lib.maximum(loudness, -range_db)
    mean = tf.reduce_mean if use_tf else np.mean

    # Average over frequency bins.
    loudness = mean(loudness, axis=-1)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness
    return loudness
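The inline conversion above relies on the identity 20 * log10(a) = 10 * log10(a**2); a quick numeric check:

import numpy as np

amplitude = np.abs(np.random.randn(5)) + 1e-3
assert np.allclose(20.0 * np.log10(amplitude),
                   10.0 * np.log10(amplitude**2.0))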
Example No. 6
def get_spectrogram(audio, rotate=False, size=1024):
    """Compute logmag spectrogram."""
    mag = ddsp.spectral_ops.compute_logmag(tf_float32(audio), size=size)
    if rotate:
        mag = np.rot90(mag)
    return mag
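A minimal sketch, assuming `ddsp` and `matplotlib.pyplot` (as `plt`) are importable; rotate=True puts frequency on the vertical axis for plotting:

import numpy as np

audio = np.random.uniform(-1.0, 1.0, 16000).astype('float32')
mag = get_spectrogram(audio, rotate=True)
plt.matshow(mag)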
Example No. 7
    def additive_synthesis(self,
                           amplitudes,
                           frequency_shifts=None,
                           frequency_distribution=None,
                           n_samples=64000,
                           sample_rate=16000,
                           amp_resample_method="window"):
        '''Generate audio from frame-wise monophonic harmonic oscillator bank.

        Args:
            amplitudes: Frame-wise oscillator peak amplitude. Shape [batch_size,
                n_frames, 1].
            frequency_shifts: Harmonic frequency variations (Hz), zero-centered. Total
                frequency of a harmonic is equal to (frequencies * (1 +
                frequency_shifts)). Shape [batch_size, n_frames, n_harmonics].
            frequency_distribution: Harmonic amplitude variations, ranged zero to one.
                Total amplitude of a harmonic is equal to (amplitudes *
                frequency_distribution). Shape [batch_size, n_frames, n_harmonics].
            n_samples: Total length of output audio. Interpolates and crops to this.
            sample_rate: Sample rate.
            amp_resample_method: Mode with which to resample amplitude envelopes.

        Returns:
            audio: Output audio. Shape [batch_size, n_samples, 1]
        '''
        amplitudes = core.tf_float32(amplitudes)
        batch_size = amplitudes.shape[0]
        n_frames = amplitudes.shape[1]

        if frequency_distribution is not None:
            frequency_distribution = core.tf_float32(frequency_distribution)
            n_frequencies = int(frequency_distribution.shape[-1])
        elif frequency_shifts is not None:
            frequency_shifts = core.tf_float32(frequency_shifts)
            n_frequencies = int(frequency_shifts.shape[-1])
        else:
            n_frequencies = 1

        # Create frequencies [batch_size, n_frames, n_frequencies].
        frequencies = self.get_linear_frequencies(batch_size, n_frames,
                                                  n_frequencies)
        if frequency_shifts is not None:
            frequencies *= (1.0 + frequency_shifts)

        # Create harmonic amplitudes [batch_size, n_frames, n_frequencies].
        if frequency_distribution is not None:
            frequency_amplitudes = amplitudes * frequency_distribution
        else:
            frequency_amplitudes = amplitudes

        # Create sample-wise envelopes.
        frequency_envelopes = core.resample(frequencies,
                                            n_samples)  # cycles/sec
        amplitude_envelopes = core.resample(frequency_amplitudes,
                                            n_samples,
                                            method=amp_resample_method)

        # Synthesize from harmonics [batch_size, n_samples].
        audio = core.oscillator_bank(frequency_envelopes,
                                     amplitude_envelopes,
                                     sample_rate=sample_rate)
        return audio
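A shape-level sketch of a call, assuming the enclosing synthesizer class (with its get_linear_frequencies method) is instantiated as `synth`; values are illustrative:

import tensorflow as tf

batch_size, n_frames, n_harmonics = 1, 250, 16
amplitudes = 0.1 * tf.ones((batch_size, n_frames, 1))
# Flat distribution: each harmonic gets an equal share of the amplitude.
distribution = tf.ones((batch_size, n_frames, n_harmonics)) / n_harmonics

audio = synth.additive_synthesis(amplitudes,
                                 frequency_distribution=distribution,
                                 n_samples=64000)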
Example No. 8
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=512,
                     range_db=DB_RANGE,
                     ref_db=0.0,
                     use_tf=True,
                     pad_end=True):
    """Perceptual loudness (weighted power) in dB.

  Function is differentiable if use_tf=True.
  Args:
    audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
      [audio_length,].
    sample_rate: Audio sample rate in Hz.
    frame_rate: Rate of loudness frames in Hz.
    n_fft: FFT window size.
    range_db: Sets the dynamic range of loudness in decibels. The minimum
      loudness (per frequency bin) corresponds to -range_db.
    ref_db: Sets the reference maximum perceptual loudness as given by
      (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The old (<v2.0.0)
      default value corresponded to white noise with amplitude=1.0 and
      n_fft=2048. With v2.0.0 it was set to 0.0 to be more consistent with power
      calculations that have a natural scale for 0 dB being amplitude=1.0.
    use_tf: Make function differentiable by using tensorflow.
    pad_end: Add zero padding at end of audio (like `same` convolution).

  Returns:
    Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
  """
    if sample_rate % frame_rate != 0:
        raise ValueError(
            'frame_rate: {} must evenly divide sample_rate: {}. '
            'For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz.'
            .format(frame_rate, sample_rate))

    # Pick tensorflow or numpy.
    lib = tf if use_tf else np
    reduce_mean = tf.reduce_mean if use_tf else np.mean
    stft_fn = stft if use_tf else stft_np

    # Make inputs tensors for tensorflow.
    audio = tf_float32(audio) if use_tf else audio

    # Temporarily add a batch dimension for single examples.
    is_1d = (len(audio.shape) == 1)
    audio = audio[lib.newaxis, :] if is_1d else audio

    # Take STFT.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=pad_end)

    # Compute power.
    amplitude = lib.abs(s)
    power = amplitude**2

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[lib.newaxis, lib.newaxis, :]

    # Perform weighting in linear scale, a_weighting given in decibels.
    weighting = 10**(a_weighting / 10)
    power = power * weighting

    # Average over frequencies (weighted power per bin).
    avg_power = reduce_mean(power, axis=-1)
    loudness = core.power_to_db(avg_power,
                                ref_db=ref_db,
                                range_db=range_db,
                                use_tf=use_tf)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness

    # Compute expected length of loudness vector.
    expected_secs = audio.shape[-1] / float(sample_rate)
    expected_len = int(expected_secs * frame_rate)

    # Pad with `-range_db` noise floor or trim vector.
    loudness = pad_or_trim_to_expected_length(loudness,
                                              expected_len,
                                              -range_db,
                                              use_tf=use_tf)

    return loudness
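Since use_tf defaults to True here, loudness is differentiable with respect to the audio; a gradient sketch, assuming `DB_RANGE` and the stft helpers are in scope:

import tensorflow as tf

audio = tf.Variable(tf.random.uniform((1, 16000), -1.0, 1.0))
with tf.GradientTape() as tape:
    loudness = compute_loudness(audio)
    loss = tf.reduce_mean(loudness)
grad = tape.gradient(loss, audio)  # Same shape as audio: (1, 16000).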
Example No. 9
    def call(self, audio, target_audio):
        """Compute the pretrained-embedding loss between audio and target_audio."""
        audio, target_audio = tf_float32(audio), tf_float32(target_audio)
        target_emb = self.pretrained_model(target_audio)
        synth_emb = self.pretrained_model(audio)
        loss = self.weight * mean_difference(target_emb, synth_emb,
                                             self.loss_type)
        return loss
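A sketch of how such a loss layer is typically used, assuming a hypothetical concrete class PretrainedModelLoss that wires up pretrained_model, weight, and loss_type:

import tensorflow as tf

# Hypothetical instantiation; constructor arguments depend on the subclass.
loss_layer = PretrainedModelLoss(weight=1.0, loss_type='L1')

target_audio = tf.random.uniform((2, 16000), -1.0, 1.0)
synth_audio = tf.random.uniform((2, 16000), -1.0, 1.0)
loss = loss_layer(synth_audio, target_audio)  # Scalar tensor.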