Example #1
File: losses.py Project: pnutnam/ddsp
def freq_loss(f_hz, f_hz_target, loss_type='L1', weights=None):
  """Loss comparing two frequencies."""
  # Convert to MIDI.
  f_midi = hz_to_midi(f_hz)
  f_midi_target = hz_to_midi(f_hz_target)
  # Take the difference.
  return mean_difference(f_midi, f_midi_target, loss_type, weights)
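
For reference, here is a minimal NumPy sketch of the standard Hz-to-MIDI mapping these examples rely on (A4 = 440 Hz = MIDI note 69, 12 semitones per octave). The helper name hz_to_midi_ref and the clamping of non-positive frequencies to 0 are illustrative assumptions, not part of the ddsp API.

import numpy as np

def hz_to_midi_ref(hz, a4_hz=440.0, a4_midi=69.0):
    """Standard Hz -> MIDI mapping; non-positive frequencies map to 0."""
    hz = np.asarray(hz, dtype=np.float64)
    with np.errstate(divide='ignore', invalid='ignore'):
        midi = a4_midi + 12.0 * np.log2(hz / a4_hz)
    return np.where(hz > 0.0, midi, 0.0)

print(hz_to_midi_ref([261.63, 440.0, 880.0]))  # approx. [60., 69., 81.]
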
Example #2
    def call(self, amps_a, freqs_a, amps_b, freqs_b):
        """Returns the sinusoidal consistency loss scalar.

    Args:
      amps_a: Amplitudes of first sinusoids, greater than 0.
        Shape [batch, time, freq].
      freqs_a: Frequencies of first sinusoids in hertz.
        Shape [batch, time, feq].
      amps_b: Amplitudes of second sinusoids, greater than 0.
        Shape [batch, time, freq].
      freqs_b: Frequencies of second sinusoids in hertz.
        Shape [batch, time, feq].

    Returns:
      Scalar, weighted wasserstein distance.
    """
        loss = 0.0
        if self.weight > 0.0:
            if self.midi:
                freqs_a = hz_to_midi(freqs_a)
                freqs_b = hz_to_midi(freqs_b)
            loss = wasserstein_distance(freqs_a,
                                        freqs_b,
                                        amps_a,
                                        amps_b,
                                        p=1.0)
            loss = tf.reduce_mean(self.weight * loss)
        return loss
Example #3
File: losses.py Project: pnutnam/ddsp
  def nll(self, amps, freqs, amps_target, freqs_target, scale_target):
    """Returns negative log-likelihood of source sins given target sins.

    Args:
      amps: Amplitudes of source sinusoids, greater than 0.
        Shape [batch, time, freq].
      freqs: Frequencies of source sinusoids in hertz.
        Shape [batch, time, freq].
      amps_target: Amplitudes of target sinusoids, greater than 0.
        Shape [batch, time, freq].
      freqs_target: Frequencies of target sinusoids in hertz.
        Shape [batch, time, freq].
      scale_target: Scale of gaussian kernel in MIDI.

    Returns:
      - log(p(source|target)). Shape [batch, time].
    """
    p_source_given_target = self.kernel_density_estimate(
        amps_target, freqs_target, scale_target)

    # KDE is on a logarithmic scale (MIDI).
    freqs_midi = hz_to_midi(freqs)

    # Need to rearrange shape as tfp expects, [sample_sh, batch_sh, event_sh].
    freqs_transpose = tf.transpose(freqs_midi, [2, 0, 1])  # [freq, batch, time]
    nll_transpose = - p_source_given_target.log_prob(freqs_transpose)
    nll = tf.transpose(nll_transpose, [1, 2, 0])  # [batch, time, freq]

    # Weighted sum over sinusoids -> [batch, time]
    amps_norm = safe_divide(amps, tf.reduce_sum(amps, axis=-1, keepdims=True))
    return tf.reduce_mean(nll * amps_norm, axis=-1)
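
The transpose in this example exists because tensorflow_probability's log_prob expects extra sample dimensions to the left of the distribution's batch shape. A minimal, self-contained sketch of that convention, with toy shapes unrelated to the ddsp classes above:

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

# One scalar Normal per (batch, time) position: batch_shape = [2, 5].
dist = tfd.Normal(loc=tf.zeros([2, 5]), scale=tf.ones([2, 5]))

# Putting the per-sinusoid axis first makes it the sample dimension,
# so log_prob returns [freq, batch, time], which is transposed back afterwards.
x = tf.random.normal([7, 2, 5])   # [freq, batch, time]
log_p = dist.log_prob(x)
print(log_p.shape)                # (7, 2, 5)
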
Example #4
 def test_hz_to_midi_is_accurate(self):
     """Tests converting between MIDI values and their frequencies in hertz."""
     hz = np.linspace(20.0, 20000.0, 128)
     librosa_midi = librosa.hz_to_midi(hz)
     with self.cached_session() as sess:
         tf_midi = sess.run(core.hz_to_midi(hz))
     self.assertAllClose(librosa_midi, tf_midi)
Example #5
File: core_test.py Project: magenta/ddsp
 def test_hz_to_midi_is_accurate(self):
   """Tests converting between MIDI values and their frequencies in hertz."""
   hz = np.linspace(0.0, 20000.0, 128)
   librosa_midi = librosa.hz_to_midi(hz)
   librosa_midi = tf.where(tf.less_equal(hz, 0.0), 0.0, librosa_midi)
   tf_midi = core.hz_to_midi(hz)
   self.assertAllClose(librosa_midi, tf_midi)
Example #6
def f0_summary(f0_hz, f0_hz_predict, step, name=''):
  """Creates a plot comparison of ground truth f0_hz and predicted values."""
  batch_size = int(f0_hz.shape[0])
  # Build the tag prefix once, outside the loop, so '_' is not appended repeatedly.
  name = name + '_' if name else ''

  for i in range(batch_size):
    f0_midi = hz_to_midi(squeeze(f0_hz[i]))
    f0_midi_predict = hz_to_midi(squeeze(f0_hz_predict[i]))
    # Manually specify exact size of fig for tensorboard
    fig, ax = plt.subplots(1, 1, figsize=(2.5, 2.5))
    ax.plot(f0_midi)
    ax.plot(f0_midi_predict)

    # Format and save plot to image
    tag = 'f0_midi/{}{}'.format(name, i + 1)
    fig_summary(tag, fig, step)
Example #7
    def _default_processing(self, features):
        '''Always resample to time_steps and scale input signals.'''
        for k in [
                "f0", "phase", "phase_unwrapped", "osc", "osc_sub",
                "phase_sub", "phase_unwrapped_sub", "osc_sub_sync",
                "phase_unwrapped_sub_sync", "phase_sub_sync"
        ]:
            if features.get(k, None) is not None:
                features[k] = at_least_3d(features[k])
                features[k] = resample(features[k],
                                       n_timesteps=self.time_steps)

        # Divide by denom (e.g. number of cylinders in engine to produce subharmonics)
        features["f0_sub"] = features["f0"] / self.denom

        # Set additive input
        features["f0_additive"] = features["f0_sub"]

        # Prepare decoder network inputs
        features["f0_scaled"] = hz_to_midi(features["f0"]) / F0_RANGE
        features["f0_scaled_mel"] = hz_to_mel(features["f0"]) / F0_RANGE_MEL
        features["f0_sub_scaled"] = hz_to_mel(
            features["f0_sub"]) / F0_SUB_RANGE
        for k in ["phase", "phase_sub", "phase_sub_sync"]:
            if features.get(k, None) is not None:
                features[k + "_scaled"] = 0.5 + 0.5 * features[k] / np.pi
        for k in ["osc", "osc_sub", "osc_sub_sync"]:
            if features.get(k, None) is not None:
                features[k + "_scaled"] = 0.5 + 0.5 * features[k]

        return features
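
The f0_scaled feature above squashes pitch into a roughly unit range for the decoder network. A small worked check, assuming F0_RANGE = 127.0 (the full MIDI note range, as in magenta/ddsp; this fork may define it differently):

import numpy as np

F0_RANGE = 127.0                                  # assumed, as in magenta/ddsp
f0_hz = 440.0
f0_midi = 69.0 + 12.0 * np.log2(f0_hz / 440.0)    # standard Hz -> MIDI: 69.0
print(f0_midi / F0_RANGE)                         # ~0.543, i.e. within [0, 1]
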
Example #8
File: losses.py Project: pnutnam/ddsp
 def get_candidate_harmonics(self, f0_candidates, as_midi=True):
   """Build a harmonic series off of each candidate partial."""
   n = tf.range(1, self.n_harmonic_points + 1, dtype=tf.float32)
   # -> [batch, time, candidate, harmonic]
   harmonics = (f0_candidates[:, :, :, tf.newaxis] *
                n[tf.newaxis, tf.newaxis, tf.newaxis, :])
   if as_midi:
     harmonics = hz_to_midi(harmonics)
   return harmonics
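
A toy shape check (made-up numbers) of the broadcast that builds the candidate-harmonic grid, matching the [batch, time, candidate, harmonic] layout noted in the comment:

import tensorflow as tf

f0_candidates = tf.constant([[[100.0, 200.0, 300.0],
                              [110.0, 220.0, 330.0]]])  # [batch=1, time=2, candidate=3]
n = tf.range(1, 5, dtype=tf.float32)                    # 4 harmonic numbers
harmonics = (f0_candidates[:, :, :, tf.newaxis] *
             n[tf.newaxis, tf.newaxis, tf.newaxis, :])
print(harmonics.shape)             # (1, 2, 3, 4) -> [batch, time, candidate, harmonic]
print(harmonics[0, 0, 0].numpy())  # [100. 200. 300. 400.]: harmonics of the 100 Hz candidate
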
Example #9
File: encoders.py Project: nielsrolf/ddsp
 def call(self, audio) -> ['z', 'f0_binned', 'f0_scaled', 'f0_hz']:
     x = self.audio_feature_extractor(audio)
     z = self.z_out(x)
     t_steps = x.shape[1]
     z = ddsp.core.resample(z, t_steps)
     f0_binned_logits = self.f0_out(x)
     f0_binned = tf.nn.softmax(f0_binned_logits)
     # TODO correlate neighbouring bins via 1d convolution along bin axis
     f0_hz = self.f0_to_bins.invert(f0_binned)
     f0_scaled = hz_to_midi(f0_hz) / F0_RANGE
     return z, f0_binned, f0_scaled, f0_hz
Example #10
    def _default_processing(self, features):
        '''Always resample to time_steps and scale f0 signal.'''
        # Make sure inputs have the right dimensions, i.e. [batch_size, n_frames, {context dependent}]
        for k in [
                "f0", "phase", "phase_unwrapped", "osc", "osc_sub",
                "phase_sub", "phase_unwrapped_sub", "osc_sub_sync",
                "phase_unwrapped_sub_sync", "phase_sub_sync"
        ]:
            if features.get(k, None) is not None:
                features[k] = at_least_3d(features[k])
                features[k] = resample(features[k],
                                       n_timesteps=self.time_steps)

        # Divide by denom (e.g. number of cylinders in engine to produce subharmonics)
        features["f0_sub"] = features["f0"] / self.denom

        # Set additive input
        features["f0_additive"] = features[self.f0_additive]

        # Generate osc and phase from f0 if missing
        for suffix in ["", "_sub"]:
            if features.get("osc" + suffix, None) is None:
                amplitudes = tf.ones(tf.shape(features["f0" + suffix]))
                features["osc" + suffix] = oscillator_bank(
                    features["f0" + suffix], amplitudes,
                    sample_rate=self.rate)[:, :, tf.newaxis]
            if features.get("phase" + suffix, None) is None:
                omegas = 2.0 * np.pi * features["f0" + suffix] / float(
                    self.rate)
                phases = tf.cumsum(omegas, axis=1)
                features["phase_unwrapped" + suffix] = phases
                phases_wrapped = tf.math.mod(phases + np.pi, 2 * np.pi) - np.pi
                features["phase" + suffix] = phases_wrapped

        for prefix in ["osc_sub", "phase_sub", "phase_unwrapped_sub"]:
            if features.get(prefix + "_sync", None) is None:
                features[prefix + "_sync"] = features[prefix]

        # Prepare decoder network inputs
        features["f0_scaled"] = hz_to_midi(features["f0"]) / F0_RANGE
        features["f0_scaled_mel"] = hz_to_mel(features["f0"]) / F0_RANGE_MEL
        features["f0_sub_scaled"] = hz_to_mel(
            features["f0_sub"]) / F0_SUB_RANGE
        for k in ["phase", "phase_sub", "phase_sub_sync"]:
            if features.get(k, None) is not None:
                features[k + "_scaled"] = 0.5 + 0.5 * features[k] / np.pi
        for k in ["osc", "osc_sub", "osc_sub_sync"]:
            if features.get(k, None) is not None:
                features[k + "_scaled"] = 0.5 + 0.5 * features[k]

        return features
Example #11
    def _default_processing(self, features):
        '''Always resample to time_steps and scale f0 signal.'''
        features["f0"] = at_least_3d(features["f0"])
        features["f0"] = resample(features["f0"], n_timesteps=self.time_steps)

        # Divide by denom (e.g. number of cylinders in engine to produce subharmonics)
        features["f0"] /= self.denom

        # Set additive input
        features["f0_additive"] = features["f0"]

        # Prepare decoder network inputs
        if self.feature_domain == "freq":
            features["f0_scaled"] = hz_to_midi(features["f0"]) / F0_RANGE
        elif self.feature_domain == "freq-old":
            '''DEPRECATED. This option is for backward compatibility with a version containing a typo.'''
            features["f0_scaled"] = hz_to_midi(
                self.denom * features["f0"]) / F0_RANGE / self.denom
        elif self.feature_domain == "time":
            amplitudes = tf.ones(tf.shape(features["f0"]))
            features["f0_scaled"] = oscillator_bank(
                features["f0"], amplitudes, sample_rate=self.rate)[:, :,
                                                                   tf.newaxis]
        elif self.feature_domain == "osc":
            if features.get("osc", None) is None:
                amplitudes = tf.ones(tf.shape(features["f0"]))
                features["f0_scaled"] = oscillator_bank(
                    self.denom * features["f0"],
                    amplitudes,
                    sample_rate=self.rate)[:, :, tf.newaxis]
            else:
                features["f0_scaled"] = features["osc"][:, :, tf.newaxis]
        else:
            raise ValueError("%s is not a valid value for feature_domain." %
                             self.feature_domain)

        return features
Example #12
def prepare_tfrecord_no_beam(input_audio_paths,
                             output_tfrecord_path,
                             num_shards=None,
                             sample_rate=16000,
                             frame_rate=250,
                             window_secs=4,
                             hop_secs=1,
                             pipeline_options=''):
    if num_shards is not None or pipeline_options != '':
        logging.warning(
            'num_shards and pipeline_options arguments are not supported if not using apache beam!'
        )
    examples = do_multiprocess(partial(_load_audio, sample_rate=sample_rate),
                               input_audio_paths)

    examples = [_add_f0_estimate(ex, frame_rate) for ex in examples]
    pitch_mean = np.mean(
        np.concatenate([hz_to_midi(item['f0_hz']) for item in examples]))
    examples = do_multiprocess(
        partial(_add_loudness, sample_rate=sample_rate, frame_rate=frame_rate),
        examples)
    loudness_avg_max = np.mean(
        [np.max(item['loudness_db']) for item in examples])
    loudness_mean = np.mean(
        np.concatenate([item['loudness_db'] for item in examples]))
    split_examples = []
    for ex in examples:
        split = _split_example(ex, sample_rate, frame_rate, window_secs,
                               hop_secs)
        for s in split:
            split_examples.append(s)

    tfexamples = do_multiprocess(_float_dict_to_tfexample, split_examples)

    with tf.io.TFRecordWriter(output_tfrecord_path) as writer:
        for ex in tfexamples:
            writer.write(ex.SerializeToString())

    print(f'model_pitch_mean: {pitch_mean}')
    print(f'model_loudness_avg_max: {loudness_avg_max}')
    print(f'model_loudness_mean: {loudness_mean}')
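
A hypothetical invocation of the function above; the file paths are placeholders and the keyword values simply restate the defaults:

import glob

wav_paths = sorted(glob.glob('data/raw/*.wav'))   # placeholder location
prepare_tfrecord_no_beam(
    input_audio_paths=wav_paths,
    output_tfrecord_path='data/train.tfrecord',   # placeholder location
    sample_rate=16000,
    frame_rate=250,
    window_secs=4,
    hop_secs=1)
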
Example #13
File: losses.py Project: pnutnam/ddsp
  def get_p_harmonics_given_sinusoids(self, freqs, amps):
    """Gets distribution of harmonics from candidate f0s given sinusoids.

    Performs a gaussian kernel density estimate on the sinusoid points, with the
    height of each gaussian component given by the sinusoidal amplitude.

    Args:
      freqs: Frequencies of sinusoids in hertz.
      amps: Amplitudes of sinusoids, must be greater than 0.

    Returns:
      MixtureSameFamily, Gaussian distribution.
    """
    # Gaussian KDE around each partial, height=amplitude, center=frequency.
    sinusoids_midi = hz_to_midi(freqs)

    # NLL can be a nan if sinusoid amps are all zero, add a small offset.
    amps = tf.where(amps == 0.0, 1e-7 * tf.ones_like(amps), amps)
    amps_norm = safe_divide(amps, tf.reduce_sum(amps, axis=-1, keepdims=True))

    # P(candidate_harmonics | sinusoids)
    return tfd.MixtureSameFamily(
        tfd.Categorical(probs=amps_norm),
        tfd.Normal(loc=sinusoids_midi, scale=self.sinusoids_scale))
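
A toy, self-contained version of the amplitude-weighted Gaussian KDE built above (made-up MIDI locations and weights), showing how MixtureSameFamily turns sinusoids into a density that log_prob can query:

import tensorflow as tf
import tensorflow_probability as tfp

tfd = tfp.distributions

sinusoids_midi = tf.constant([60.0, 64.0, 67.0])   # C4, E4, G4
amps_norm = tf.constant([0.5, 0.3, 0.2])           # normalized amplitudes
kde = tfd.MixtureSameFamily(
    tfd.Categorical(probs=amps_norm),
    tfd.Normal(loc=sinusoids_midi, scale=2.0))

print(kde.log_prob(60.0).numpy())  # high density at a strong sinusoid
print(kde.log_prob(80.0).numpy())  # much lower density far from all of them
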
Example #14
File: losses.py Project: pnutnam/ddsp
  def get_loss_tensors(self, f0_candidates, freqs, amps):
    """Get traces of loss to estimate fundamental frequency.

    Args:
      f0_candidates: Frequencies of candidates in hertz. [batch, time, freq].
      freqs: Frequencies of sinusoids in hertz. [batch, time, freq].
      amps: Amplitudes of sinusoids, greater than 0. [batch, time, freq].

    Returns:
      sinusoids_loss: -log p(sinusoids|harmonics), [batch, time, f0_candidate].
      harmonics_loss: -log p(harmonics|sinusoids), [batch, time, f0_candidate].
    """
    # ==========================================================================
    # P(sinusoids | candidate_harmonics).
    # ==========================================================================
    p_sinusoids_given_harmonics = self.get_p_sinusoids_given_harmonics()

    # Treat each partial as a candidate.
    # Get the ratio of each partial to each candidate.
    # -> [batch, time, candidate, partial]
    freq_ratios = safe_divide(freqs[:, :, tf.newaxis, :],
                              f0_candidates[:, :, :, tf.newaxis])
    nll_sinusoids = - p_sinusoids_given_harmonics.log_prob(freq_ratios)

    a = tf.convert_to_tensor(amps[:, :, tf.newaxis, :])

    # # Don't count sinusoids that are less than 1 std > mean.
    # a_mean, a_var = tf.nn.moments(a, axes=-1, keepdims=True)
    # a = tf.where(a > a_mean + 0.5 * a_var**0.5, a, tf.zeros_like(a))

    # Weighted sum by sinusoid amplitude.
    # -> [batch, time, candidate]
    sinusoids_loss = safe_divide(tf.reduce_sum(nll_sinusoids * a, axis=-1),
                                 tf.reduce_sum(a, axis=-1))

    # ==========================================================================
    # P(candidate_harmonics | sinusoids)
    # ==========================================================================
    p_harm_given_sin = self.get_p_harmonics_given_sinusoids(freqs, amps)
    harmonics = self.get_candidate_harmonics(f0_candidates, as_midi=True)

    # Need to rearrange shape as tfp expects, [sample_sh, batch_sh, event_sh].
    # -> [candidate, harmonic, batch, time]
    harmonics_transpose = tf.transpose(harmonics, [2, 3, 0, 1])
    nll_harmonics_transpose = - p_harm_given_sin.log_prob(harmonics_transpose)
    # -> [batch, time, candidate, harm]
    nll_harmonics = tf.transpose(nll_harmonics_transpose, [2, 3, 0, 1])

    # Prior decreasing importance of upper harmonics.
    amps_prior = tf.linspace(
        1.0, 1.0 / self.n_harmonic_points, self.n_harmonic_points)
    harmonics_loss = (nll_harmonics *
                      amps_prior[tf.newaxis, tf.newaxis, tf.newaxis, :])

    # Don't count loss for harmonics above nyquist.
    # Reweight by the number of harmonics below nyquist,
    # (so it doesn't just pick the highest frequency possible).
    nyquist_midi = hz_to_midi(self.sample_rate / 2.0)
    nyquist_mask = tf.where(harmonics < nyquist_midi,
                            tf.ones_like(harmonics_loss),
                            tf.zeros_like(harmonics_loss))
    harmonics_loss *= safe_divide(
        nyquist_mask, tf.reduce_mean(nyquist_mask, axis=-1, keepdims=True))

    # Sum over harmonics.
    harmonics_loss = tf.reduce_mean(harmonics_loss, axis=-1)

    return sinusoids_loss, harmonics_loss