Example #1
    def update_state(self, batch, f0_hz_predict):
        """Update metrics based on a batch of audio.

        Args:
          batch: Dictionary of input features.
          f0_hz_predict: Batch of encoded f0, same as input f0 if no f0 encoder.
        """
        batch_size = int(f0_hz_predict.shape[0])
        # Match number of timesteps.
        if f0_hz_predict.shape[1] != batch['f0_hz'].shape[1]:
            # f0_hz_predict = core.resample(f0_hz_predict,
            #                                    batch['f0_hz'].shape[1]).numpy()
            batch['f0_hz'] = core.resample(batch['f0_hz'],
                                           f0_hz_predict.shape[1]).numpy()
            batch['f0_confidence'] = core.resample(
                batch['f0_confidence'], f0_hz_predict.shape[1]).numpy()

        # Compute metrics per sample. No batch operations possible.
        for i in range(batch_size):
            f0_hz_gt = batch['f0_hz'][i]
            f0_conf_gt = batch['f0_confidence'][i]

            if not is_outlier(f0_conf_gt):
                # Ground truth f0 was reliable, proceed with metrics.
                # Compute distance between original f0_hz labels and f0 encoder values.
                # Resample if f0 encoder has different number of time steps.
                # TODO(hanoih): compare f0_hz against frame_rate * len_sec
                f0_hz = f0_hz_predict[i]
                f0_dist = f0_dist_conf_thresh(f0_hz_gt, f0_hz, f0_conf_gt)
                self.metrics['f0_dist'].update_state(f0_dist)

                f0_hz_gt = np.squeeze(f0_hz_gt)
                f0_hz = np.squeeze(f0_hz)
                voiced_gt = mir_eval.melody.freq_to_voicing(f0_hz_gt)[1]
                cents_gt = mir_eval.melody.hz2cents(f0_hz_gt)
                cents_est = mir_eval.melody.hz2cents(f0_hz)
                rca = mir_eval.melody.raw_chroma_accuracy(
                    voiced_gt,
                    cents_gt,
                    voiced_gt,
                    cents_est,
                    cent_tolerance=self._rpa_tolerance)
                rpa = mir_eval.melody.raw_pitch_accuracy(
                    voiced_gt,
                    cents_gt,
                    voiced_gt,
                    cents_est,
                    cent_tolerance=self._rpa_tolerance)
                self.metrics['raw_chroma_accuracy'].update_state(rca)
                self.metrics['raw_pitch_accuracy'].update_state(rpa)
                log_str = (
                    f'{self._name} | sample {i} | f0_dist(midi): {f0_dist:.3f} '
                    f'raw_chroma_accuracy: {rca:.3f} '
                    f'raw_pitch_accuracy: {rpa:.3f}')
                logging.info(log_str)
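The metrics above rely on core.resample to put predictions and labels on the same time grid before any per-sample comparison. A minimal shape check, assuming the stock ddsp.core.resample API and made-up tensor sizes:

import tensorflow as tf
from ddsp import core

# Made-up sizes: 2 examples, f0 labels at 1000 frames, predictions at 250 frames.
f0_hz_gt = tf.random.uniform([2, 1000, 1], minval=50.0, maxval=500.0)
f0_hz_predict = tf.random.uniform([2, 250, 1], minval=50.0, maxval=500.0)

# Downsample the labels to the prediction resolution (linear interpolation by default).
f0_hz_gt = core.resample(f0_hz_gt, int(f0_hz_predict.shape[1]))
print(f0_hz_gt.shape)  # (2, 250, 1)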
Example #2
    def get_controls(self, signal_one: tf.Tensor, signal_two: tf.Tensor,
                     nn_out_mix_level: tf.Tensor) -> TensorDict:
        """Standardize inputs to same length, mix_level to range [0, 1].

        Args:
          signal_one: 2-D or 3-D tensor.
          signal_two: 2-D or 3-D tensor.
          nn_out_mix_level: Tensor of shape [batch, n_time, 1] output of the
            network determining relative levels of signal one and two.

        Returns:
          Dict of control parameters.

        Raises:
          ValueError: If signal_one and signal_two are not the same length.
        """
        n_time_one = int(signal_one.shape[1])
        n_time_two = int(signal_two.shape[1])
        if n_time_one != n_time_two:
            raise ValueError(
                'The two signals must have the same length, but got lengths '
                '{} and {}.'.format(n_time_one, n_time_two))

        mix_level = tf.nn.sigmoid(nn_out_mix_level)
        mix_level = core.resample(mix_level, n_time_one)
        return {
            'signal_one': signal_one,
            'signal_two': signal_two,
            'mix_level': mix_level
        }
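For context, here is a rough, self-contained sketch of the same control standardization with made-up shapes, plus one plausible way mix_level could be consumed to crossfade the two signals (the crossfade is an assumption, not part of the snippet above):

import tensorflow as tf
from ddsp import core

signal_one = tf.random.normal([1, 64000])   # [batch, n_samples]
signal_two = tf.random.normal([1, 64000])
nn_out_mix_level = tf.zeros([1, 250, 1])    # raw network output, [batch, n_time, 1]

mix_level = tf.nn.sigmoid(nn_out_mix_level)                     # squash to [0, 1]
mix_level = core.resample(mix_level, int(signal_one.shape[1]))  # -> [1, 64000, 1]

# Hypothetical crossfade using the controls.
mixed = mix_level[..., 0] * signal_one + (1.0 - mix_level[..., 0]) * signal_two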
Example #3
def f0_summary(f0_hz, f0_hz_predict, step, name='f0_midi'):
    """Creates a plot comparison of ground truth f0_hz and predicted values."""
    batch_size = int(f0_hz.shape[0])

    # Resample predictions to match ground truth if they don't already.
    if f0_hz.shape[1] != f0_hz_predict.shape[1]:
        f0_hz_predict = core.resample(f0_hz_predict, f0_hz.shape[1])

    for i in range(batch_size):
        f0_midi = core.hz_to_midi(tf.squeeze(f0_hz[i]))
        f0_midi_predict = core.hz_to_midi(tf.squeeze(f0_hz_predict[i]))

        # Manually specify exact size of fig for tensorboard
        fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(6.0, 2.0))
        ax0.plot(f0_midi)
        ax0.plot(f0_midi_predict)
        ax0.set_title('original vs. predicted')

        ax1.plot(f0_midi_predict)
        ax1.set_title('predicted')

        # Format and save plot to image
        prefix = f'{name}_' if name else ''
        tag = f'f0_midi/{prefix}{i + 1}'
        fig_summary(tag, fig, step)
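A hypothetical call, assuming fig_summary writes image summaries and therefore needs a default tf.summary writer (shapes and log directory below are made up):

import tensorflow as tf

f0_hz = 440.0 * tf.ones([1, 1000, 1])          # ground truth at 1000 frames
f0_hz_predict = 445.0 * tf.ones([1, 250, 1])   # predictions at a coarser frame rate

writer = tf.summary.create_file_writer('/tmp/f0_summaries')
with writer.as_default():
    f0_summary(f0_hz, f0_hz_predict, step=0, name='validation')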
Example #4
File: encoders.py  Project: rtchen/ddsp-pbe
    def expand_z(self, z, time_steps):
        """Make sure z has same temporal resolution as other conditioning."""
        # Add time dim of z if necessary.
        if len(z.shape) == 2:
            z = z[:, tf.newaxis, :]
        # Expand time dim of z if necessary.
        z_time_steps = int(z.shape[1])
        if z_time_steps != time_steps:
            z = core.resample(z, time_steps)
        return z
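A toy check of the same steps written out inline with made-up shapes: a static latent with no time axis gains one, then gets stretched to the requested number of steps.

import tensorflow as tf
from ddsp import core

z = tf.random.normal([4, 16])          # [batch, z_dims], no time axis
time_steps = 1000

if len(z.shape) == 2:
    z = z[:, tf.newaxis, :]            # -> [4, 1, 16]
if int(z.shape[1]) != time_steps:
    z = core.resample(z, time_steps)   # -> [4, 1000, 16]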
Example #5
    def _default_processing(self, features):
        """Always resample to `time_steps` and scale 'loudness_db' and 'f0_hz'."""
        for k in ['loudness_db', 'f0_hz']:
            features[k] = at_least_3d(features[k])
            features[k] = core.resample(features[k],
                                        n_timesteps=self.time_steps)
        # For NN training, scale frequency and loudness to the range [0, 1].
        # Log-scale f0 features. Shift loudness from [-1, 0] to [0, 1].
        features['f0_scaled'] = hz_to_midi(features['f0_hz']) / F0_RANGE
        features['ld_scaled'] = (features['loudness_db'] / LD_RANGE) + 1.0
        return features
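As a quick sanity check of the scaling above (F0_RANGE = 127.0 and LD_RANGE = 120.0 are assumed here, matching the usual DDSP preprocessing constants; input values are made up):

import ddsp

F0_RANGE = 127.0  # assumed
LD_RANGE = 120.0  # assumed

f0_scaled = ddsp.core.hz_to_midi(440.0) / F0_RANGE  # 69 / 127 ≈ 0.543
ld_scaled = (-60.0 / LD_RANGE) + 1.0                # 0.5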
Example #6
    def get_signal(self, amplitudes, wavetables, f0_hz):
        """Synthesize audio with additive synthesizer from controls.

    Args:
      amplitudes: Amplitude tensor of shape [batch, n_frames, 1]. Expects
        float32 that is strictly positive.
      wavetables: Tensor of shape [batch, n_frames, n_wavetable].
      f0_hz: The fundamental frequency in Hertz. Tensor of shape [batch,
        n_frames, 1].

    Returns:
      signal: A tensor of of shape [batch, n_samples].
    """
        wavetables = core.resample(wavetables, self.n_samples)
        signal = core.wavetable_synthesis(amplitudes=amplitudes,
                                          wavetables=wavetables,
                                          frequencies=f0_hz,
                                          n_samples=self.n_samples,
                                          sample_rate=self.sample_rate)
        return signal
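A rough end-to-end sketch of the same call path using the core functions directly, with a made-up single-cycle sine wavetable held constant over time (all shapes and values are illustrative only):

import numpy as np
import tensorflow as tf
from ddsp import core

n_frames, n_wavetable, n_samples, sample_rate = 100, 256, 16000, 16000

# One sine cycle, tiled across frames: [1, n_frames, n_wavetable].
one_cycle = tf.sin(tf.linspace(0.0, 2.0 * np.pi, n_wavetable))
wavetables = tf.tile(one_cycle[tf.newaxis, tf.newaxis, :], [1, n_frames, 1])

amplitudes = 0.5 * tf.ones([1, n_frames, 1])
f0_hz = 220.0 * tf.ones([1, n_frames, 1])

# Same pattern as get_signal() above: resample wavetables to audio rate, then synthesize.
wavetables = core.resample(wavetables, n_samples)
audio = core.wavetable_synthesis(amplitudes=amplitudes,
                                 wavetables=wavetables,
                                 frequencies=f0_hz,
                                 n_samples=n_samples,
                                 sample_rate=sample_rate)
print(audio.shape)  # (1, 16000)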
Example #7
File: encoders.py  Project: rtchen/ddsp-pbe
    def compute_f0(self, conditioning):
        """Compute fundamental frequency."""
        mag = self.spectral_fn(conditioning['audio'])
        mag = mag[:, :, :, tf.newaxis]
        x = self.resnet(mag)

        # Collapse the frequency dimension
        x_shape = x.shape.as_list()
        y = tf.reshape(x, [x_shape[0], x_shape[1], -1])
        # Project to f0_bins
        y = self.dense_out(y)

        # Treat the NN output as a probability distribution over MIDI values.
        # probs = tf.nn.softmax(y)  # softmax leads to NaNs
        probs = tf.nn.softplus(y) + 1e-3
        probs = probs / tf.reduce_sum(probs, axis=-1, keepdims=True)
        f0 = self._compute_unit_midi(probs)

        # Make same time resolution as original CREPE f0.
        n_timesteps = int(conditioning['f0_scaled'].shape[1])
        f0 = core.resample(f0, n_timesteps)
        return f0
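The snippet leaves self._compute_unit_midi out, but the idea is presumably an expectation over f0 bins. A hedged, stand-alone sketch of that step (the bin layout and the name expected_unit_midi are assumptions, not the project's actual implementation):

import tensorflow as tf

def expected_unit_midi(probs):
    """Expected value of a distribution over f0 bins, on a [0, 1] 'unit midi' scale."""
    n_bins = int(probs.shape[-1])
    bin_centers = tf.linspace(0.0, 1.0, n_bins)  # [n_bins]
    return tf.reduce_sum(probs * bin_centers, axis=-1, keepdims=True)  # [batch, time, 1]

# Toy check with made-up shapes: 1 example, 250 frames, 128 f0 bins.
logits = tf.random.normal([1, 250, 128])
probs = tf.nn.softplus(logits) + 1e-3
probs = probs / tf.reduce_sum(probs, axis=-1, keepdims=True)
f0_unit = expected_unit_midi(probs)  # [1, 250, 1]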