def update_state(self, batch, f0_hz_predict):
  """Update metrics based on a batch of audio.

  Args:
    batch: Dictionary of input features.
    f0_hz_predict: Batch of encoded f0, same as input f0 if no f0 encoder.
  """
  batch_size = int(f0_hz_predict.shape[0])
  # Match number of timesteps.
  if f0_hz_predict.shape[1] != batch['f0_hz'].shape[1]:
    # f0_hz_predict = core.resample(f0_hz_predict,
    #                               batch['f0_hz'].shape[1]).numpy()
    batch['f0_hz'] = core.resample(
        batch['f0_hz'], f0_hz_predict.shape[1]).numpy()
    batch['f0_confidence'] = core.resample(
        batch['f0_confidence'], f0_hz_predict.shape[1]).numpy()

  # Compute metrics per sample. No batch operations possible.
  for i in range(batch_size):
    f0_hz_gt = batch['f0_hz'][i]
    f0_conf_gt = batch['f0_confidence'][i]
    if not is_outlier(f0_conf_gt):
      # Ground truth f0 was reliable; proceed with metrics.
      # Compute distance between original f0_hz labels and f0 encoder values.
      # Resample if f0 encoder has a different number of time steps.
      # TODO(hanoih): compare f0_hz against frame_rate * len_sec
      f0_hz = f0_hz_predict[i]
      f0_dist = f0_dist_conf_thresh(f0_hz_gt, f0_hz, f0_conf_gt)
      self.metrics['f0_dist'].update_state(f0_dist)

      f0_hz_gt = np.squeeze(f0_hz_gt)
      f0_hz = np.squeeze(f0_hz)
      voiced_gt = mir_eval.melody.freq_to_voicing(f0_hz_gt)[1]
      cents_gt = mir_eval.melody.hz2cents(f0_hz_gt)
      cents_est = mir_eval.melody.hz2cents(f0_hz)
      rca = mir_eval.melody.raw_chroma_accuracy(
          voiced_gt, cents_gt, voiced_gt, cents_est,
          cent_tolerance=self._rpa_tolerance)
      rpa = mir_eval.melody.raw_pitch_accuracy(
          voiced_gt, cents_gt, voiced_gt, cents_est,
          cent_tolerance=self._rpa_tolerance)
      self.metrics['raw_chroma_accuracy'].update_state(rca)
      self.metrics['raw_pitch_accuracy'].update_state(rpa)
      log_str = (
          f'{self._name} | sample {i} | f0_dist(midi): {f0_dist:.3f} '
          f'raw_chroma_accuracy: {rca:.3f} '
          f'raw_pitch_accuracy: {rpa:.3f}')
      logging.info(log_str)
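# --- Example (not part of the library) ---------------------------------------
# Self-contained sketch of how the mir_eval calls above score a single example;
# the frequency values are made up for illustration.
import mir_eval
import numpy as np

f0_hz_gt = np.array([440.0, 440.0, 0.0, 220.0])     # 0.0 marks an unvoiced frame.
f0_hz_est = np.array([445.0, 440.0, 100.0, 110.0])  # last frame is an octave off.

voiced_gt = mir_eval.melody.freq_to_voicing(f0_hz_gt)[1]
cents_gt = mir_eval.melody.hz2cents(f0_hz_gt)
cents_est = mir_eval.melody.hz2cents(f0_hz_est)

# Only ground-truth-voiced frames are scored. The octave error counts as wrong
# for raw pitch accuracy but as correct for raw chroma accuracy.
rpa = mir_eval.melody.raw_pitch_accuracy(
    voiced_gt, cents_gt, voiced_gt, cents_est, cent_tolerance=50)
rca = mir_eval.melody.raw_chroma_accuracy(
    voiced_gt, cents_gt, voiced_gt, cents_est, cent_tolerance=50)
# rpa ~= 0.667, rca = 1.0 for this toy input.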
def get_controls(self, signal_one: tf.Tensor, signal_two: tf.Tensor,
                 nn_out_mix_level: tf.Tensor) -> TensorDict:
  """Standardize inputs to same length, mix_level to range [0, 1].

  Args:
    signal_one: 2-D or 3-D tensor.
    signal_two: 2-D or 3-D tensor.
    nn_out_mix_level: Tensor of shape [batch, n_time, 1] output of the network
      determining relative levels of signal one and two.

  Returns:
    Dict of control parameters.

  Raises:
    ValueError: If signal_one and signal_two are not the same length.
  """
  n_time_one = int(signal_one.shape[1])
  n_time_two = int(signal_two.shape[1])

  if n_time_one != n_time_two:
    raise ValueError(
        'The two signals must have the same length instead of '
        '{} and {}'.format(n_time_one, n_time_two))

  mix_level = tf.nn.sigmoid(nn_out_mix_level)
  mix_level = core.resample(mix_level, n_time_one)
  return {
      'signal_one': signal_one,
      'signal_two': signal_two,
      'mix_level': mix_level
  }
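# --- Example (not part of the library) ---------------------------------------
# Standalone sketch of the control standardization above, with made-up shapes:
# a coarse, unbounded per-frame mix signal is squashed to [0, 1] and upsampled
# to the signals' time dimension.
import tensorflow as tf
import ddsp

signal_one = tf.random.normal([1, 16000])          # e.g. harmonic audio.
signal_two = tf.random.normal([1, 16000])          # e.g. filtered noise.
nn_out_mix_level = tf.random.normal([1, 250, 1])   # raw network output.

mix_level = tf.nn.sigmoid(nn_out_mix_level)        # values now in [0, 1].
mix_level = ddsp.core.resample(mix_level, 16000)   # -> [1, 16000, 1].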
def f0_summary(f0_hz, f0_hz_predict, step, name='f0_midi'):
  """Creates a plot comparison of ground truth f0_hz and predicted values."""
  batch_size = int(f0_hz.shape[0])
  # Resample predictions to match ground truth if they don't already.
  if f0_hz.shape[1] != f0_hz_predict.shape[1]:
    f0_hz_predict = core.resample(f0_hz_predict, f0_hz.shape[1])

  # Prefix tags with the summary name, if one is given.
  name = name + '_' if name else ''

  for i in range(batch_size):
    f0_midi = core.hz_to_midi(tf.squeeze(f0_hz[i]))
    f0_midi_predict = core.hz_to_midi(tf.squeeze(f0_hz_predict[i]))

    # Manually specify exact size of fig for tensorboard.
    fig, (ax0, ax1) = plt.subplots(1, 2, figsize=(6.0, 2.0))
    ax0.plot(f0_midi)
    ax0.plot(f0_midi_predict)
    ax0.set_title('original vs. predicted')
    ax1.plot(f0_midi_predict)
    ax1.set_title('predicted')

    # Format and save plot to image.
    tag = f'f0_midi/{name}_{i + 1}'
    fig_summary(tag, fig, step)
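# --- Example (not part of the library) ---------------------------------------
# Quick check of the Hz -> MIDI mapping used for plotting above; the values are
# standard tuning facts (A4 = 440 Hz = MIDI note 69).
import tensorflow as tf
import ddsp

midi = ddsp.core.hz_to_midi(tf.constant([220.0, 440.0, 880.0]))
# -> approximately [57., 69., 81.]: each octave adds 12 MIDI notes.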
def expand_z(self, z, time_steps):
  """Make sure z has same temporal resolution as other conditioning."""
  # Add time dim of z if necessary.
  if len(z.shape) == 2:
    z = z[:, tf.newaxis, :]
  # Expand time dim of z if necessary.
  z_time_steps = int(z.shape[1])
  if z_time_steps != time_steps:
    z = core.resample(z, time_steps)
  return z
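# --- Example (not part of the library) ---------------------------------------
# Sketch with made-up dimensions of what expand_z does to a latent code: a
# per-frame z from the encoder is stretched along time so it can be
# concatenated with the other per-frame conditioning (f0, loudness).
import tensorflow as tf
import ddsp

z = tf.random.normal([8, 250, 16])    # [batch, z_time_steps, z_dims].
z = ddsp.core.resample(z, 1000)       # -> [8, 1000, 16].
# A static code of shape [8, 16] would first get a time axis: z[:, tf.newaxis, :].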
def _default_processing(self, features):
  """Always resample to `time_steps` and scale 'loudness_db' and 'f0_hz'."""
  for k in ['loudness_db', 'f0_hz']:
    features[k] = at_least_3d(features[k])
    features[k] = core.resample(features[k], n_timesteps=self.time_steps)

  # For NN training, scale frequency and loudness to the range [0, 1].
  # Log-scale f0 features. Loudness from [-LD_RANGE, 0] dB to [0, 1].
  features['f0_scaled'] = hz_to_midi(features['f0_hz']) / F0_RANGE
  features['ld_scaled'] = (features['loudness_db'] / LD_RANGE) + 1.0
  return features
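# --- Example (not part of the library) ---------------------------------------
# Worked example of the scaling above, assuming the constants F0_RANGE = 127.0
# (full MIDI range) and LD_RANGE = 120.0 dB; check the module's definitions if
# they differ.
import ddsp

f0_hz = 440.0        # A4.
loudness_db = -30.0

f0_scaled = float(ddsp.core.hz_to_midi(f0_hz)) / 127.0   # 69 / 127 ~= 0.543.
ld_scaled = (loudness_db / 120.0) + 1.0                   # -0.25 + 1.0 = 0.75.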
def get_signal(self, amplitudes, wavetables, f0_hz):
  """Synthesize audio with wavetable synthesizer from controls.

  Args:
    amplitudes: Amplitude tensor of shape [batch, n_frames, 1]. Expects
      float32 that is strictly positive.
    wavetables: Tensor of shape [batch, n_frames, n_wavetable].
    f0_hz: The fundamental frequency in Hertz. Tensor of shape
      [batch, n_frames, 1].

  Returns:
    signal: A tensor of shape [batch, n_samples].
  """
  wavetables = core.resample(wavetables, self.n_samples)
  signal = core.wavetable_synthesis(amplitudes=amplitudes,
                                    wavetables=wavetables,
                                    frequencies=f0_hz,
                                    n_samples=self.n_samples,
                                    sample_rate=self.sample_rate)
  return signal
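# --- Example (not part of the library) ---------------------------------------
# Toy sketch of calling the underlying op the same way get_signal does, with
# made-up shapes: one example, 250 control frames, a 64-point single-cycle sine
# wavetable, and one second of audio at 16 kHz.
import numpy as np
import tensorflow as tf
import ddsp

n_samples, sample_rate = 16000, 16000
amplitudes = 0.1 * tf.ones([1, 250, 1])
f0_hz = 220.0 * tf.ones([1, 250, 1])
sine_table = tf.sin(tf.linspace(0.0, 2.0 * np.pi, 64))
wavetables = tf.tile(sine_table[tf.newaxis, tf.newaxis, :], [1, 250, 1])

wavetables = ddsp.core.resample(wavetables, n_samples)  # match n_samples, as above.
audio = ddsp.core.wavetable_synthesis(
    amplitudes=amplitudes,
    wavetables=wavetables,
    frequencies=f0_hz,
    n_samples=n_samples,
    sample_rate=sample_rate)  # -> [1, 16000].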
def compute_f0(self, conditioning):
  """Compute fundamental frequency."""
  mag = self.spectral_fn(conditioning['audio'])
  mag = mag[:, :, :, tf.newaxis]
  x = self.resnet(mag)

  # Collapse the frequency dimension.
  x_shape = x.shape.as_list()
  y = tf.reshape(x, [x_shape[0], x_shape[1], -1])

  # Project to f0_bins.
  y = self.dense_out(y)

  # Treat the NN output as a probability distribution over midi values.
  # probs = tf.nn.softmax(y)  # softmax leads to NaNs
  probs = tf.nn.softplus(y) + 1e-3
  probs = probs / tf.reduce_sum(probs, axis=-1, keepdims=True)

  f0 = self._compute_unit_midi(probs)

  # Make same time resolution as original CREPE f0.
  n_timesteps = int(conditioning['f0_scaled'].shape[1])
  f0 = core.resample(f0, n_timesteps)
  return f0
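# --- Example (not part of the library) ---------------------------------------
# Standalone illustration (toy shapes) of the softplus normalization used in
# place of softmax above: the result is strictly positive and sums to 1 along
# the last axis, without the exponential that was producing NaNs.
import tensorflow as tf

y = tf.random.normal([2, 1000, 128])   # [batch, time, f0_bins] raw outputs.
probs = tf.nn.softplus(y) + 1e-3
probs = probs / tf.reduce_sum(probs, axis=-1, keepdims=True)
# tf.reduce_sum(probs, axis=-1) is ~1.0 everywhere.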