def compute_z(self, *inputs):
    """Compute the latent z from a variable set of input features.

    If `self.compute_mfccs` is set, the 'audio' input (located via
    `self.input_keys`) is replaced by normalized MFCC features resampled to
    the time dimension of the remaining inputs before everything is
    concatenated and run through the network.

    Args:
      *inputs: Feature tensors, ordered to match `self.input_keys`.
        Assumed shape [batch, time, depth] — TODO confirm against callers.

    Returns:
      Latent tensor z; time axis is mean-pooled to length 1 when
      `self.pool_time` is set.
    """
    # BUG FIX: *inputs arrives as a tuple, which has no pop()/append().
    # Convert to a list so the MFCC branch below can mutate it.
    inputs = list(inputs)
    if self.compute_mfccs:
        audio_idx = self.input_keys.index('audio')
        audio = inputs.pop(audio_idx)
        # Resample MFCCs to the time length of the remaining inputs.
        n_t = inputs[0].shape[1]
        mfccs = spectral_ops.compute_mfcc(audio,
                                          lo_hz=20.0,
                                          hi_hz=8000.0,
                                          fft_size=self.fft_size,
                                          mel_bins=self.mel_bins,
                                          mfcc_bins=self.mfcc_bins)
        mfccs_scaled = self.norm_mfcc(mfccs)
        mfccs_scaled = ddsp.core.resample(mfccs_scaled, n_t)
        inputs.append(mfccs_scaled)
    x = tf.concat(inputs, axis=-1)
    z = self.net(x)
    z = self.norm(z)
    z = self.dense_out(z)
    if self.pool_time:
        # Collapse the time axis to a single averaged frame.
        z = tf.reduce_mean(z, axis=1, keepdims=True)
    return z
def call(self, audio, *conditioning):
    """Encode audio (plus optional conditioning streams) into one latent frame.

    A spectral front-end (MFCC or log-magnitude) is normalized, concatenated
    with resampled conditioning, run through an RNN, and pooled over time
    with learned confidence weights.
    """
    # Spectral front-end.
    if self.spectral_op == 'compute_mfcc':
        z = spectral_ops.compute_mfcc(
            audio,
            lo_hz=20.0,
            hi_hz=8000.0,
            fft_size=self.fft_size,
            mel_bins=128,
            mfcc_bins=30,
            overlap=self.overlap,
            pad_end=True)
    elif self.spectral_op == 'compute_logmag':
        z = spectral_ops.compute_logmag(core.tf_float32(audio), size=self.fft_size)

    # Normalize; the norm layer takes a 4-D tensor, so insert and strip a
    # dummy axis around the call.
    z = self.z_norm(z[:, :, tf.newaxis, :])[:, :, 0, :]

    # Bring every conditioning stream to the latent frame rate, then fuse.
    n_frames = z.shape[1]
    resampled = [resample(c, n_frames) for c in conditioning]
    z = tf.concat([z] + resampled, axis=-1)

    # Temporal model over the latents.
    z = self.rnn(z)

    # Confidence-weighted average over time down to a single frame.
    w = tf.math.sigmoid(self.confidence(z))
    z = self.dense_out(z)
    weighted_sum = tf.reduce_sum(z * w, axis=1, keepdims=True)
    total_weight = tf.reduce_sum(w, axis=1, keepdims=True)
    return weighted_sum / total_weight
def compute_z(self, audio):
    """Multi-resolution MFCC latents: one MFCC per FFT size, resampled to a
    common length, concatenated, and normalized."""
    features = []
    for fft_size, n_mels, n_mfccs in zip(self.fft_sizes,
                                         self.mel_bins,
                                         self.mfcc_bins):
        mfcc = spectral_ops.compute_mfcc(audio,
                                         lo_hz=20.0,
                                         hi_hz=8000.0,
                                         fft_size=fft_size,
                                         mel_bins=n_mels,
                                         mfcc_bins=n_mfccs)
        # Align every resolution to the same number of time steps.
        features.append(ddsp.core.resample(mfcc, self.time_steps))
    stacked = tf.concat(features, axis=-1)
    # NOTE(review): 'nom_out' looks like a typo for 'norm_out' — confirm
    # against the (unseen) __init__ before renaming.
    # The norm layer takes 4-D input, hence the dummy axis round-trip.
    return self.nom_out(stacked[:, :, tf.newaxis, :])[:, :, 0, :]
def compute_z(self, conditioning):
    """Encode conditioning['audio'] into latents: MFCC -> norm -> RNN -> dense.

    Args:
      conditioning: Mapping with an 'audio' entry — assumed [batch, samples];
        TODO confirm against callers.

    Returns:
      Latent tensor z.
    """
    mfccs = spectral_ops.compute_mfcc(conditioning['audio'],
                                      lo_hz=20.0,
                                      hi_hz=8000.0,
                                      fft_size=self.fft_size,
                                      mel_bins=128,
                                      mfcc_bins=30,
                                      overlap=self.overlap,
                                      pad_end=True)
    # Normalize; norm layer expects 4-D input, so add/strip a dummy axis.
    z = self.z_norm(mfccs[:, :, tf.newaxis, :])[:, :, 0, :]
    # Temporal model over the latents.
    z = self.rnn(z)
    # Project down to the compressed z dimensionality.
    return self.dense_out(z)
def compute_z(self, audio):
    """Encode audio into a single latent frame via MFCC features and an RNN.

    Aggregation over time is either a mean (`self.mean_aggregate`) or a
    concatenation of the RNN output along the channel axis.
    """
    mfccs = spectral_ops.compute_mfcc(audio,
                                      lo_hz=20.0,
                                      hi_hz=8000.0,
                                      fft_size=1024,
                                      mel_bins=128,
                                      mfcc_bins=30)
    # Normalize; norm layer expects 4-D input, so add/strip a dummy axis.
    z = self.norm_in(mfccs[:, :, tf.newaxis, :])[:, :, 0, :]
    # Both aggregation modes run the same RNN first, so hoist the call.
    z = self.rnn(z)
    if self.mean_aggregate:
        z = tf.reduce_mean(z, axis=1, keepdims=True)
    else:
        # Presumably the RNN returns a sequence of outputs/states to fuse
        # along the channel axis — TODO confirm the RNN's return structure.
        z = tf.concat(z, axis=-1)[:, tf.newaxis, :]
    # Project down to the compressed z dimensionality.
    return self.dense_z(z)
def compute_z(self, audio):
    """Encode audio into latents: MFCC -> norm -> RNN -> TCNN -> dense.

    Uses a wider analysis band (4 Hz – 16 kHz, 40 MFCC bins) than the other
    encoders in this file.
    """
    mfccs = spectral_ops.compute_mfcc(
        audio,
        sample_rate=self.sample_rate,
        lo_hz=4.0,
        hi_hz=16000.0,
        fft_size=self.fft_size,
        mel_bins=128,
        mfcc_bins=40,
        overlap=self.overlap,
        pad_end=True)
    # Normalize; norm layer expects 4-D input, so add/strip a dummy axis.
    z = self.z_norm(mfccs[:, :, tf.newaxis, :])[:, :, 0, :]
    # Recurrent then temporal-convolutional modeling of the latents.
    z = self.rnn(z)
    z = self.tcnn(z)
    # Project down to the compressed z dimensionality.
    return self.dense_out(z)