示例#1
0
文件: encoders.py 项目: ketan0/ddsp
    def compute_z(self, *inputs):
        """Encode the given input tensors into the latent `z`.

        Args:
            *inputs: Tensors that are concatenated along the last axis and fed
                to `self.net`. When `self.compute_mfccs` is set, the entry at
                the position of 'audio' in `self.input_keys` is removed and
                normalized MFCCs computed from it are appended instead.
                NOTE(review): this assumes the positional inputs follow the
                order of `self.input_keys` -- confirm against the caller.

        Returns:
            The output of `self.dense_out`. When `self.pool_time` is set it is
            averaged over axis 1, which is kept as a size-1 axis.
        """
        # `*inputs` arrives as an immutable tuple; the MFCC branch below needs
        # `pop`/`append`, so work on a list copy.
        inputs = list(inputs)

        if self.compute_mfccs:
            audio_idx = self.input_keys.index('audio')
            audio = inputs.pop(audio_idx)
            # Target length is axis 1 of the first remaining input
            # (presumably the time axis -- TODO confirm). At least one
            # non-audio input is therefore required.
            n_t = inputs[0].shape[1]

            mfccs = spectral_ops.compute_mfcc(audio,
                                              lo_hz=20.0,
                                              hi_hz=8000.0,
                                              fft_size=self.fft_size,
                                              mel_bins=self.mel_bins,
                                              mfcc_bins=self.mfcc_bins)
            mfccs_scaled = self.norm_mfcc(mfccs)
            # Resample so the MFCCs can be concatenated with the other inputs.
            mfccs_scaled = ddsp.core.resample(mfccs_scaled, n_t)
            inputs.append(mfccs_scaled)

        x = tf.concat(inputs, axis=-1)
        z = self.net(x)
        z = self.norm(z)
        z = self.dense_out(z)

        if self.pool_time:
            z = tf.reduce_mean(z, axis=1, keepdims=True)

        return z
示例#2
0
 def call(self, audio, *conditioning):
   """Encode `audio` (plus optional conditioning) into a pooled latent `z`.

   Args:
     audio: Audio passed to the spectral op selected by `self.spectral_op`.
     *conditioning: Extra tensors; each is resampled to the size of axis 1 of
       the spectral features and concatenated with them along the last axis.

   Returns:
     The `self.dense_out` projection of the RNN output, reduced over axis 1
     (kept as a size-1 axis) as an average weighted by
     `sigmoid(self.confidence(...))`.

   Raises:
     ValueError: If `self.spectral_op` is neither 'compute_mfcc' nor
       'compute_logmag'.
   """
   if self.spectral_op == 'compute_mfcc':
     z = spectral_ops.compute_mfcc(
         audio,
         lo_hz=20.0,
         hi_hz=8000.0,
         fft_size=self.fft_size,
         mel_bins=128,
         mfcc_bins=30,
         overlap=self.overlap,
         pad_end=True)
   elif self.spectral_op == 'compute_logmag':
     z = spectral_ops.compute_logmag(core.tf_float32(audio), size=self.fft_size)
   else:
     # Fail early with a clear message instead of hitting an unbound `z` below.
     raise ValueError(
         "Unsupported spectral_op {!r}; expected 'compute_mfcc' or "
         "'compute_logmag'.".format(self.spectral_op))

   # Normalize. A dummy axis is inserted for the normalization layer and
   # removed again right after.
   z = self.z_norm(z[:, :, tf.newaxis, :])[:, :, 0, :]
   n_timesteps = z.shape[1]
   # Bring every conditioning tensor to the same axis-1 length as the features.
   conditioning = [resample(c, n_timesteps) for c in conditioning]

   z = tf.concat([z] + conditioning, axis=-1)
   # Run an RNN over the latents.
   z = self.rnn(z)
   # Per-step weights in (0, 1), computed from the RNN output.
   w = tf.math.sigmoid(self.confidence(z))
   # Bounce down to compressed z dimensions.
   z = self.dense_out(z)
   # Weighted average over axis 1.
   z = tf.reduce_sum(z * w, axis=1, keepdims=True) / tf.reduce_sum(w, axis=1, keepdims=True)
   return z
示例#3
0
文件: encoders.py 项目: ketan0/ddsp
    def compute_z(self, audio):
        """Compute MFCCs of `audio` at several resolutions and normalize them.

        One MFCC tensor is computed per (fft_size, mel_bins, mfcc_bins) triple
        taken in lockstep from `self.fft_sizes`, `self.mel_bins` and
        `self.mfcc_bins`. Each is resampled to `self.time_steps`, and the
        results are concatenated along the last axis.

        Args:
            audio: Audio passed to `spectral_ops.compute_mfcc`.

        Returns:
            The concatenated MFCCs after `self.nom_out`.
        """
        resolutions = zip(self.fft_sizes, self.mel_bins, self.mfcc_bins)
        resampled = [
            ddsp.core.resample(
                spectral_ops.compute_mfcc(audio,
                                          lo_hz=20.0,
                                          hi_hz=8000.0,
                                          fft_size=size,
                                          mel_bins=n_mel,
                                          mfcc_bins=n_mfcc),
                self.time_steps)
            for size, n_mel, n_mfcc in resolutions
        ]
        stacked = tf.concat(resampled, axis=-1)

        # A dummy axis is inserted for the output layer and removed afterwards.
        # NOTE(review): `nom_out` may be a typo for `norm_out`; kept as-is
        # because the attribute is defined outside this method.
        expanded = stacked[:, :, tf.newaxis, :]
        return self.nom_out(expanded)[:, :, 0, :]
示例#4
0
文件: encoders.py 项目: VasLyber/DDSP
    def compute_z(self, conditioning):
        """Encode the audio in `conditioning` into the latent `z`.

        Args:
            conditioning: Mapping with an 'audio' entry, which is passed to
                `spectral_ops.compute_mfcc`.

        Returns:
            The output of `self.dense_out` applied to the RNN output over the
            normalized MFCCs.
        """
        audio = conditioning['audio']
        features = spectral_ops.compute_mfcc(
            audio,
            lo_hz=20.0,
            hi_hz=8000.0,
            fft_size=self.fft_size,
            mel_bins=128,
            mfcc_bins=30,
            overlap=self.overlap,
            pad_end=True,
        )

        # Normalize; a dummy axis is inserted for the layer and dropped after.
        expanded = features[:, :, tf.newaxis, :]
        normalized = self.z_norm(expanded)[:, :, 0, :]

        # RNN over the latents, then project down to the compressed z size.
        hidden = self.rnn(normalized)
        return self.dense_out(hidden)
示例#5
0
文件: encoders.py 项目: ketan0/ddsp
    def compute_z(self, audio):
        """Encode `audio` into a single latent vector per example.

        Args:
            audio: Audio passed to `spectral_ops.compute_mfcc`.

        Returns:
            The output of `self.dense_z`. Its input is the RNN output averaged
            over axis 1 when `self.mean_aggregate` is set; otherwise it is the
            RNN output concatenated along the last axis, with a new axis
            inserted at position 1.
        """
        features = spectral_ops.compute_mfcc(
            audio,
            lo_hz=20.0,
            hi_hz=8000.0,
            fft_size=1024,
            mel_bins=128,
            mfcc_bins=30,
        )
        # A dummy axis is inserted for the input norm layer and dropped after.
        normalized = self.norm_in(features[:, :, tf.newaxis, :])[:, :, 0, :]

        # The RNN runs in both modes; only the aggregation differs.
        rnn_out = self.rnn(normalized)
        if self.mean_aggregate:
            pooled = tf.reduce_mean(rnn_out, axis=1, keepdims=True)
        else:
            # presumably `self.rnn` returns a sequence of tensors in this mode
            # (e.g. final states) -- TODO confirm against the layer definition.
            joined = tf.concat(rnn_out, axis=-1)
            pooled = joined[:, tf.newaxis, :]

        # Bounce down to compressed dimensions.
        return self.dense_z(pooled)
示例#6
0
  def compute_z(self, audio):
    """Encode `audio` into the latent `z` via MFCCs, an RNN and a tcnn.

    Args:
      audio: Audio passed to `spectral_ops.compute_mfcc` together with
        `self.sample_rate`, `self.fft_size` and `self.overlap`.

    Returns:
      The output of `self.dense_out` after `self.rnn` and `self.tcnn` have
      been applied, in that order, to the normalized MFCCs.
    """
    mfcc_options = dict(
        sample_rate=self.sample_rate,
        lo_hz=4.0,
        hi_hz=16000.0,
        fft_size=self.fft_size,
        mel_bins=128,
        mfcc_bins=40,
        overlap=self.overlap,
        pad_end=True,
    )
    features = spectral_ops.compute_mfcc(audio, **mfcc_options)

    # Normalize; a dummy axis is inserted for the layer and dropped after.
    expanded = features[:, :, tf.newaxis, :]
    normalized = self.z_norm(expanded)[:, :, 0, :]

    # RNN over the latents, then the tcnn, then project down to z dimensions.
    hidden = self.rnn(normalized)
    hidden = self.tcnn(hidden)
    return self.dense_out(hidden)