Example #1
  def __init__(self,
               rnn_channels=512,
               rnn_type='gru',
               z_dims=32,
               z_time_steps=250,
               other_encoders=None,
               **kwargs):
    super().__init__(other_encoders=other_encoders, **kwargs)
    self.z_time_steps = z_time_steps

    # Layers.
    self.z_norm = nn.Normalize('instance')
    self.rnn = nn.temporal_cnn(rnn_channels, 10)
    self.dense_out = nn.dense(z_dims)
    self.frame_shape = (360, 640, 3)
    self.cv_net = tf.keras.applications.ResNet50V2(include_top=False, weights='imagenet', input_shape=self.frame_shape, pooling=None)
    #TODO(sclarke): Change this for fine tuning
    self.cv_net.trainable = True
    print('Vision network layer count: %i' % len(self.cv_net.layers))
    # for l in self.cv_net.layers:
    #   if not('block7' in l.name or 'top' in l.name):
    #     l.trainable = False
    self.final_layers = tf.keras.Sequential(layers=[
                          tf.keras.layers.GlobalAveragePooling2D(),
                        ])
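The snippet above bolts a pretrained ResNet50V2 backbone onto the encoder as a per-frame visual feature extractor, with global average pooling to flatten each frame's feature map. A minimal standalone sketch of the same pattern using only standard tf.keras calls (the batch of frames is a hypothetical stand-in):

import tensorflow as tf

# Headless ImageNet backbone; the snippet above keeps it trainable for fine-tuning.
backbone = tf.keras.applications.ResNet50V2(
    include_top=False, weights='imagenet', input_shape=(360, 640, 3))
pool = tf.keras.layers.GlobalAveragePooling2D()

frames = tf.random.uniform([4, 360, 640, 3])  # hypothetical batch of video frames
features = pool(backbone(frames))             # -> [4, 2048] per-frame embeddings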
Example #2
File: encoders.py Project: ketan0/ddsp
 def __init__(self, net=None, f0_residual=True, **kwargs):
     """Constructor."""
     super().__init__(**kwargs)
     self.net = net
     self.f0_residual = f0_residual
     self.dense_out = tfkl.Dense(2)
     self.norm = nn.Normalize('layer')
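With f0_residual=True, the two-unit dense head presumably predicts a correction that is added back to the incoming f0 rather than an absolute value. A hedged sketch of how such a residual connection is typically applied (the call signature and variable names below are assumptions, not taken from this file):

def call(self, f0_scaled, z):  # hypothetical signature
  x = self.norm(self.net(z))
  out = self.dense_out(x)        # [..., 2] output head
  f0_pred = out[..., 0:1]
  if self.f0_residual:
    f0_pred += f0_scaled         # refine the conditioning f0 instead of replacing it
  return f0_pred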
Example #3
File: encoders.py Project: ketan0/ddsp
    def __init__(self,
                 rnn_channels=512,
                 rnn_type='gru',
                 z_dims=512,
                 mean_aggregate=False,
                 **kwargs):
        super().__init__(**kwargs)
        self.mean_aggregate = mean_aggregate

        # Layers.
        self.norm_in = nn.Normalize('instance')
        self.rnn = nn.Rnn(rnn_channels, rnn_type)
        self.dense_z = tfkl.Dense(z_dims)
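mean_aggregate presumably switches the encoder between a per-frame latent sequence and a single global latent averaged over time. A sketch of that aggregation step under this assumption:

import tensorflow as tf

z = tf.random.normal([2, 250, 512])  # hypothetical [batch, time, rnn_channels]
mean_aggregate = True
if mean_aggregate:
  z = tf.reduce_mean(z, axis=1, keepdims=True)  # -> [2, 1, 512], one z per clip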
Example #4
File: encoders.py Project: ketan0/ddsp
    def __init__(self,
                 net=None,
                 z_dims=128,
                 input_keys=('f0_scaled', 'ld_scaled'),
                 mfcc_bins=60,
                 fft_size=1024,
                 mel_bins=128,
                 pool_time=True,
                 **kwargs):
        self.input_keys = input_keys
        super().__init__(input_keys, **kwargs)
        self.compute_mfccs = 'audio' in self.input_keys
        self.mfcc_bins = mfcc_bins
        self.fft_size = fft_size
        self.mel_bins = mel_bins
        self.pool_time = pool_time

        # Layers.
        self.net = net
        self.norm = nn.Normalize('layer')
        self.dense_out = tfkl.Dense(z_dims)
        if self.compute_mfccs:
            self.norm_mfcc = nn.Normalize('instance')
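The MFCC state (mfcc_bins, fft_size, mel_bins and the extra instance norm) only matters when 'audio' appears in input_keys. Assuming the public ddsp.spectral_ops.compute_mfcc API, the stored parameters would be consumed roughly like this (a sketch, not the file's actual forward pass):

import ddsp
import tensorflow as tf

audio = tf.random.uniform([1, 64000])  # hypothetical 4 s of 16 kHz audio
mfccs = ddsp.spectral_ops.compute_mfcc(
    audio, fft_size=1024, mel_bins=128, mfcc_bins=60, overlap=0.75)
# Instance norm, net, and dense_out would follow; pool_time=True would then
# average the resulting sequence over its time axis.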
Example #5
File: encoders.py Project: ketan0/ddsp
    def __init__(self,
                 fft_sizes=(1024, ),
                 mel_bins=(128, ),
                 mfcc_bins=(30, ),
                 time_steps=250,
                 **kwargs):
        super().__init__(**kwargs)
        self.fft_sizes = ddsp.core.make_iterable(fft_sizes)
        self.mel_bins = ddsp.core.make_iterable(mel_bins)
        self.mfcc_bins = ddsp.core.make_iterable(mfcc_bins)
        self.time_steps = time_steps

        # Layers.
        self.norm_out = nn.Normalize('instance')
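Here the tuple-valued arguments allow MFCCs at several resolutions, resampled to a shared time_steps length before concatenation. A sketch of the implied loop, assuming the public ddsp.spectral_ops.compute_mfcc and ddsp.core.resample helpers:

import ddsp
import tensorflow as tf

audio = tf.random.uniform([1, 64000])  # hypothetical input clip
zs = []
for fft_size, mel, mfcc in zip((1024,), (128,), (30,)):
  m = ddsp.spectral_ops.compute_mfcc(
      audio, fft_size=fft_size, mel_bins=mel, mfcc_bins=mfcc, overlap=0.75)
  zs.append(ddsp.core.resample(m, 250))  # align every scale to time_steps frames
z = tf.concat(zs, axis=-1)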
Example #6
  def __init__(self,
               rnn_channels=512,
               rnn_type='gru',
               z_dims=32,
               mfcc_time_steps=250,
               z_time_steps=250,
               sample_rate=16000,
               other_encoders=None,
               tcnn_kernel=7,
               **kwargs):
    super().__init__(other_encoders=other_encoders, **kwargs)
    if mfcc_time_steps not in [63, 125, 250, 500, 1000]:
      raise ValueError(
          '`mfcc_time_steps` currently limited to 63,125,250,500 and 1000')
    self.z_audio_spec = {
        '63': {
            'fft_size': 2048,
            'overlap': 0.5
        },
        '125': {
            'fft_size': 1024,
            'overlap': 0.5
        },
        '250': {
            'fft_size': 1024,
            'overlap': 0.75
        },
        '500': {
            'fft_size': 512,
            'overlap': 0.75
        },
        '1000': {
            'fft_size': 256,
            'overlap': 0.75
        }
    }
    self.fft_size = self.z_audio_spec[str(mfcc_time_steps)]['fft_size']
    self.overlap = self.z_audio_spec[str(mfcc_time_steps)]['overlap']
    self.sample_rate = sample_rate
    if z_time_steps:
      print('Z time steps: %i' % z_time_steps)
      self.z_time_steps = z_time_steps

    # Layers.
    self.z_norm = nn.Normalize('instance')
    self.rnn = nn.Rnn(rnn_channels, rnn_type)
    self.tcnn = nn.temporal_cnn(rnn_channels, tcnn_kernel, causal=False)
    self.dense_out = tfkl.Dense(z_dims)
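The fft_size/overlap table is what makes each mfcc_time_steps value come out: the STFT hop is fft_size * (1 - overlap), and with 4-second clips at the 16 kHz default sample rate (64000 samples) the frame count is the sample count divided by the hop. A quick check of that arithmetic (the 4 s clip length is an assumption; the 63 case relies on end-padding):

for steps, (fft_size, overlap) in {63: (2048, 0.5), 125: (1024, 0.5),
                                   250: (1024, 0.75), 500: (512, 0.75),
                                   1000: (256, 0.75)}.items():
  hop = int(fft_size * (1 - overlap))
  print(steps, 64000 // hop)  # 63 -> 62 (+1 padded frame), then 125, 250, 500, 1000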
Example #7
  def __init__(self,
               rnn_channels=512,
               rnn_type='gru',
               z_time_steps=250,
               input_keys=['discriminator_audio', 'f0_hz', 'ld_scaled'],
               spectral_op='compute_mfcc',
               **kwargs):
    # Move the input key that contains audio to the front.
    input_keys = sorted(input_keys, key=lambda i: 'audio' not in i)
    if len(input_keys) > 1:
      assert 'audio' not in input_keys[1], (
          'This discriminator only handles a single audio input.')
    super().__init__(**kwargs, input_keys=input_keys)
    if z_time_steps not in [63, 125, 250, 500, 1000]:
      raise ValueError(
          '`z_time_steps` currently limited to 63,125,250,500 and 1000')
    self.z_audio_spec = {
        '63': {
            'fft_size': 2048,
            'overlap': 0.5
        },
        '125': {
            'fft_size': 1024,
            'overlap': 0.5
        },
        '250': {
            'fft_size': 1024,
            'overlap': 0.75
        },
        '500': {
            'fft_size': 512,
            'overlap': 0.75
        },
        '1000': {
            'fft_size': 256,
            'overlap': 0.75
        }
    }
    self.fft_size = self.z_audio_spec[str(z_time_steps)]['fft_size']
    self.spectral_op = spectral_op
    self.overlap = self.z_audio_spec[str(z_time_steps)]['overlap']

    # Layers.
    self.z_norm = nn.Normalize('layer')
    self.rnn = nn.Rnn(rnn_channels, rnn_type)
    self.dense_out = tfkl.Dense(1)
    self.confidence = tfkl.Dense(1)
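The key-sorting trick at the top works because False orders before True, so any key containing 'audio' moves to the front while the remaining keys keep their relative order. A quick demonstration with the default keys:

keys = ['f0_hz', 'ld_scaled', 'discriminator_audio']
print(sorted(keys, key=lambda k: 'audio' not in k))
# ['discriminator_audio', 'f0_hz', 'ld_scaled']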
Example #8
    def __init__(self,
                 net=None,
                 f0_residual=True,
                 norm=True,
                 output_splits=(('f0_midi', 1), ('amplitudes', 1),
                                ('harmonic_distribution', 60),
                                ('magnitudes', 65)),
                 **kwargs):
        """Constructor."""
        self.output_splits = output_splits
        self.n_out = sum([v[1] for v in output_splits])
        output_keys = [v[0] for v in output_splits] + ['f0_hz']
        super().__init__(output_keys=output_keys, **kwargs)

        # Layers.
        self.net = net
        self.f0_residual = f0_residual
        self.dense_out = tfkl.Dense(self.n_out)
        self.norm = nn.Normalize('layer') if norm else None
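output_splits fixes the width of dense_out: for the defaults above, n_out = 1 + 1 + 60 + 65 = 127. The flat network output is then typically split back into named heads; a sketch with plain tf.split (the real class may use a ddsp helper for this):

import tensorflow as tf

output_splits = (('f0_midi', 1), ('amplitudes', 1),
                 ('harmonic_distribution', 60), ('magnitudes', 65))
sizes = [size for _, size in output_splits]
x = tf.random.normal([2, 250, sum(sizes)])  # hypothetical dense_out activations
outputs = dict(zip([key for key, _ in output_splits],
                   tf.split(x, sizes, axis=-1)))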
Example #9
File: encoders.py Project: noetits/ddsp
  def __init__(self,
               rnn_channels=512,
               rnn_type='gru',
               z_dims=32,
               z_time_steps=250,
               f0_encoder=None,
               name='mfcc_time_distributed_rnn_encoder'):
    super(MfccTimeDistributedRnnEncoder, self).__init__(
        f0_encoder=f0_encoder, name=name)
    if z_time_steps not in [63, 125, 250, 500, 1000]:
      raise ValueError(
          '`z_time_steps` currently limited to 63,125,250,500 and 1000')
    self.z_audio_spec = {
        63: {
            'fft_size': 2048,
            'overlap': 0.5
        },
        125: {
            'fft_size': 1024,
            'overlap': 0.5
        },
        250: {
            'fft_size': 1024,
            'overlap': 0.75
        },
        500: {
            'fft_size': 512,
            'overlap': 0.75
        },
        1000: {
            'fft_size': 256,
            'overlap': 0.75
        }
    }
    self.fft_size = self.z_audio_spec[z_time_steps]['fft_size']
    self.overlap = self.z_audio_spec[z_time_steps]['overlap']

    # Layers.
    self.z_norm = nn.Normalize('instance')
    self.rnn = nn.rnn(rnn_channels, rnn_type)
    self.dense_out = nn.dense(z_dims)
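This snippet names its class, so instantiation is straightforward (assuming the noetits/ddsp fork exposes it from ddsp.training.encoders; the arguments are the defaults shown above):

from ddsp.training import encoders

encoder = encoders.MfccTimeDistributedRnnEncoder(
    rnn_channels=512, rnn_type='gru', z_dims=32, z_time_steps=250)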
Example #10
File: encoders.py Project: ketan0/ddsp
    def __init__(self,
                 rnn_channels=512,
                 rnn_type='gru',
                 z_dims=32,
                 z_time_steps=250,
                 **kwargs):
        super().__init__(**kwargs)
        if z_time_steps not in [63, 125, 250, 500, 1000]:
            raise ValueError(
                '`z_time_steps` currently limited to 63,125,250,500 and 1000')
        self.z_audio_spec = {
            '63': {
                'fft_size': 2048,
                'overlap': 0.5
            },
            '125': {
                'fft_size': 1024,
                'overlap': 0.5
            },
            '250': {
                'fft_size': 1024,
                'overlap': 0.75
            },
            '500': {
                'fft_size': 512,
                'overlap': 0.75
            },
            '1000': {
                'fft_size': 256,
                'overlap': 0.75
            }
        }
        self.fft_size = self.z_audio_spec[str(z_time_steps)]['fft_size']
        self.overlap = self.z_audio_spec[str(z_time_steps)]['overlap']

        # Layers.
        self.z_norm = nn.Normalize('instance')
        self.rnn = nn.Rnn(rnn_channels, rnn_type)
        self._enc_mu_log_var = tfkl.Dense(2 * z_dims)
        self.z_time_steps = z_time_steps
        self.z_dims = z_dims
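The Dense(2 * z_dims) head strongly suggests a variational encoder: the output is split into a mean and a log-variance, and z is sampled with the reparameterization trick. A hedged sketch of that step (not taken verbatim from this file):

import tensorflow as tf

z_dims = 32
mu_log_var = tf.random.normal([2, 250, 2 * z_dims])  # stand-in for _enc_mu_log_var(x)
mu, log_var = tf.split(mu_log_var, 2, axis=-1)
eps = tf.random.normal(tf.shape(mu))
z = mu + tf.exp(0.5 * log_var) * eps  # differentiable sampling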