import math

import torch
from torch.distributions import Independent, Normal

# Project-local helpers (get_window_hop, resample_frames, hz_to_midi, util)
# are assumed to be importable from the surrounding package.


def generate(self, synth, h_0, f0_hz, enc_frame_setting='fine', n_samples=16000):
    """
    synth: synth to generate audio
    h_0: initial state of RNN [batch, latent_dims]
    f0_hz: f0 conditioning of synth [batch, f0_n_frames, 1]
    enc_frame_setting: fft/hop size
    n_samples: output audio length in samples
    """
    h = h_0
    n_fft, hop_length = get_window_hop(enc_frame_setting)
    n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
    f0_hz = resample_frames(f0_hz, n_frames)  # needs to have same dimension as z
    z = torch.zeros(h_0.shape[0], n_frames, self.latent_dims).to(h.device)
    for t in range(n_frames):
        # prior distribution with rnn information
        mu_p_t, scale_p_t = self.get_prior(h)
        prior_t = Independent(Normal(mu_p_t, scale_p_t), 1)
        prior_sample_t = prior_t.rsample()
        h = self.temporal(prior_sample_t, h)
        z[:, t, :] = prior_sample_t
    cond = {}
    cond['z'] = z
    cond['f0_hz'] = f0_hz
    y_params = self.decode(cond)
    params = synth.fill_params(y_params, cond)
    resyn_audio, outputs = synth(params, n_samples)
    return params, resyn_audio
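
# A minimal usage sketch for the prior-rollout generate() above. `model` and
# `synth` are hypothetical stand-ins for a trained instance of this class and
# a synth exposing fill_params(); shapes follow the docstring.
batch = 4
h_0 = torch.zeros(batch, model.latent_dims)  # initial RNN state
f0_hz = torch.full((batch, 250, 1), 440.0)   # constant 440 Hz f0 track
params, audio = model.generate(synth, h_0, f0_hz,
                               enc_frame_setting='fine', n_samples=16000)
# params: dict of frame-rate synth controls; audio: [batch, 16000] waveform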


def generate(self, synth, h_0, f0_hz, attributes, enc_frame_setting='fine', n_samples=16000):
    """
    synth: synth to generate audio
    h_0: initial state of RNN [batch, latent_dims]
    f0_hz: f0 conditioning of synth [batch, f0_n_frames, 1]
    attributes: attributes [batch, n_frames, attribute_size]
    enc_frame_setting: fft/hop size
    n_samples: output audio length in samples
    """
    if len(h_0.shape) == 2:
        h = h_0[None, :, :]  # [1, batch, latent_dims]
    else:
        h = h_0
    n_fft, hop_length = get_window_hop(enc_frame_setting)
    n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
    f0_hz = resample_frames(f0_hz, n_frames)  # needs to have same dimension as z
    params_list = []
    for i in range(n_frames):
        cond = {}
        # prior over z_i from the RNN state and this frame's attributes
        # (slicing frame i keeps the time dims compatible for the concat)
        output = torch.cat([h.permute(1, 0, 2), attributes[:, i:i + 1, :]], dim=-1)
        mu, logscale = self.psi_p(output, output)
        scale = logscale.exp()
        prior = Independent(Normal(mu, scale), 1)
        prior_sample = prior.rsample()
        cond['z'] = prior_sample
        cond['f0_hz'] = f0_hz[:, i, :].unsqueeze(1)
        cond['f0_scaled'] = hz_to_midi(cond['f0_hz']) / 127.0
        # generate x
        y = self.decode(cond)
        params = synth.fill_params(y, cond)
        params_list.append(params)
        x_tilde, _outputs = synth(params, n_samples=n_fft)  # write exactly one frame
        cond['audio'] = x_tilde
        # encode
        cond = self.encoder(cond)
        z_enc = cond['z']
        # get psi_q
        mu, logscale = self.psi_q(z_enc, z_enc)
        psi = torch.cat([mu, logscale], dim=-1)
        # temporal model
        temp_q, h = self.temporal_q(psi, h)  # one off
    param_names = params_list[0].keys()
    final_params = {}
    for pn in param_names:
        # cat over frames
        final_params[pn] = torch.cat([par[pn] for par in params_list], dim=1)
    final_audio, _outputs = synth(final_params, n_samples=n_samples)
    return final_params, final_audio
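
# Shape walk-through for the frame-by-frame loop above, with hypothetical
# sizes batch=4, latent_dims=64, attribute_size=16:
#   h.permute(1, 0, 2)          -> [4, 1, 64]  (RNN state as a 1-step sequence)
#   attributes[:, i:i + 1, :]   -> [4, 1, 16]  (this frame's attributes)
#   prior input (cat, dim=-1)   -> [4, 1, 80]
#   prior_sample, cond['z']     -> [4, 1, 64]
#   x_tilde                     -> one n_fft-sample frame of audio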


def expand(self, cond, time_steps):
    """Make sure some conditioning has same temporal resolution as other conditioning."""
    # Add time dim to cond if necessary.
    if len(cond.shape) == 2:
        cond = cond[:, None, :]
    # Expand time dim of cond if necessary.
    cond_time_steps = int(cond.shape[1])
    if cond_time_steps != time_steps:
        cond = resample_frames(cond, time_steps)
    return cond
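
# Usage sketch with hypothetical tensors: align a static conditioning vector
# with a 250-frame control signal before concatenating along the last axis.
static_cond = torch.randn(4, 8)                      # [batch, dims], no time axis
aligned = model.expand(static_cond, time_steps=250)  # -> [4, 250, 8]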


def generate(self, synth, h_0, f0_hz, enc_frame_setting='fine', n_samples=16000):
    """
    synth: synth to generate audio
    h_0: initial state of RNN [batch, latent_dims]
    f0_hz: f0 conditioning of synth [batch, f0_n_frames, 1]
    enc_frame_setting: fft/hop size
    n_samples: output audio length in samples
    """
    h = h_0
    n_fft, hop_length = get_window_hop(enc_frame_setting)
    n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
    f0_hz = resample_frames(f0_hz, n_frames)  # needs to have same dimension as z
    z = torch.zeros(h_0.shape[0], n_frames, self.latent_dims).to(h.device)
    for t in range(n_frames):
        h_mu, h_scale = self.h_process(h, h)
        mu_t, logscale_t = self.psi_p(h_mu, h_scale)  # [batch, latent_size]
        scale_t = logscale_t.exp()
        prior_t = Independent(Normal(mu_t, scale_t), 1)
        prior_sample_t = prior_t.rsample()
        cond = {}
        z[:, t, :] = prior_sample_t
        cond['z'] = prior_sample_t.unsqueeze(1)
        cond['f0_hz'] = f0_hz[:, t, :].unsqueeze(1)
        cond['f0_scaled'] = hz_to_midi(cond['f0_hz']) / 127.0
        # generate x
        y = self.decode(cond)
        params = synth.fill_params(y, cond)
        x_tilde, _outputs = synth(params, n_samples=n_fft)  # write exactly one frame
        cond['audio'] = x_tilde
        # encode
        cond = self.encoder(cond)
        z_enc = cond['z'].squeeze(1)
        # get psi_q
        mu, logscale = self.psi_q(z_enc, z_enc)
        rnn_input = torch.cat([mu, logscale, prior_sample_t], dim=-1)
        # temporal model
        h = self.temporal_q(rnn_input, h)  # one off
    cond = {}
    cond['z'] = z
    cond['f0_hz'] = f0_hz
    y_params = self.decode(cond)
    params = synth.fill_params(y_params, cond)
    resyn_audio, outputs = synth(params, n_samples)
    return params, resyn_audio


def forward(self, amplitudes, frequencies, n_samples=None):
    """Synthesize audio with sinusoid oscillators.

    Args:
        amplitudes: Amplitude tensor of shape [batch, n_frames, n_sinusoids].
        frequencies: Frequency tensor of shape [batch, n_frames, n_sinusoids].
        n_samples: Output length in samples; defaults to self.n_samples.

    Returns:
        signal: A tensor of sinusoid waves of shape [batch, n_samples].
    """
    if n_samples is None:
        n_samples = self.n_samples
    # Scale the amplitudes and frequencies.
    if self.amp_scale_fn is not None:
        amplitudes = self.amp_scale_fn(amplitudes)
    if self.freq_scale_fn is not None:
        frequencies = self.freq_scale_fn(frequencies)
    # Resample frame-rate controls up to n_samples.
    amplitudes_envelope = util.resample_frames(amplitudes, n_samples)
    frequency_envelope = util.resample_frames(frequencies, n_samples)
    signal = util.oscillator_bank(frequency_envelope, amplitudes_envelope,
                                  self.sample_rate)
    return signal
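
# Usage sketch (hypothetical module instance `osc`, assuming identity-like
# scale fns): two sinusoids at 440 Hz and 880 Hz, half amplitude, one second
# of audio at 16 kHz.
amps = torch.full((1, 250, 2), 0.5)                      # [batch, n_frames, n_sinusoids]
freqs = torch.tensor([440.0, 880.0]).expand(1, 250, 2)   # per-sinusoid Hz tracks
audio = osc(amps, freqs, n_samples=16000)                # -> [1, 16000]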


def generate(self, synth, h_0, f0_hz, attributes, enc_frame_setting='fine', n_samples=16000):
    """
    synth: synth to generate audio
    h_0: initial state of RNN [batch, latent_dims]
    f0_hz: f0 conditioning of synth [batch, f0_n_frames, 1]
    attributes: attributes [batch, attribute_size] or [batch, n_frames, attribute_size]
    enc_frame_setting: fft/hop size
    n_samples: output audio length in samples
    """
    n_fft, hop_length = get_window_hop(enc_frame_setting)
    n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
    f0_hz = resample_frames(f0_hz, n_frames).to(h_0.device)  # needs to have same dimension as z
    z = torch.zeros(h_0.shape[0], n_frames, self.latent_dims).to(h_0.device)
    if len(attributes.shape) == 2:
        attributes = attributes[:, None, :].expand(-1, n_frames, -1)
    # set up initial prior with attributes
    z_t = torch.zeros(h_0.shape[0], self.latent_dims).to(h_0.device)
    rnn_input = torch.cat([z_t, attributes[:, 0, :]], dim=-1)
    h = self.temporal(rnn_input, h_0)
    for t in range(n_frames):
        # prior distribution with rnn information
        mu_p_t, scale_p_t = self.get_prior(h)
        prior_t = Independent(Normal(mu_p_t, scale_p_t), 1)
        z_t = prior_t.rsample()
        rnn_input = torch.cat([z_t, attributes[:, t, :]], dim=-1)
        h = self.temporal(rnn_input, h)
        z[:, t, :] = z_t
    cond = {}
    cond['z'] = z
    cond['f0_hz'] = f0_hz
    y_params = self.decode(cond)
    params = synth.fill_params(y_params, cond)
    resyn_audio, outputs = synth(params, n_samples)
    return params, resyn_audio
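
# Usage sketch: attributes may be a single vector per clip or vary per frame;
# generate() above broadcasts the 2-D case to every frame. `model` and `synth`
# are hypothetical stand-ins from the surrounding project.
batch, attr_size = 4, 16
static_attrs = torch.zeros(batch, attr_size)           # broadcast to all frames
framewise_attrs = torch.zeros(batch, 250, attr_size)   # one vector per frame
h_0 = torch.zeros(batch, model.latent_dims)
f0_hz = torch.full((batch, 250, 1), 220.0)
params, audio = model.generate(synth, h_0, f0_hz, static_attrs, n_samples=16000)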