def decode(self, latent, features):
    # Prepare the inputs.
    latents_at_frame_rate = utils.upsample_to_repetitions(latent, features['segment_n_frames'])
    phones_at_frame_rate = utils.upsample_to_repetitions(features['phones'], features['dur']).type(torch.float)
    norm_counters = features['normalised_counters']
    decoder_inputs = torch.cat((latents_at_frame_rate, phones_at_frame_rate, norm_counters), dim=-1)

    # Run the decoder.
    pred_norm_lf0_deltas, _ = self.decoder_layer(decoder_inputs, seq_len=features['n_frames'])

    # Prepare the outputs.
    pred_lf0_deltas = self.normalisers['lf0'].denormalise(pred_norm_lf0_deltas, deltas=True)

    # MLPG to select the most probable trajectory given the delta and delta-delta features.
    pred_lf0 = MLPG(means=pred_lf0_deltas,
                    variances=self.normalisers['lf0'].delta_params['std_dev'] ** 2)

    outputs = {
        'normalised_lf0_deltas': pred_norm_lf0_deltas,
        'lf0_deltas': pred_lf0_deltas,
        'lf0': pred_lf0
    }

    sentence_f0 = torch.exp(features['lf0'])
    segment_f0 = utils.split_to_segments(sentence_f0, features['segment_n_frames'])
    segment_mean_f0 = torch.sum(segment_f0, dim=2) / features['segment_n_frames'].type(segment_f0.dtype)

    self.metrics.accumulate(
        self.mode,
        embeddings=(latent, features['n_segments']),
        name=[features['name']],
        n_segments=features['n_segments'],
        segment_mean_F0=(segment_mean_f0, features['n_segments']))

    return outputs
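# The decode path above relies on `utils.upsample_to_repetitions` to broadcast segment- and
# phone-level features to frame rate. The sketch below is an assumption about that behaviour
# (repeat each unit's vector for its duration), shown for a single unbatched sequence using
# torch.repeat_interleave; it is not the helper's actual implementation.

import torch

def upsample_to_repetitions_sketch(values, repeats):
    """Repeat each row of `values` (units x dims) `repeats[i]` times along the time axis."""
    # `repeats` is a 1-D LongTensor with one count per unit (e.g. frames per phone).
    return torch.repeat_interleave(values, repeats, dim=0)

# Example: 3 phone vectors upsampled to 2 + 4 + 1 = 7 frames.
phones = torch.arange(6, dtype=torch.float).reshape(3, 2)
dur = torch.tensor([2, 4, 1])
assert upsample_to_repetitions_sketch(phones, dur).shape == (7, 2)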
def predict(self, features):
    # Prepare inputs.
    norm_lab = features['normalised_lab']
    dur = features['dur']
    norm_lab_at_frame_rate = utils.upsample_to_repetitions(norm_lab, dur)
    norm_counters = features['normalised_counters']
    model_inputs = torch.cat((norm_lab_at_frame_rate, norm_counters), dim=-1)

    # Run the model.
    n_frames = features['n_frames']
    pred_norm_lf0_deltas = self.recurrent_layers(model_inputs, seq_len=n_frames)

    # Prepare the outputs.
    pred_lf0_deltas = self.normalisers['lf0'].denormalise(pred_norm_lf0_deltas, deltas=True)

    # MLPG to select the most probable trajectory given the delta and delta-delta features.
    pred_lf0 = MLPG(means=pred_lf0_deltas,
                    variances=self.normalisers['lf0'].delta_params['std_dev'] ** 2)

    outputs = {
        'normalised_lf0_deltas': pred_norm_lf0_deltas,
        'lf0_deltas': pred_lf0_deltas,
        'lf0': pred_lf0
    }

    return outputs
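# MLPG (maximum likelihood parameter generation) recovers the most likely static trajectory
# from predicted static/delta/delta-delta means and their variances. The sketch below is a
# minimal dense-matrix version for a single 1-D stream under standard delta windows; the
# MLPG/viz.synthesis.MLPG calls used above are assumed to do the batched, masked equivalent
# with banded matrices, so treat this only as an illustration of the technique.

import torch

def mlpg_sketch(means, variances,
                windows=((0.0, 1.0, 0.0),     # static
                         (-0.5, 0.0, 0.5),    # delta
                         (1.0, -2.0, 1.0))):  # delta-delta
    """means: (T, 3) static/delta/delta-delta means; variances: (3,) per-window variances."""
    T, n_win = means.shape
    # Build the (n_win * T, T) window matrix W so that the observation vector o = W @ c.
    W = torch.zeros(n_win * T, T)
    for t in range(T):
        for w, coeffs in enumerate(windows):
            for offset, coeff in zip((-1, 0, 1), coeffs):
                if 0 <= t + offset < T:
                    W[n_win * t + w, t + offset] = coeff
    # Diagonal precision: one variance per window, shared across time (interleaved per frame).
    prec = (1.0 / variances).repeat(T)   # (n_win * T,)
    m = means.reshape(-1)                # interleaved per frame, matching W's row order
    WtP = W.t() * prec                   # (T, n_win * T)
    # Solve (W^T P W) c = W^T P m for the most likely static trajectory c.
    return torch.linalg.solve(WtP @ W, WtP @ m)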
def decode(self, latent, features):
    # Prepare the inputs.
    n_frames = features['n_frames']
    max_n_frames = torch.max(n_frames)
    latents_at_frame_rate = latent.unsqueeze(1).repeat(1, max_n_frames, 1)

    norm_lab = features['normalised_lab']
    dur = features['dur']
    norm_lab_at_frame_rate = utils.upsample_to_repetitions(norm_lab, dur)
    norm_counters = features['normalised_counters']
    decoder_inputs = torch.cat(
        (latents_at_frame_rate, norm_lab_at_frame_rate, norm_counters), dim=-1)

    # Run the decoder.
    pred_norm_lf0_deltas = self.decoder_layer(decoder_inputs, seq_len=n_frames)

    # Prepare the outputs.
    pred_lf0_deltas = self.normalisers['lf0'].denormalise(
        pred_norm_lf0_deltas, deltas=True)

    # MLPG to select the most probable trajectory given the delta and delta-delta features.
    pred_lf0 = MLPG(
        means=pred_lf0_deltas,
        variances=self.normalisers['lf0'].delta_params['std_dev'] ** 2)

    outputs = {
        'normalised_lf0_deltas': pred_norm_lf0_deltas,
        'lf0_deltas': pred_lf0_deltas,
        'lf0': pred_lf0
    }

    return outputs
def predict(self, features):
    # Prepare inputs.
    norm_lab_at_frame_rate = utils.upsample_to_repetitions(
        features['normalised_lab'], features['dur'])
    model_inputs = torch.cat(
        (norm_lab_at_frame_rate, features['normalised_counters']), dim=-1)
    n_frames = features['n_frames']

    # Run the model.
    pred_norm_lf0_deltas = self.recurrent_layers(model_inputs, seq_len=n_frames)

    # Prepare the outputs.
    pred_lf0_deltas = self.normalisers['lf0'].denormalise(
        pred_norm_lf0_deltas, deltas=True)

    # MLPG to select the most probable trajectory given the delta and delta-delta features.
    global_variance = self.normalisers['lf0'].delta_params['std_dev'] ** 2
    pred_lf0 = viz.synthesis.MLPG(pred_lf0_deltas, global_variance,
                                  padding_size=100, seq_len=n_frames)

    outputs = {'lf0_deltas': pred_lf0_deltas, 'lf0': pred_lf0}

    return outputs
def predict(self, features):
    # Prepare inputs.
    norm_lab = features['normalised_lab']
    dur = features['dur']
    norm_lab_at_frame_rate = utils.upsample_to_repetitions(norm_lab, dur)
    norm_counters = features['normalised_counters']
    model_inputs = torch.cat((norm_lab_at_frame_rate, norm_counters), dim=-1)

    # Run the model.
    n_frames = features['n_frames']
    predicted = self.recurrent_layers(model_inputs, seq_len=n_frames)

    # Extract the mixing coefficients, means, and log-variances from the prediction.
    i, j = self.n_components, self.n_components * self.output_dim
    pi = predicted[..., :i]
    means = torch.split(predicted[..., i:i + j], self.output_dim, dim=-1)
    log_variances = torch.split(predicted[..., i + j:], self.output_dim, dim=-1)

    # Set a variance floor.
    log_variances = [torch.clamp(log_variance, min=np.log(self.var_floor))
                     for log_variance in log_variances]

    # Reparameterisation: mixing coefficients should form a distribution, standard deviations should be non-negative.
    pi = F.softmax(pi, dim=-1)
    std_devs = [torch.exp(log_variance * 0.5) for log_variance in log_variances]

    # Prepare the outputs.
    pred_norm_lf0_deltas_GMM = GaussianMixtureModel(pi, means, std_devs)

    # Take the most likely component and find the most probable trajectory using MLPG.
    pred_norm_lf0_deltas_mean, pred_norm_lf0_deltas_std_dev = pred_norm_lf0_deltas_GMM.argmax_components()
    pred_norm_lf0_deltas_var = pred_norm_lf0_deltas_std_dev ** 2
    pred_norm_lf0 = MLPG(means=pred_norm_lf0_deltas_mean, variances=pred_norm_lf0_deltas_var)
    pred_lf0 = self.normalisers['lf0'].denormalise(pred_norm_lf0)

    outputs = {
        'normalised_lf0_deltas_GMM': pred_norm_lf0_deltas_GMM,
        'lf0': pred_lf0
    }

    return outputs
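# `GaussianMixtureModel.argmax_components` is assumed to select, per frame, the component with
# the largest mixing coefficient and return its mean and standard deviation. The hypothetical
# helper below sketches that selection with plain tensors; it is not the class used above.

import torch

def argmax_components_sketch(pi, means, std_devs):
    """pi: (B, T, K); means, std_devs: lists of K tensors of shape (B, T, D)."""
    means = torch.stack(means, dim=2)        # (B, T, K, D)
    std_devs = torch.stack(std_devs, dim=2)  # (B, T, K, D)
    best = pi.argmax(dim=-1)                 # (B, T) index of the most likely component
    idx = best[..., None, None].expand(-1, -1, 1, means.shape[-1])
    best_means = means.gather(2, idx).squeeze(2)        # (B, T, D)
    best_std_devs = std_devs.gather(2, idx).squeeze(2)  # (B, T, D)
    return best_means, best_std_devs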
def encode(self, features):
    # Prepare inputs.
    norm_lab = features['normalised_lab']
    dur = features['dur']
    norm_lab_at_frame_rate = utils.upsample_to_repetitions(norm_lab, dur)
    norm_lf0_deltas = features['normalised_lf0_deltas']
    norm_counters = features['normalised_counters']
    encoder_inputs = torch.cat((norm_lf0_deltas, norm_lab_at_frame_rate, norm_counters), dim=-1)

    # Run the encoder.
    n_frames = features['n_frames']
    mean, log_variance = self.encoder_layer(encoder_inputs, seq_len=n_frames)

    return mean, log_variance
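# encode returns the parameters of a diagonal Gaussian posterior over the latent. Sampling that
# latent elsewhere presumably uses the reparameterisation trick; a minimal sketch of that step
# (an assumption, not the model's actual sampling code):

import torch

def sample_latent_sketch(mean, log_variance):
    """Draw z ~ N(mean, exp(log_variance)) in a way that keeps gradients w.r.t. mean and log_variance."""
    std_dev = torch.exp(0.5 * log_variance)
    return mean + std_dev * torch.randn_like(std_dev)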
def predict(self, features):
    # Prepare inputs.
    norm_lab = features['normalised_lab']
    dur = features['dur']
    norm_lab_at_frame_rate = utils.upsample_to_repetitions(norm_lab, dur)
    norm_counters = features['normalised_counters']
    model_inputs = torch.cat((norm_lab_at_frame_rate, norm_counters), dim=-1)

    # Run the model.
    n_frames = features['n_frames']
    pred_norm_deltas = self.layers(model_inputs, seq_len=n_frames)

    # Prepare the outputs.
    output_dims = [self.output_dims[n] for n in ['lf0', 'vuv', 'mcep', 'bap']]
    pred_norm_lf0_deltas, pred_vuv, pred_norm_mcep_deltas, pred_norm_bap_deltas = \
        torch.split(pred_norm_deltas, output_dims, dim=-1)

    pred_lf0 = self._prepare_output('lf0', pred_norm_lf0_deltas)
    pred_mcep = self._prepare_output('mcep', pred_norm_mcep_deltas)
    pred_bap = self._prepare_output('bap', pred_norm_bap_deltas)
    pred_vuv = torch.sigmoid(pred_vuv)

    outputs = {
        'normalised_lf0_deltas': pred_norm_lf0_deltas,
        'normalised_mcep_deltas': pred_norm_mcep_deltas,
        'normalised_bap_deltas': pred_norm_bap_deltas,
        'lf0': pred_lf0,
        'vuv': pred_vuv,
        'mcep': pred_mcep,
        'bap': pred_bap,
    }

    return outputs
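# Downstream synthesis presumably thresholds the voiced/unvoiced probabilities and silences F0
# in unvoiced frames. The sketch below shows that common post-processing convention; the 0.5
# threshold and the lf0-to-Hz conversion are assumptions, not taken from this codebase.

import torch

def apply_vuv_sketch(lf0, vuv, threshold=0.5):
    """Convert log-F0 to Hz and zero out frames predicted as unvoiced."""
    voiced = (vuv > threshold).type(lf0.dtype)  # (B, T, 1) binary voicing mask
    return torch.exp(lf0) * voiced              # F0 in Hz, 0.0 where unvoiced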