def forward(self, audio, mel, audio_start, clip_kl=True):
    """Compute the training losses of the Clarinet model.

    Args:
        audio (Variable): shape(B, T_audio), dtype float32, ground truth
            waveform.
        mel (Variable): shape(B, F, T_mel), dtype float32, condition
            (mel spectrogram here).
        audio_start (Variable): shape(B, ), dtype int64, audio start
            positions.
        clip_kl (bool, optional): whether to clip the element-wise KL
            divergence to the range [-100, 10] before it is averaged.
            Defaults to True.

    Returns:
        Dict(str, Variable): the individual loss terms, keyed as follows.
            "loss": total loss, ``kl + self.lmd * regularization +
                stft_loss``.
            "kl_divergence": mean KL divergence between the student's
                output distribution and the teacher's output distribution,
                with the first ``teacher.context_size`` steps dropped.
            "regularization": mean squared error between the (unclipped)
                teacher and student log scales over the same steps.
            "stft_loss": mean squared error between the STFT magnitudes
                of the ground truth waveform and the synthesized waveform.
    """
    _, num_samples = audio.shape  # length of the audio clip
    noise = F.gaussian_random(audio.shape)

    # Encode the mel spectrogram and keep only the part aligned with the clip.
    encoded = self.encoder(mel)  # (B, C, T)
    local_cond = crop(encoded, audio_start, num_samples)

    # Student pass over all time steps [0: T].
    synthesized, student_mu, student_log_sigma = self.student(noise,
                                                              local_cond)
    student_mu = student_mu[:, 1:]  # (B, T-1), time steps [1: T]
    student_log_sigma = student_log_sigma[:, 1:]  # (B, T-1), steps [1: T]
    student_log_sigma_clipped = F.clip(student_log_sigma,
                                       self.min_log_scale, 100.)

    # The teacher outputs a single gaussian for time steps [1: T].
    teacher_out = self.teacher(synthesized[:, :-1], local_cond[:, :, 1:])
    _, teacher_mu, teacher_log_sigma = F.split(teacher_out, 3, -1)
    teacher_mu = F.squeeze(teacher_mu, [-1])  # (B, T-1)
    teacher_log_sigma = F.squeeze(teacher_log_sigma, [-1])  # (B, T-1)
    teacher_log_sigma_clipped = F.clip(teacher_log_sigma,
                                       self.min_log_scale, 100.)

    student_dist = D.Normal(student_mu, F.exp(student_log_sigma_clipped))
    teacher_dist = D.Normal(teacher_mu, F.exp(teacher_log_sigma_clipped))

    # KL divergence is evaluated through the distribution objects' own
    # kl_divergence method, so no Monte-Carlo estimate is needed.
    kl = student_dist.kl_divergence(teacher_dist)
    if clip_kl:
        kl = F.clip(kl, -100., 10.)

    # The first `context_size` steps are excluded from both terms.
    skip = self.teacher.context_size
    kl = F.reduce_mean(kl[:, skip:])
    # Regularization compares the raw (unclipped) log scales.
    regularization = F.mse_loss(teacher_log_sigma[:, skip:],
                                student_log_sigma[:, skip:])

    # Bring in information from the real target waveform.
    target_magnitude = self.stft.magnitude(audio)
    synthesized_magnitude = self.stft.magnitude(synthesized)
    spectrogram_frame_loss = F.mse_loss(target_magnitude,
                                        synthesized_magnitude)

    total = kl + self.lmd * regularization + spectrogram_frame_loss
    return {
        "loss": total,
        "kl_divergence": kl,
        "regularization": regularization,
        "stft_loss": spectrogram_frame_loss
    }
def synthesis(self, mel):
    """Generate a waveform with the encoder and the student network.

    Args:
        mel (Variable): shape(B, F, T_mel), the condition
            (mel spectrogram here).

    Returns:
        Variable: shape(B, T_audio), the synthesized waveform.
            (T_audio = T_mel * upscale_factor, where upscale_factor is
            the `upscale_factor` of the encoder.)
    """
    encoded = self.encoder(mel)
    # One noise sample per (batch item, encoded time step).
    batch, steps = encoded.shape[0], encoded.shape[-1]
    noise = F.gaussian_random((batch, steps))
    # The student also returns means and scales; only the waveform is used.
    waveform, _, _ = self.student(noise, encoded)
    return waveform
def init_projection_matrix(self, x):
    """Initialize ``self.projection_matrix`` and ``self.compressed_dim``.

    Reads ``use_projection_matrix`` (default True) and ``proj_init_method``
    (default 'pca') from ``self.params``. When projection is enabled, one
    matrix of shape ``(cdim, C, 1, 1)`` is created per entry of ``x`` (with
    ``C = ex.shape[1]``), or ``None`` where the configured compressed
    dimension is ``None``.

    Args:
        x: iterable of feature arrays; only ``ex.shape[1]`` and ``len(x)``
            are used here.

    Raises:
        NotImplementedError: if ``proj_init_method`` is 'pca'.
        ValueError: if ``proj_init_method`` is not a known method.
    """
    # Set if using projection matrix
    self.params.use_projection_matrix = getattr(
        self.params, 'use_projection_matrix', True)

    if not self.params.use_projection_matrix:
        # No compression: each compressed dim is the feature's own channel
        # count. Read via `.shape[1]` like every other branch in this
        # method (the previous `x.size(1)` call form is a PyTorch idiom).
        self.compressed_dim = TensorList([ex.shape[1] for ex in x])
        self.projection_matrix = TensorList([None] * len(x))
        return

    self.compressed_dim = self.fparams.attribute('compressed_dim', None)
    proj_init_method = getattr(self.params, 'proj_init_method', 'pca')

    def build(make_matrix):
        # One matrix per feature; None where no compression is configured.
        return TensorList([
            None if cdim is None else make_matrix(cdim, ex.shape[1])
            for ex, cdim in zip(x, self.compressed_dim)
        ])

    if proj_init_method == 'pca':
        raise NotImplementedError
    elif proj_init_method == 'randn':
        with fluid.dygraph.guard():
            self.projection_matrix = build(
                lambda cdim, channels: layers.gaussian_random(
                    (cdim, channels, 1, 1), 0.0,
                    1 / math.sqrt(channels)).numpy())
    elif proj_init_method == 'np_randn':
        # Fixed seed, so this initialization is reproducible.
        rng = np.random.RandomState(0)
        self.projection_matrix = build(
            lambda cdim, channels: rng.normal(
                size=(cdim, channels, 1, 1),
                loc=0.0,
                scale=1 / math.sqrt(channels)).astype('float32'))
    elif proj_init_method == 'ones':
        self.projection_matrix = build(
            lambda cdim, channels: np.ones(
                (cdim, channels, 1, 1), 'float32') / math.sqrt(channels))
    else:
        # Fail loudly instead of leaving self.projection_matrix unset.
        raise ValueError('Unknown "proj_init_method"')
def test_gaussian_random(self):
    """Check that a gaussian_random layer can be added to a program."""
    prog = Program()
    with program_guard(prog):
        sample = layers.gaussian_random(shape=[20, 30])
        self.assertIsNotNone(sample)

    # Dump the resulting program for manual inspection.
    print(str(prog))
def init_optimization(self, train_x, init_y):
    """Initialize the filter and run the initial optimization.

    Steps, in order:
      1. Create ``self.filter`` according to ``params.filter_init_method``.
      2. If ``params.update_projection_matrix`` is enabled, jointly
         optimize the filter and the projection matrix.
      3. Re-project the initial training samples with the resulting
         projection matrix into ``self.training_samples``.
      4. Build the filter-only optimizer, run it, and store the result in
         ``self.filter``.
      5. Delete the initial samples and, if created, the joint
         problem/optimizer.

    Args:
        train_x: iterable of training features; only its length (via
            ``zip``) is used here to size the filter list.
        init_y: initial labels, passed to ``FactorizedConvProblem``.

    Raises:
        ValueError: if ``params.optimizer`` or ``params.filter_init_method``
            is not a known name.
        RuntimeError: always, after dumping convergence data, when
            ``params.analyze_convergence`` is set and the joint
            optimization ran.
    """
    # Validate the optimizer choice up front: an unknown name would
    # otherwise leave the optimizers unset and fail later with an opaque
    # AttributeError.
    optimizer = getattr(self.params, 'optimizer', 'GaussNewtonCG')
    if optimizer not in ('GaussNewtonCG', 'GradientDescentL2'):
        raise ValueError('Unknown "optimizer"')

    # Initialize filter
    filter_init_method = getattr(self.params, 'filter_init_method', 'zeros')
    self.filter = TensorList([
        np.zeros([1, cdim, sz[0], sz[1]], 'float32')
        for _, cdim, sz in zip(train_x, self.compressed_dim,
                               self.kernel_size)
    ])
    if filter_init_method == 'zeros':
        pass
    elif filter_init_method == 'ones':
        for idx, f in enumerate(self.filter):
            self.filter[idx] = np.ones(f.shape, 'float32') / np.prod(
                f.shape)
    elif filter_init_method == 'np_randn':
        # Fixed seed, so this initialization is reproducible.
        rng = np.random.RandomState(0)
        for idx, f in enumerate(self.filter):
            self.filter[idx] = rng.normal(
                size=f.shape, loc=0,
                scale=1 / np.prod(f.shape)).astype('float32')
    elif filter_init_method == 'randn':
        for idx, f in enumerate(self.filter):
            with fluid.dygraph.guard():
                self.filter[idx] = layers.gaussian_random(
                    f.shape, std=1 / np.prod(f.shape)).numpy()
    else:
        raise ValueError('Unknown "filter_init_method"')

    # Get parameters. The projection matrix can only be updated when one
    # is in use at all.
    self.params.update_projection_matrix = getattr(
        self.params, 'update_projection_matrix',
        True) and self.params.use_projection_matrix

    # Setup factorized joint optimization
    if self.params.update_projection_matrix:
        self.joint_problem = FactorizedConvProblem(
            self.init_training_samples, init_y, self.filter_reg,
            self.fparams.attribute('projection_reg'), self.params,
            self.init_sample_weights, self.projection_activation,
            self.response_activation)

        # Variable containing both filter and projection matrix
        joint_var = self.filter.concat(self.projection_matrix)

        # Initialize optimizer
        analyze_convergence = getattr(self.params, 'analyze_convergence',
                                      False)
        if optimizer == 'GaussNewtonCG':
            # NOTE(review): `analyze` is hard-coded to True here, while the
            # GradientDescentL2 branch passes `analyze_convergence` --
            # confirm this is intended.
            self.joint_optimizer = GaussNewtonCG(
                self.joint_problem,
                joint_var,
                plotting=(self.params.debug >= 3),
                analyze=True,
                fig_num=(12, 13, 14))
        elif optimizer == 'GradientDescentL2':
            self.joint_optimizer = GradientDescentL2(
                self.joint_problem,
                joint_var,
                self.params.optimizer_step_length,
                self.params.optimizer_momentum,
                plotting=(self.params.debug >= 3),
                debug=analyze_convergence,
                fig_num=(12, 13))

        # Do joint optimization
        if isinstance(self.params.init_CG_iter, (list, tuple)):
            self.joint_optimizer.run(self.params.init_CG_iter)
        else:
            self.joint_optimizer.run(
                self.params.init_CG_iter // self.params.init_GN_iter,
                self.params.init_GN_iter)

        # Get back filter and projection matrix: the joint variable is the
        # filter list followed by the projection list, so split it in half.
        len_x = len(self.joint_optimizer.x)
        self.filter = self.joint_optimizer.x[:len_x // 2]  # w2 in paper
        self.projection_matrix = self.joint_optimizer.x[
            len_x // 2:]  # w1 in paper

        if analyze_convergence:
            # Append losses and gradient magnitudes to text files, then
            # abort: this mode exists only to collect convergence data.
            opt_name = 'CG' if getattr(self.params, 'CG_optimizer',
                                       True) else 'GD'
            for val_name, values in zip(['loss', 'gradient'], [
                    self.joint_optimizer.losses,
                    self.joint_optimizer.gradient_mags
            ]):
                val_str = ' '.join(
                    ['{:.8e}'.format(v.item()) for v in values])
                file_name = '{}_{}.txt'.format(opt_name, val_name)
                with open(file_name, 'a') as f:
                    f.write(val_str + '\n')
            raise RuntimeError('Exiting')

    # Re-project samples with the new projection matrix
    compressed_samples = self.project_sample(self.init_training_samples,
                                             self.projection_matrix)
    for train_samp, init_samp in zip(self.training_samples,
                                     compressed_samples):
        # Only the first init_samp.shape[0] entries are overwritten.
        for idx in range(init_samp.shape[0]):
            train_samp[idx] = init_samp[idx]

    self.hinge_mask = None

    # Initialize optimizer
    self.conv_problem = ConvProblem(self.training_samples, self.y,
                                    self.filter_reg, self.sample_weights,
                                    self.response_activation)

    if optimizer == 'GaussNewtonCG':
        self.filter_optimizer = ConjugateGradient(
            self.conv_problem,
            self.filter,
            fletcher_reeves=self.params.fletcher_reeves,
            direction_forget_factor=self.params.direction_forget_factor,
            debug=(self.params.debug >= 3),
            fig_num=(12, 13))
    elif optimizer == 'GradientDescentL2':
        self.filter_optimizer = GradientDescentL2(
            self.conv_problem,
            self.filter,
            self.params.optimizer_step_length,
            self.params.optimizer_momentum,
            debug=(self.params.debug >= 3),
            fig_num=12)

    if self.params.update_projection_matrix:
        # Transfer losses from previous optimization
        self.filter_optimizer.residuals = self.joint_optimizer.residuals
        self.filter_optimizer.losses = self.joint_optimizer.losses
    else:
        # No joint optimization ran, so do the initial iterations here.
        self.filter_optimizer.run(self.params.init_CG_iter)

    # Post optimization
    self.filter_optimizer.run(self.params.post_init_CG_iter)
    self.filter = self.filter_optimizer.x

    # Free memory. The joint problem/optimizer only exist when the joint
    # optimization actually ran, i.e. under update_projection_matrix (not
    # merely use_projection_matrix).
    del self.init_training_samples
    if self.params.update_projection_matrix:
        del self.joint_problem, self.joint_optimizer
def point_network_decoder(p_vec, q_vec, decoder_size):
    """Build the pointer-network style decoder and return two score tensors.

    An initial state is computed by attention-pooling ``q_vec``. Two
    DynamicRNNs are then run over ``p_vec``, and steps 0 and 1 of their
    outputs are combined into ``start_prob`` and ``end_prob``.

    Args:
        p_vec: sequence input stepped over by the DynamicRNN and attended
            to at every step (presumably the passage encoding -- confirm
            against the caller).
        q_vec: sequence input pooled into the initial decoder state
            (presumably the question encoding -- confirm against the
            caller).
        decoder_size (int): size of the fully connected projections and of
            the decoder state.

    Returns:
        tuple: ``(start_prob, end_prob)``.
    """
    # Attention pooling over q_vec, perturbed by a projected random vector.
    random_attn = layers.gaussian_random(shape=[1, decoder_size])
    random_attn = layers.sequence_expand(x=random_attn, y=q_vec)
    random_attn = layers.fc(input=random_attn, size=decoder_size, act=None)
    U = layers.fc(input=q_vec, size=decoder_size, act=None) + random_attn
    U = layers.tanh(U)

    logits = layers.fc(input=U, size=1, act=None)
    scores = layers.sequence_softmax(input=logits)
    pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
    pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')

    init_state = layers.fc(input=pooled_vec, size=decoder_size, act=None)

    def custom_dynamic_rnn(p_vec, init_state, decoder_size):
        """Run an attention LSTM over p_vec; output the scores per step."""
        context = layers.fc(input=p_vec, size=decoder_size, act=None)

        drnn = layers.DynamicRNN()
        with drnn.block():
            # The step input's value is unused; the call is kept because it
            # registers p_vec as the sequence the RNN steps over.
            H_s = drnn.step_input(p_vec)
            ctx = drnn.static_input(context)

            c_prev = drnn.memory(init=init_state, need_reorder=True)
            m_prev = drnn.memory(init=init_state, need_reorder=True)
            m_prev1 = layers.fc(input=m_prev, size=decoder_size, act=None)
            m_prev1 = layers.sequence_expand(x=m_prev1, y=ctx)

            Fk = ctx + m_prev1
            Fk = layers.fc(input=Fk, size=decoder_size, act='tanh')
            logits = layers.fc(input=Fk, size=1, act=None)

            scores = layers.sequence_softmax(input=logits)
            attn_ctx = layers.elementwise_mul(x=ctx, y=scores, axis=0)
            attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum')
            hidden_t, cell_t = lstm_step(
                attn_ctx,
                hidden_t_prev=m_prev1,
                cell_t_prev=c_prev,
                size=decoder_size)

            drnn.update_memory(ex_mem=m_prev, new_mem=hidden_t)
            drnn.update_memory(ex_mem=c_prev, new_mem=cell_t)

            drnn.output(scores)
        beta = drnn()
        return beta

    # NOTE(review): the "forward" and "backward" RNNs are built by identical
    # calls on the same, un-reversed input -- confirm this is intended.
    fw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)
    bw_outputs = custom_dynamic_rnn(p_vec, init_state, decoder_size)

    def sequence_slice(x, index):
        """Return the entry at position `index` of x's step array."""
        from paddle.fluid.layers.control_flow import lod_rank_table
        from paddle.fluid.layers.control_flow import lod_tensor_to_array
        from paddle.fluid.layers.control_flow import array_read
        from paddle.fluid.layers.control_flow import array_to_lod_tensor

        # Read the requested position. This was previously hard-coded to 1,
        # which ignored `index` and made every slice identical.
        idx = layers.fill_constant(shape=[1], value=index, dtype='int32')
        idx.stop_gradient = True

        table = lod_rank_table(x, level=0)
        table.stop_gradient = True
        array = lod_tensor_to_array(x, table)
        slice_array = array_read(array=array, i=idx)
        return array_to_lod_tensor(slice_array, table)

    # NOTE(review): the two directions are combined by multiplying and then
    # halving; an average would use elementwise_add -- confirm intended.
    start_prob = layers.elementwise_mul(
        x=sequence_slice(fw_outputs, 0),
        y=sequence_slice(bw_outputs, 1),
        axis=0) / 2
    end_prob = layers.elementwise_mul(
        x=sequence_slice(fw_outputs, 1),
        y=sequence_slice(bw_outputs, 0),
        axis=0) / 2
    return start_prob, end_prob