def forward(self, audio, mel, audio_start, clip_kl=True):
    """Compute loss of Clarinet model.

    Args:
        audio (Variable): shape(B, T_audio), dtype float32, ground truth
            waveform.
        mel (Variable): shape(B, F, T_mel), dtype float32, condition
            (mel spectrogram here).
        audio_start (Variable): shape(B, ), dtype int64, audio starts
            positions.
        clip_kl (bool, optional): whether to clip the element-wise KL
            divergence to the range [-100, 10] before it is averaged.
            Defaults to True.

    Returns:
        Dict(str, Variable)
        loss (Variable): shape(1, ), dtype float32, total loss, i.e.
            ``kl_divergence + self.lmd * regularization + stft_loss``.
        kl_divergence (Variable): shape(1, ), dtype float32, kl divergence
            between the student's output distribution and the teacher's
            output distribution, averaged after dropping the teacher's
            context.
        regularization (Variable): shape(1, ), dtype float32, a
            regularization term of the KL divergence: the mean squared
            error between the (unclipped) teacher and student log scales.
        stft_loss (Variable): shape(1, ), dtype float32, the mean squared
            error between the STFT magnitudes of the ground truth waveform
            and of the synthesized waveform.
    """
    # Only the clip length is needed here; the batch dimension is unused.
    _, audio_length = audio.shape
    z = F.gaussian_random(audio.shape)
    condition = self.encoder(mel)  # (B, C, T)
    condition_slice = crop(condition, audio_start, audio_length)

    x, s_means, s_scales = self.student(z, condition_slice)  # all [0: T]
    s_means = s_means[:, 1:]  # (B, T-1), time steps [1: T]
    s_scales = s_scales[:, 1:]  # (B, T-1), time steps [1: T]
    s_clipped_scales = F.clip(s_scales, self.min_log_scale, 100.)

    # teacher outputs single gaussian
    y = self.teacher(x[:, :-1], condition_slice[:, :, 1:])
    _, t_means, t_scales = F.split(y, 3, -1)  # time steps [1: T]
    t_means = F.squeeze(t_means, [-1])  # (B, T-1), time steps [1: T]
    t_scales = F.squeeze(t_scales, [-1])  # (B, T-1), time steps [1: T]
    t_clipped_scales = F.clip(t_scales, self.min_log_scale, 100.)

    s_distribution = D.Normal(s_means, F.exp(s_clipped_scales))
    t_distribution = D.Normal(t_means, F.exp(t_clipped_scales))

    # kl divergence loss, so we only need to sample once? no MC
    kl = s_distribution.kl_divergence(t_distribution)
    if clip_kl:
        # NOTE(review): the upper bound used here is 10, whereas the previous
        # docstring mentioned 100 -- confirm which one is intended.
        kl = F.clip(kl, -100., 10.)

    # context size dropped
    context_size = self.teacher.context_size
    kl = F.reduce_mean(kl[:, context_size:])

    # major diff here: the regularization compares the unclipped log scales
    regularization = F.mse_loss(t_scales[:, context_size:],
                                s_scales[:, context_size:])

    # introduce information from real target
    spectrogram_frame_loss = F.mse_loss(
        self.stft.magnitude(audio), self.stft.magnitude(x))

    loss = kl + self.lmd * regularization + spectrogram_frame_loss
    loss_dict = {
        "loss": loss,
        "kl_divergence": kl,
        "regularization": regularization,
        "stft_loss": spectrogram_frame_loss
    }
    return loss_dict
def test_mse_loss(self):
    """Compare ``layers.mse_loss`` with a NumPy reference on random data.

    Two random (2, 3) float32 arrays are assigned to tensors named
    "input" and "label"; the fetched loss must be close to
    ``mean((input - label) ** 2)`` on CPU and, when Paddle is compiled
    with CUDA, on GPU 0 as well.
    """
    shape = (2, 3)
    input_val = np.random.uniform(0.1, 0.5, shape).astype("float32")
    label_val = np.random.uniform(0.1, 0.5, shape).astype("float32")

    # NumPy reference value.
    diff = input_val - label_val
    expected = np.mean(diff * diff)

    input_var = layers.create_tensor(dtype="float32", name="input")
    label_var = layers.create_tensor(dtype="float32", name="label")
    layers.assign(input=input_val, output=input_var)
    layers.assign(input=label_val, output=label_var)
    output = layers.mse_loss(input=input_var, label=label_var)

    # CPU is always exercised; GPU only when available in this build.
    cuda_flags = [False]
    if core.is_compiled_with_cuda():
        cuda_flags.append(True)

    for use_cuda in cuda_flags:
        if use_cuda:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()
        exe = Executor(place)
        program = fluid.default_main_program()
        feed = {"input": input_var, "label": label_var}
        result = exe.run(program, feed=feed, fetch_list=[output])

        self.assertTrue(np.isclose(expected, result).all())
def forward(self):
    """Build the forward pass and the training loss.

    Steps, in order:
      1. embed the edge distance feature with ``layers.spatial_embedding``;
      2. apply ``self.num_layers`` ``layers.SpatialConv`` layers;
      3. gather the node rows (``self.nids``) and pool them with
         ``layers.graph_pooling``;
      4. run three fc + dropout stages (widths ``hidden_size * 4``,
         ``* 2``, ``* 1``) followed by a linear fc of size
         ``self.n_output``.

    Results are stored on the instance: the prediction in ``self.output``
    and the reduced MSE against ``self.pk`` in ``self.loss``.
    """
    dist_feat_order = self.edges_dist
    dist_feat = self.e2n_gw.edge_feat['dist']
    dist_feat, dist_feat_order = layers.spatial_embedding(
        dist_feat, dist_feat_order, self.hidden_size)

    node_edge_feat = self.e2n_gw.node_feat["attr"]
    feat_size = node_edge_feat.shape[-1]

    # The last layer projects to hidden_size, earlier layers keep feat_size.
    # The previous test (`i == self.num_layers + 1`) could never be true for
    # i in range(self.num_layers), so the hidden_size branch was dead code.
    last_layer = self.num_layers - 1
    for i in range(self.num_layers):
        out_size = self.hidden_size if i == last_layer else feat_size
        feat_h = layers.SpatialConv(
            self.e2n_gw,
            self.e2e_gw,
            self.srcs,
            self.dsts,
            node_edge_feat,
            dist_feat_order,
            dist_feat,
            self.nids,
            self.eids,
            self.node_lod,
            self.edge_lod,
            out_size,
            name="layer_%s" % (i))
        node_edge_feat = feat_h

    node_feat = fl.gather(node_edge_feat, self.nids)
    pooled_h = layers.graph_pooling(node_feat, self.node_lod, self.pool_type)

    # MLP head: three relu fc layers of decreasing width, each followed by
    # dropout, created in the same order as before.
    output = pooled_h
    for width_mult in (4, 2, 1):
        output = fl.fc(output, size=self.hidden_size * width_mult, act='relu')
        output = fl.dropout(
            output,
            self.dropout_prob,
            dropout_implementation="upscale_in_train")
    self.output = fl.fc(output, size=self.n_output, act=None)

    # calculate loss
    self.loss = fl.mse_loss(self.output, self.pk)
    self.loss = fl.reduce_mean(self.loss)
kept_layers_index.append( math.floor(i / depth_mult) - 1) if mode == 'classification': logit_loss = soft_cross_entropy( student_logit, teacher_logit.detach()) else: logit_loss = 0.0 ### hidden_states distillation loss rep_loss = 0.0 for stu_rep, tea_rep in zip( student_reps, list(teacher_reps[i] for i in kept_layers_index)): tmp_loss = L.mse_loss(stu_rep, tea_rep.detach()) rep_loss += tmp_loss loss = args.width_lambda1 * logit_loss + args.width_lambda2 * rep_loss else: ### logit distillation loss if mode == 'classification': logit_loss = soft_cross_entropy( student_logit, teacher_logit.detach()) else: logit_loss = 0.0 ### hidden_states distillation loss rep_loss = 0.0 for stu_rep, tea_rep in zip(
def main(args):
    """Train a FastSpeech model in dygraph mode, optionally data-parallel.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``config`` (YAML config path), ``use_gpu``, ``output`` (output
            directory; logs go to ``<output>/log`` and checkpoints to
            ``<output>/checkpoints``), ``data``, ``alignments_path``,
            ``iteration`` and ``checkpoint``.

    Training runs until ``global_step`` reaches
    ``cfg['train']['max_iteration']``. Only rank 0 writes summaries and
    checkpoints.
    """
    env = dg.parallel.Env()
    local_rank = env.local_rank
    nranks = env.nranks
    parallel = nranks > 1

    with open(args.config) as f:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # only load trusted config files (or switch to yaml.SafeLoader).
        cfg = yaml.load(f, Loader=yaml.Loader)
    train_cfg = cfg['train']

    place = fluid.CUDAPlace(env.dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    # Every rank executes this; exist_ok avoids the check-then-create race
    # where a second rank crashes because the first one just made the dir.
    os.makedirs(args.output, exist_ok=True)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    try:
        model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(
                1 / (train_cfg['warm_up_step'] *
                     (train_cfg['learning_rate']**2)),
                train_cfg['warm_up_step']),
            parameter_list=model.parameters(),
            grad_clip=fluid.clip.GradientClipByGlobalNorm(train_cfg[
                'grad_clip_thresh']))
        reader = LJSpeechLoader(
            cfg['audio'],
            place,
            args.data,
            args.alignments_path,
            train_cfg['batch_size'],
            nranks,
            local_rank,
            shuffle=True).reader
        iterator = iter(tqdm(reader))

        # Load parameters.
        global_step = io.load_parameters(
            model=model,
            optimizer=optimizer,
            checkpoint_dir=os.path.join(args.output, 'checkpoints'),
            iteration=args.iteration,
            checkpoint_path=args.checkpoint)
        print("Rank {}: checkpoint loaded.".format(local_rank))

        if parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        # global_step is incremented before the forward pass, so a strict
        # `<` makes max_iteration the last executed step (the former `<=`
        # ran one extra step, max_iteration + 1).
        while global_step < train_cfg['max_iteration']:
            try:
                batch = next(iterator)
            except StopIteration:
                # Epoch exhausted: restart the reader and keep going.
                iterator = iter(tqdm(reader))
                batch = next(iterator)

            (character, mel, pos_text, pos_mel, alignment) = batch

            global_step += 1

            # Forward
            result = model(
                character,
                pos_text,
                mel_pos=pos_mel,
                length_target=alignment)
            mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
            mel_loss = layers.mse_loss(mel_output, mel)
            mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
            duration_loss = layers.mean(
                layers.abs(
                    layers.elementwise_sub(duration_predictor_output,
                                           alignment)))
            total_loss = mel_loss + mel_postnet_loss + duration_loss

            if local_rank == 0:
                writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
                writer.add_scalar('post_mel_loss',
                                  mel_postnet_loss.numpy(), global_step)
                writer.add_scalar('duration_loss',
                                  duration_loss.numpy(), global_step)
                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

            if parallel:
                total_loss = model.scale_loss(total_loss)
                total_loss.backward()
                model.apply_collective_grads()
            else:
                total_loss.backward()
            optimizer.minimize(total_loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % train_cfg[
                    'checkpoint_interval'] == 0:
                io.save_parameters(
                    os.path.join(args.output, 'checkpoints'), global_step,
                    model, optimizer)
    finally:
        # Close the summary writer even when training aborts with an error.
        if writer is not None:
            writer.close()
def main(args):
    """Train FastSpeech using alignments taken from a TransformerTTS teacher.

    For every batch the frozen (``eval`` mode) TransformerTTS model is run
    to obtain attention probabilities, ``get_alignment`` turns them into
    the duration targets, and FastSpeech is trained on the sum of the mel
    loss, the post-net mel loss and the duration loss.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``use_data_parallel``, ``use_gpu``, ``config_path``,
            ``log_dir``, ``transformer_step``, ``transtts_path``, ``lr``,
            ``checkpoint_path``, ``fastspeech_step``, ``epochs``,
            ``save_step`` and ``save_path`` (``args`` itself is also passed
            to ``LJSpeechLoader``).

    Only rank 0 writes summaries and checkpoints.
    """
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1

    with open(args.config_path) as f:
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # only load trusted config files (or switch to yaml.SafeLoader).
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    # Every rank executes this; exist_ok avoids the check-then-create race
    # where a second rank crashes because the first one just made the dir.
    os.makedirs(args.log_dir, exist_ok=True)
    path = os.path.join(args.log_dir, 'fastspeech')

    writer = SummaryWriter(path) if local_rank == 0 else None

    try:
        with dg.guard(place):
            # The teacher is built under its own unique-name scope.
            with fluid.unique_name.guard():
                transformer_tts = TransformerTTS(cfg)
                model_dict, _ = load_checkpoint(
                    str(args.transformer_step),
                    os.path.join(args.transtts_path, "transformer"))
                transformer_tts.set_dict(model_dict)
                transformer_tts.eval()

            model = FastSpeech(cfg)
            model.train()
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=dg.NoamDecay(1 / (
                    cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
                parameter_list=model.parameters())
            # Loop-invariant, so it is created once instead of every step.
            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
                'grad_clip_thresh'])
            reader = LJSpeechLoader(
                cfg, args, nranks, local_rank, shuffle=True).reader()

            if args.checkpoint_path is not None:
                model_dict, opti_dict = load_checkpoint(
                    str(args.fastspeech_step),
                    os.path.join(args.checkpoint_path, "fastspeech"))
                model.set_dict(model_dict)
                optimizer.set_dict(opti_dict)
                global_step = args.fastspeech_step
                print("load checkpoint!!!")

            if args.use_data_parallel:
                strategy = dg.parallel.prepare_context()
                model = fluid.dygraph.parallel.DataParallel(model, strategy)

            for epoch in range(args.epochs):
                pbar = tqdm(reader)

                for data in pbar:
                    pbar.set_description('Processing at epoch %d' % epoch)
                    (character, mel, mel_input, pos_text, pos_mel,
                     text_length, mel_lens, enc_slf_mask, enc_query_mask,
                     dec_slf_mask, enc_dec_mask, dec_query_slf_mask,
                     dec_query_mask) = data

                    _, _, attn_probs, _, _, _ = transformer_tts(
                        character,
                        mel_input,
                        pos_text,
                        pos_mel,
                        dec_slf_mask=dec_slf_mask,
                        enc_slf_mask=enc_slf_mask,
                        enc_query_mask=enc_query_mask,
                        enc_dec_mask=enc_dec_mask,
                        dec_query_slf_mask=dec_query_slf_mask,
                        dec_query_mask=dec_query_mask)
                    alignment, max_attn = get_alignment(
                        attn_probs, mel_lens, cfg['transformer_head'])
                    alignment = dg.to_variable(alignment).astype(np.float32)

                    if local_rank == 0 and global_step % 5 == 1:
                        # Sample 8 of the batch is visualised; clamp the
                        # index so a batch with fewer than 9 samples does
                        # not raise IndexError.
                        mel_lens_np = mel_lens.numpy()
                        sample_idx = min(8, len(mel_lens_np) - 1)
                        x = np.uint8(
                            cm.viridis(max_attn[
                                sample_idx, :mel_lens_np[sample_idx]]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step,
                            x,
                            0,
                            dataformats="HWC")

                    global_step += 1

                    # Forward
                    result = model(
                        character,
                        pos_text,
                        mel_pos=pos_mel,
                        length_target=alignment,
                        enc_non_pad_mask=enc_query_mask,
                        enc_slf_attn_mask=enc_slf_mask,
                        dec_non_pad_mask=dec_query_slf_mask,
                        dec_slf_attn_mask=dec_slf_mask)
                    mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                    mel_loss = layers.mse_loss(mel_output, mel)
                    mel_postnet_loss = layers.mse_loss(mel_output_postnet,
                                                       mel)
                    duration_loss = layers.mean(
                        layers.abs(
                            layers.elementwise_sub(duration_predictor_output,
                                                   alignment)))
                    total_loss = mel_loss + mel_postnet_loss + duration_loss

                    if local_rank == 0:
                        writer.add_scalar('mel_loss',
                                          mel_loss.numpy(), global_step)
                        writer.add_scalar('post_mel_loss',
                                          mel_postnet_loss.numpy(),
                                          global_step)
                        writer.add_scalar('duration_loss',
                                          duration_loss.numpy(), global_step)
                        writer.add_scalar(
                            'learning_rate',
                            optimizer._learning_rate.step().numpy(),
                            global_step)

                    if args.use_data_parallel:
                        total_loss = model.scale_loss(total_loss)
                        total_loss.backward()
                        model.apply_collective_grads()
                    else:
                        total_loss.backward()
                    optimizer.minimize(total_loss, grad_clip=grad_clip)
                    model.clear_gradients()

                    # save checkpoint
                    if local_rank == 0 and global_step % args.save_step == 0:
                        os.makedirs(args.save_path, exist_ok=True)
                        save_path = os.path.join(
                            args.save_path, 'fastspeech/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)
    finally:
        # Close the summary writer even when training aborts with an error.
        if writer is not None:
            writer.close()
import paddle.fluid.layers as layers
import paddleslim as slim

# Run everything on the first CPU place.
places = fluid.cpu_places()
place = places[0]
exe = fluid.Executor(place)

# Build a small static-graph regression program: one fc layer mapping 10
# input features to 1 output, trained with Adam on the mean of mse_loss.
train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
    with fluid.unique_name.guard():
        x = layers.data("x", shape=[-1, 10], dtype="float32")
        label = layers.data("y", shape=[-1, 1], dtype="float32")
        loader = fluid.io.DataLoader.from_generator([x, label], 1)
        # Parameter names are fixed explicitly ("fc.w_0" / "fc.b_0").
        y = layers.fc(x, size=1, param_attr="fc.w_0", bias_attr="fc.b_0")
        loss = layers.mse_loss(y, label)
        avg_loss = layers.mean(loss)
        opt = fluid.optimizer.Adam()
        opt.minimize(avg_loss)

# Run the startup program built above.
exe.run(startup_program)


def data_generator():
    # Random data: 10 samples with 10 features each and 10 scalar targets.
    x_np = np.random.rand(10, 10).astype("float32")
    y_np = np.random.rand(10, 1).astype("float32")

    def __generator__():
        # Yields the data in 5 consecutive batches of 2 samples each,
        # printing the start index of every batch.
        print("haha")
        for i in range(0, 10, 2):
            print(i)
            yield x_np[i:i + 2], y_np[i:i + 2]
    # NOTE(review): no return statement is visible in this excerpt, so as
    # shown data_generator() returns None; the definition may continue
    # beyond what is visible here (presumably returning __generator__) --
    # confirm against the full file.
def forward(self, x, y):
    """Return the result of ``L.mse_loss`` applied to ``x`` and ``y``."""
    loss = L.mse_loss(x, y)
    return loss