def run(config, num_checkpoints, cuda=False):
    train_joint_transform_list, train_img_transform, train_label_transform = get_transforms(
        config, mode="train")
    val_joint_transform_list, val_img_transform, val_label_transform = None, None, None

    train_dataset = DataSet(mode="train",
                            joint_transform_list=train_joint_transform_list,
                            img_transform=train_img_transform,
                            label_transform=train_label_transform)
    val_dataset = DataSet(mode="val",
                          joint_transform_list=val_joint_transform_list,
                          img_transform=val_img_transform,
                          label_transform=val_label_transform)

    train_loader = data.DataLoader(train_dataset,
                                   batch_size=config.batch_size,
                                   shuffle=True,
                                   num_workers=config.num_workers,
                                   drop_last=True)
    val_loader = data.DataLoader(val_dataset,
                                 batch_size=config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_workers)

    criterion, val_criterion = get_loss(config, cuda=cuda)
    model = get_net(config, criterion, cuda=cuda)

    checkpoints = get_checkpoints(config, num_checkpoints)
    print("[*] Checkpoints as follows:")
    pprint.pprint(checkpoints)

    # Average the weights of all checkpoints into `model` (stochastic weight averaging).
    util_checkpoint.load_checkpoint(model, None, checkpoints[0])
    for i, checkpoint in enumerate(checkpoints[1:]):
        model2 = get_net(config, criterion, cuda=cuda)
        util_checkpoint.load_checkpoint(model2, None, checkpoint)
        swa.moving_average(model, model2, 1. / (i + 2))

    # Recompute batch-norm statistics for the averaged weights.
    with torch.no_grad():
        swa.update_bn(train_loader, model, cuda=cuda)

    output_name = "model-swa.pth"
    print(f"[*] SAVED: to {output_name}")
    checkpoint_dir = os.path.join(ROOT_DIR, LOG_DIR,
                                  os.path.basename(config.model_dir))
    util_checkpoint.save_checkpoint(checkpoint_dir, output_name, model)

    # Test the averaged model.
    scores = validation(config, val_loader, model, val_criterion, "swa",
                        cuda=cuda, is_record=False)
    print(scores)
    with open(os.path.join(checkpoint_dir, "swa-scores.json"), "w") as f:
        json.dump(scores["FWIOU"], f)
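# `swa.moving_average` and `swa.update_bn` are external helpers; minimal
# sketches of what they are assumed to do (hypothetical implementations, not
# the actual library code): `moving_average` folds a new checkpoint into a
# running parameter mean, and `update_bn` re-estimates BatchNorm statistics
# under the averaged weights.
def moving_average(net1, net2, alpha=1.0):
    # net1 <- (1 - alpha) * net1 + alpha * net2, parameter by parameter;
    # called with alpha = 1/(i+2), this keeps net1 an incremental mean.
    for p1, p2 in zip(net1.parameters(), net2.parameters()):
        p1.data *= (1.0 - alpha)
        p1.data += p2.data * alpha


def update_bn(loader, model, cuda=False):
    # Reset BN running stats, then run one forward pass over the training
    # data so the stats match the averaged weights.
    for m in model.modules():
        if isinstance(m, torch.nn.modules.batchnorm._BatchNorm):
            m.reset_running_stats()
    model.train()
    for batch in loader:
        inputs = batch[0]
        if cuda:
            inputs = inputs.cuda()
        model(inputs)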
def save_periodic_checkpoint(conf, runner, epoch, best_val_metrics):
    log_file_path = get_periodic_checkpoint_path(conf.run_dir, epoch)
    if not os.path.isdir(os.path.dirname(log_file_path)):
        logging.warning(('Skip saving periodic checkpoint: {} does not '
                         'exist').format(os.path.dirname(log_file_path)))
        return

    logging.info('Saving periodic checkpoint to {}'.format(log_file_path))
    save_checkpoint(log_file_path, conf, runner, epoch, best_val_metrics)

    num_checkpoints = conf.get_attr('num_periodic_checkpoints',
                                    default=DEFAULT_NUM_PERIODIC_CHECKPOINTS)
    prune_checkpoints(os.path.dirname(log_file_path), num_checkpoints)
def save_best_checkpoint(best_dir, best_val, conf, runner, epoch,
                         best_val_metrics):
    log_file_path = get_best_checkpoint_path(best_dir, epoch, best_val)
    if not os.path.isdir(os.path.dirname(log_file_path)):
        print(('Skip saving best value checkpoint: {} does not '
               'exist').format(os.path.dirname(log_file_path)))
        return

    print('Saving best value checkpoint to {}'.format(log_file_path))
    save_checkpoint(log_file_path, conf, runner, epoch, best_val_metrics)

    num_checkpoints = conf.get_attr('num_best_checkpoints',
                                    default=DEFAULT_NUM_BEST_CHECKPOINTS)
    prune_checkpoints(os.path.dirname(log_file_path), num_checkpoints)
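# `prune_checkpoints` is referenced but not defined here; a minimal sketch,
# assuming each checkpoint is a single file whose modification time reflects
# recency (hypothetical implementation, not the actual helper):
import glob

def prune_checkpoints(checkpoint_dir, num_to_keep):
    # Keep only the `num_to_keep` most recently modified files.
    paths = sorted(glob.glob(os.path.join(checkpoint_dir, '*')),
                   key=os.path.getmtime, reverse=True)
    for stale in paths[num_to_keep:]:
        os.remove(stale)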
def train_session(self, model: Tacotron, optimizer: Optimizer,
                  session: TTSSession) -> None:
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1
    model.r = session.r

    simple_table([(f'Steps with r={session.r}', str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr),
                  ('Outputs/Step (r)', model.r)])

    for g in optimizer.param_groups:
        g['lr'] = session.lr

    loss_avg = Averager()
    duration_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, (x, m, ids, x_lens, mel_lens) in enumerate(session.train_set, 1):
            start = time.time()
            model.train()
            x, m = x.to(device), m.to(device)

            m1_hat, m2_hat, attention = model(x, m)
            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           hp.tts_clip_grad_norm)
            optimizer.step()

            loss_avg.add(loss.item())
            step = model.get_step()
            k = step // 1000
            duration_avg.add(time.time() - start)
            speed = 1. / duration_avg.get()

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', self.paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if step % hp.tts_plot_every == 0:
                self.generate_plots(model, session)

            _, att_score = attention_score(attention, mel_lens)
            att_score = torch.mean(att_score)
            self.writer.add_scalar('Attention_Score/train', att_score, model.get_step())
            self.writer.add_scalar('Loss/train', loss, model.get_step())
            self.writer.add_scalar('Params/reduction_factor', session.r, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        val_loss, val_att_score = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Loss/val', val_loss, model.get_step())
        self.writer.add_scalar('Attention_Score/val', val_att_score, model.get_step())
        save_checkpoint('tts', self.paths, model, optimizer, is_silent=True)

        loss_avg.reset()
        duration_avg.reset()
        print(' ')
def train_session(self, model: ForwardTacotron, optimizer: Optimizer,
                  session: TTSSession) -> None:
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1

    simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr)])

    for g in optimizer.param_groups:
        g['lr'] = session.lr

    m_loss_avg = Averager()
    dur_loss_avg = Averager()
    duration_avg = Averager()
    pitch_loss_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, (x, m, ids, x_lens, mel_lens, dur, pitch, puncts) in enumerate(
                session.train_set, 1):
            start = time.time()
            model.train()
            x, m, dur, x_lens, mel_lens, pitch, puncts = (
                x.to(device), m.to(device), dur.to(device),
                x_lens.to(device), mel_lens.to(device),
                pitch.to(device), puncts.to(device),
            )

            m1_hat, m2_hat, dur_hat, pitch_hat = model(
                x, m, dur, mel_lens, pitch, puncts)

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)
            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
            pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)

            loss = m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           hp.tts_clip_grad_norm)
            optimizer.step()

            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model.get_step()
            k = step // 1000
            duration_avg.add(time.time() - start)
            pitch_loss_avg.add(pitch_loss.item())
            speed = 1. / duration_avg.get()

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % hp.forward_checkpoint_every == 0:
                ckpt_name = f'forward_step{k}K'
                save_checkpoint('forward', self.paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if step % hp.forward_plot_every == 0:
                self.generate_plots(model, session)

            self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        m_val_loss, dur_val_loss, pitch_val_loss = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Mel_Loss/val', m_val_loss, model.get_step())
        self.writer.add_scalar('Duration_Loss/val', dur_val_loss, model.get_step())
        self.writer.add_scalar('Pitch_Loss/val', pitch_val_loss, model.get_step())
        save_checkpoint('forward', self.paths, model, optimizer, is_silent=True)

        m_loss_avg.reset()
        duration_avg.reset()
        pitch_loss_avg.reset()
        print(' ')
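# Several of these loops call `self.l1_loss(pred, target, lens)` rather than a
# plain `F.l1_loss`. A minimal sketch of such a length-masked L1 loss, assuming
# (B, C, T) inputs and a (B,) tensor of valid lengths -- a hypothetical helper,
# not the project's actual implementation:
class MaskedL1(torch.nn.Module):
    def forward(self, x, target, lens):
        # (B, T) boolean mask that is True inside each sequence.
        max_len = target.size(2)
        mask = torch.arange(max_len, device=lens.device)[None, :] < lens[:, None]
        mask = mask.unsqueeze(1).expand_as(target).float()
        # L1 over valid positions only, normalized by their count.
        diff = F.l1_loss(x * mask, target * mask, reduction='sum')
        return diff / mask.sum()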
def train_session(self, model: WaveRNN, optimizer: Optimizer,
                  session: VocSession, train_gta: bool) -> None:
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1

    simple_table([('Steps', str(training_steps // 1000) + 'k'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr),
                  ('Sequence Length', self.train_cfg['seq_len']),
                  ('GTA Training', train_gta)])

    for g in optimizer.param_groups:
        g['lr'] = session.lr

    loss_avg = Averager()
    duration_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, batch in enumerate(session.train_set, 1):
            start = time.time()
            model.train()
            batch = to_device(batch, device=device)
            x, y = batch['x'], batch['y']

            y_hat = model(x, batch['mel'])
            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = batch['y'].float()
            y = y.unsqueeze(-1)

            loss = self.loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           self.train_cfg['clip_grad_norm'])
            optimizer.step()

            loss_avg.add(loss.item())
            step = model.get_step()
            k = step // 1000
            duration_avg.add(time.time() - start)
            speed = 1. / duration_avg.get()

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % self.train_cfg['gen_samples_every'] == 0:
                stream(msg + 'generating samples...')
                gen_result = self.generate_samples(model, session)
                if gen_result is not None:
                    mel_loss, gen_wav = gen_result
                    self.writer.add_scalar('Loss/generated_mel_l1', mel_loss, model.get_step())
                    self.track_top_models(mel_loss, gen_wav, model)

            if step % self.train_cfg['checkpoint_every'] == 0:
                save_checkpoint(model=model, optim=optimizer, config=self.config,
                                path=self.paths.voc_checkpoints / f'wavernn_step{k}k.pt')

            self.writer.add_scalar('Loss/train', loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        val_loss = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Loss/val', val_loss, model.get_step())
        save_checkpoint(model=model, optim=optimizer, config=self.config,
                        path=self.paths.voc_checkpoints / 'latest_model.pt')

        loss_avg.reset()
        duration_avg.reset()
        print(' ')
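# `to_device` above is assumed to move every tensor in a batch dict onto the
# model's device; a minimal sketch that also handles nested containers
# (hypothetical helper, not the project's actual implementation):
def to_device(batch, device):
    if isinstance(batch, torch.Tensor):
        return batch.to(device)
    if isinstance(batch, dict):
        return {k: to_device(v, device) for k, v in batch.items()}
    if isinstance(batch, (list, tuple)):
        return type(batch)(to_device(v, device) for v in batch)
    return batch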
def tts_train_loop_af_offline(paths: Paths, model: Tacotron, optimizer,
                              train_set, lr, train_steps, attn_example, hp=None):

    def smooth(d, eps=float(1e-10)):
        # Mix a tiny uniform component into the attention matrix so values
        # are strictly positive (safe for log / KL-style losses).
        u = 1.0 / float(d.size()[2])
        return eps * u + (1 - eps) * d

    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss_out, running_loss_attn = 0, 0

        # Perform 1 epoch
        for i, (x, m, ids, _, attn_ref) in enumerate(train_set, 1):
            x, m, attn_ref = x.to(device), m.to(device), attn_ref.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(
                    model, x, m, False, attn_ref)
            else:
                m1_hat, m2_hat, attention = model(x, m, generate_gta=False,
                                                  attn_ref=attn_ref)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            # attn_loss = F.kl_div(torch.log(smooth(attention)), smooth(attn_ref), reduction='mean')  # 'batchmean'
            attn_loss = F.l1_loss(smooth(attention), smooth(attn_ref))

            loss_out = m1_loss + m2_loss
            loss_attn = attn_loss * hp.attn_loss_coeff
            loss = loss_out + loss_attn

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss_out += loss_out.item()
            avg_loss_out = running_loss_out / i
            running_loss_attn += loss_attn.item()
            avg_loss_attn = running_loss_attn / i

            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attn_ref[idx][:, :160]),
                               paths.tts_attention / f'{step}_tf')
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / f'{step}_af')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss_out: {avg_loss_out:#.4}; ' \
                  f'Output_attn: {avg_loss_attn:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} ' \
                  f'| {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
def train_session(self, model: ForwardTacotron, optimizer: Optimizer,
                  session: TTSSession) -> None:
    current_step = model.get_step()
    training_steps = session.max_step - current_step
    total_iters = len(session.train_set)
    epochs = training_steps // total_iters + 1

    simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                  ('Batch Size', session.bs),
                  ('Learning Rate', session.lr)])

    for g in optimizer.param_groups:
        g['lr'] = session.lr

    m_loss_avg = Averager()
    dur_loss_avg = Averager()
    duration_avg = Averager()
    pitch_loss_avg = Averager()
    device = next(model.parameters()).device  # use same device as model parameters

    for e in range(1, epochs + 1):
        for i, batch in enumerate(session.train_set, 1):
            batch = to_device(batch, device=device)
            start = time.time()
            model.train()

            # Zoneout on the pitch and energy conditioning: randomly zero a
            # fraction of the inputs while still regressing on the full targets.
            pitch_zoneout_mask = torch.rand(batch['x'].size()) > self.train_cfg['pitch_zoneout']
            energy_zoneout_mask = torch.rand(batch['x'].size()) > self.train_cfg['energy_zoneout']
            pitch_target = batch['pitch'].detach().clone()
            energy_target = batch['energy'].detach().clone()
            batch['pitch'] = batch['pitch'] * pitch_zoneout_mask.to(device).float()
            batch['energy'] = batch['energy'] * energy_zoneout_mask.to(device).float()

            pred = model(batch)

            m1_loss = self.l1_loss(pred['mel'], batch['mel'], batch['mel_len'])
            m2_loss = self.l1_loss(pred['mel_post'], batch['mel'], batch['mel_len'])
            dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                    batch['dur'].unsqueeze(1), batch['x_len'])
            pitch_loss = self.l1_loss(pred['pitch'], pitch_target.unsqueeze(1),
                                      batch['x_len'])
            energy_loss = self.l1_loss(pred['energy'], energy_target.unsqueeze(1),
                                       batch['x_len'])

            loss = m1_loss + m2_loss \
                + self.train_cfg['dur_loss_factor'] * dur_loss \
                + self.train_cfg['pitch_loss_factor'] * pitch_loss \
                + self.train_cfg['energy_loss_factor'] * energy_loss

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           self.train_cfg['clip_grad_norm'])
            optimizer.step()

            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model.get_step()
            k = step // 1000
            duration_avg.add(time.time() - start)
            pitch_loss_avg.add(pitch_loss.item())
            speed = 1. / duration_avg.get()

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % self.train_cfg['checkpoint_every'] == 0:
                save_checkpoint(model=model, optim=optimizer, config=self.config,
                                path=self.paths.forward_checkpoints / f'forward_step{k}k.pt')

            if step % self.train_cfg['plot_every'] == 0:
                self.generate_plots(model, session)

            self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
            self.writer.add_scalar('Energy_Loss/train', energy_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
            self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
            self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

            stream(msg)

        val_out = self.evaluate(model, session.val_set)
        self.writer.add_scalar('Mel_Loss/val', val_out['mel_loss'], model.get_step())
        self.writer.add_scalar('Duration_Loss/val', val_out['dur_loss'], model.get_step())
        self.writer.add_scalar('Pitch_Loss/val', val_out['pitch_loss'], model.get_step())
        self.writer.add_scalar('Energy_Loss/val', val_out['energy_loss'], model.get_step())
        save_checkpoint(model=model, optim=optimizer, config=self.config,
                        path=self.paths.forward_checkpoints / 'latest_model.pt')

        m_loss_avg.reset()
        duration_avg.reset()
        pitch_loss_avg.reset()
        print(' ')
def run(cur_gpu, hparams):
    if hparams.distributed_mode == 'gpus':
        dist.init_process_group(backend=hparams.dist_backend,
                                init_method=hparams.dist_url,
                                world_size=hparams.world_size,
                                rank=cur_gpu)

    model = getattr(models, hparams.model_name)(hparams.n_classes,
                                                hparams.n_channels,
                                                hparams.model_version)
    if cur_gpu >= 0:
        torch.cuda.set_device(cur_gpu)
        model.cuda()
    if hparams.fp16:
        model = convert_to_half(model)
    if hparams.distributed_mode == 'gpus':
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[cur_gpu],
                                                    output_device=cur_gpu)

    criterion = cross_entropy

    # Exclude normalization layers from weight decay.
    params_no_bn, params_no_bn_clone = get_parameters(
        model, exclude=(nn.BatchNorm2d, nn.SyncBatchNorm, nn.GroupNorm),
        clone=hparams.fp16)
    params_bn, params_bn_clone = get_parameters(
        model, include=(nn.BatchNorm2d, nn.SyncBatchNorm, nn.GroupNorm),
        clone=hparams.fp16)

    optimizer = optim.SGD(
        [{'params': params_no_bn_clone if hparams.fp16 else params_no_bn,
          'weight_decay': hparams.weight_decay},
         {'params': params_bn_clone if hparams.fp16 else params_bn,
          'weight_decay': 0.0}],
        lr=hparams.initial_learning_rate,
        momentum=hparams.momentum)
    lr_scheduler = MultiStepLRWithWarmup(optimizer,
                                         hparams.lr_milestones,
                                         hparams.lr_warmup_epochs,
                                         factor_min=hparams.lr_factor_min,
                                         gamma=hparams.lr_decay_rate)

    best_acc1 = 0
    best_acc5 = 0
    start_epoch = hparams.start_epoch
    if hparams.checkpoint and os.path.isfile(hparams.checkpoint):
        start_epoch, model, optimizer, lr_scheduler, best_acc1, best_acc5 = load_checkpoint(
            hparams.checkpoint, cur_gpu, model, optimizer, lr_scheduler)

    torch.backends.cudnn.benchmark = True

    train_loader, train_sampler = get_train_loader(
        hparams.data_dir, hparams.image_size, hparams.per_replica_batch_size,
        hparams.n_data_loading_workers, hparams.distributed_mode,
        hparams.world_size, cur_gpu)
    val_loader = get_val_loader(hparams.data_dir, hparams.image_size,
                                hparams.per_replica_batch_size,
                                hparams.n_data_loading_workers,
                                hparams.distributed_mode, hparams.world_size,
                                cur_gpu)

    if hparams.evaluate:
        return validate(cur_gpu, val_loader, model, criterion, 0, hparams)

    monitor = get_progress_monitor(cur_gpu, hparams.log_dir,
                                   hparams.steps_per_epoch, hparams.epochs,
                                   hparams.print_freq, start_epoch)

    for epoch in range(start_epoch, hparams.epochs):
        monitor and monitor.before_epoch()
        if train_sampler:
            train_sampler.set_epoch(epoch)
        train(cur_gpu, train_loader, model, criterion, optimizer, lr_scheduler,
              params_no_bn + params_bn, params_no_bn_clone + params_bn_clone,
              epoch, hparams, monitor)
        loss, acc1, acc5 = validate(cur_gpu, val_loader, model, criterion,
                                    epoch, hparams)
        monitor and monitor.after_epoch(loss, acc1, acc5)
        if hparams.save_model and cur_gpu in (-1, 0):
            is_best = acc1 > best_acc1
            best_acc1 = acc1 if is_best else best_acc1
            save_checkpoint(hparams.model_dir, epoch, model, optimizer,
                            lr_scheduler, best_acc1, best_acc5, is_best)

    if hparams.distributed_mode == 'gpus':
        dist.destroy_process_group()
    monitor and monitor.end()
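# `get_parameters` above is assumed to split parameters by the type of their
# owning module (so norm layers can skip weight decay) and, for fp16 training,
# clone fp32 master copies for the optimizer. A minimal sketch -- hypothetical
# implementation, not the actual helper:
def get_parameters(model, include=None, exclude=None, clone=False):
    params = []
    for module in model.modules():
        if include is not None and not isinstance(module, include):
            continue
        if exclude is not None and isinstance(module, exclude):
            continue
        # recurse=False attributes each parameter to its owning module only.
        params.extend(module.parameters(recurse=False))
    # fp32 master copies used by the optimizer when the model itself is fp16.
    clones = [p.detach().clone().float().requires_grad_(True)
              for p in params] if clone else []
    return params, clones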
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, init_lr, final_lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):
        # Anneal from the initial to the final learning rate - Begee
        adjust_learning_rate(optimizer, e, epochs, init_lr, final_lr)
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)  # x/y: (Batch, sub_bands, T)

            # ---------------------- MultiBand-WaveRNN ---------------------- #
            if hp.voc_multiband:
                y0 = y[:, 0, :].squeeze(0).unsqueeze(-1)  # y0/y1/y2/y3: (Batch, T, 1)
                y1 = y[:, 1, :].squeeze(0).unsqueeze(-1)
                y2 = y[:, 2, :].squeeze(0).unsqueeze(-1)
                y3 = y[:, 3, :].squeeze(0).unsqueeze(-1)

                y_hat = model(x, m)  # (Batch, T, num_classes, sub_bands)

                if model.mode == 'RAW':
                    y_hat0 = y_hat[:, :, :, 0].transpose(1, 2).unsqueeze(-1)  # (Batch, num_classes, T, 1)
                    y_hat1 = y_hat[:, :, :, 1].transpose(1, 2).unsqueeze(-1)
                    y_hat2 = y_hat[:, :, :, 2].transpose(1, 2).unsqueeze(-1)
                    y_hat3 = y_hat[:, :, :, 3].transpose(1, 2).unsqueeze(-1)
                elif model.mode == 'MOL':
                    # Note: the sub-band predictions are only sliced in RAW mode here.
                    y0 = y0.float()
                    y1 = y1.float()
                    y2 = y2.float()
                    y3 = y3.float()

                loss = loss_func(y_hat0, y0) + loss_func(y_hat1, y1) + \
                    loss_func(y_hat2, y2) + loss_func(y_hat3, y3)
            # --------------------------------------------------------------- #

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm).cpu()
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} ' \
                  f'| {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
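# `adjust_learning_rate` above is assumed to anneal the LR from `init_lr` to
# `final_lr` over the run; a minimal sketch using linear interpolation --
# hypothetical implementation, the real schedule may differ:
def adjust_learning_rate(optimizer, epoch, total_epochs, init_lr, final_lr):
    t = (epoch - 1) / max(total_epochs - 1, 1)  # 0 at the first epoch, 1 at the last
    lr = init_lr + (final_lr - init_lr) * t
    for g in optimizer.param_groups:
        g['lr'] = lr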
def train_loop(paths: Paths, model, optimizer, train_set, lr, train_steps,
               mel_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0
        dur_running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, mel_len, dur) in enumerate(train_set, 1):
            x, m, dur = x.to(device), m.to(device), dur.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m_hat, m_post_hat, dur_hat = data_parallel_workaround(
                    model, x, m, dur)
            else:
                m_hat, m_post_hat, dur_hat = model(x, m, dur)

            lin_loss = F.l1_loss(m_hat, m)
            post_loss = F.l1_loss(m_post_hat, m)
            dur_loss = F.l1_loss(dur_hat, dur)
            loss = lin_loss + post_loss + dur_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.forward_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.forward_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += post_loss.item()
            avg_loss = running_loss / i
            dur_running_loss += dur_loss.item()
            dur_avg_loss = dur_running_loss / i

            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000

            if step % hp.forward_checkpoint_every == 0:
                ckpt_name = f'fast_speech_step{k}K'
                save_checkpoint('forward', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if mel_example in ids:
                idx = ids.index(mel_example)
                try:
                    seq = x[idx].tolist()
                    m_gen = model.generate(seq)
                    save_spectrogram(m_gen, paths.forward_mel_plot / f'{step}_gen', 600)
                except Exception:
                    traceback.print_exc()
                save_spectrogram(np_now(m_post_hat[idx]),
                                 paths.forward_mel_plot / f'{step}_gta', 600)
                save_spectrogram(np_now(m[idx]),
                                 paths.forward_mel_plot / f'{step}_target', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {avg_loss:#.4} ' \
                  f'| Duration Loss: {dur_avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        model.log(paths.forward_log, msg)
        save_checkpoint('forward', paths, model, optimizer, is_silent=True)
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, att_guides) in enumerate(train_set, 1):
            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            orig_attention = attention
            n = int(len(att_guides[0]) / r)

            # Reduce the attention guides by the reduction factor r.
            ga = [a[t] for a in att_guides for t in range(0, len(a), r)]
            assert n == len(attention[0])
            guided_attention = [ga[k:k + n] for k in range(0, len(ga), n)]

            attention = np_now(attention)
            attention = [pad2d_nonzero(a, n, len(att_guides[0][0])) for a in attention]

            guided_attention = torch.tensor(guided_attention)
            guided_attention = guided_attention.to(device)
            attention = torch.tensor(attention)
            attention = attention.to(device)

            # Create attention mask (-1 marks padded cells).
            attention_masks = torch.ne(attention, -1).type(torch.FloatTensor)
            attention_masks = attention_masks.to(device)  # fixed: was `attention.to(device)`, which overwrote the mask

            multiply = torch.abs(attention * guided_attention) * attention_masks
            attention_loss = torch.sum(multiply)
            mask_sum = torch.sum(attention_masks)
            attention_loss /= mask_sum

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            prev_loss = m1_loss + m2_loss
            loss = m1_loss + m2_loss + attention_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(orig_attention[idx][:, :160]),
                               paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):
            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = 'taco_step%sK' % (repr1(k))
                save_checkpoint('tts', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / '%s' % (repr1(step)))
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / '%s' % (repr1(step)), 600)

            msg = '| Epoch: %s/%s (%s/%s) | Loss: %.4f | %.2f steps/s | Step: %sk | ' % (
                repr1(e), repr1(epochs), repr1(i), repr1(total_iters),
                avg_loss, speed, repr1(k))
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    # Set learning rate
    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1
    total_number_of_batches = len(train_set)

    writer = SummaryWriter("runs/{0}-{1}".format(
        model_name_prefix, datetime.now().strftime("%Y%m%d-%H%M%S")))
    scheduler = StepLR(optimizer, step_size=1, gamma=0.983)

    for e in range(EPOCH, epochs + 1):
        start = time.time()
        running_loss = 0.
        avg_loss = 0

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y = y.float()
            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000

            # Write to tensorboard per batch
            writer.add_scalar('Epoch loss', loss.item(),
                              e * total_number_of_batches + i)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} ' \
                  f'| {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        """
        ####################### Testing ############################
        torch.cuda.empty_cache()
        loss_test = 0
        for _, (x_test, y_test, m_test) in enumerate(test_set, 1):
            x_test, m_test, y_test = x_test.to(device), m_test.to(device), y_test.to(device)
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                raise RuntimeError("Unsupported")
            else:
                y_test_hat = model(x_test, m_test)
            if model.mode == 'RAW':
                y_test_hat = y_test_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y_test = y_test.float()
            y_test = y_test.unsqueeze(-1)
            loss_test += loss_func(y_test_hat, y_test).item()
        avg_loss_test = loss_test / len(test_set)
        msg = f'| Epoch: {e}/{epochs} | Test-Loss: {loss_test:.4f} | Test-AvgLoss: {avg_loss_test:.4f} | '
        stream("\n")
        stream(msg)
        writer.add_scalar('Test loss', loss_test, e)
        writer.add_scalar('Average test loss', avg_loss_test, e)
        ############################################################
        """

        # Write to tensorboard per epoch
        writer.add_scalar('Running loss', running_loss, e)
        writer.add_scalar('Average loss', avg_loss, e)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer,
                        name="{0}-epoch-{1}-loss-{2}".format(
                            model_name_prefix, e, avg_loss),
                        is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')

        scheduler.step()
        print('Epoch:', e, 'LR:', scheduler.get_lr())
def main():
    # Make some variables global
    global args, train_csv, test_csv, exp_dir, best_result, device, tb_writer, tb_freq

    # Args parser
    args = args_parser()
    start_epoch = 0

    ############ EVALUATE MODE ############
    if args.evaluate:  # Evaluate mode
        print('\n==> Evaluation mode!')

        # Define paths
        chkpt_path = args.evaluate

        # Check that the checkpoint file exists
        assert os.path.isfile(chkpt_path), \
            "- No checkpoint found at: {}".format(chkpt_path)

        # Experiment directory
        exp_dir = os.path.dirname(os.path.abspath(chkpt_path))
        sys.path.append(exp_dir)

        # Load the checkpoint
        print('- Loading checkpoint:', chkpt_path)
        checkpoint = torch.load(chkpt_path)

        # Assign some local variables
        args = checkpoint['args']
        start_epoch = checkpoint['epoch']
        best_result = checkpoint['best_result']
        print('- Checkpoint was loaded successfully.')

        # Compare the checkpoint args with the json file in case I wanted to change some args
        compare_args_w_json(args, exp_dir, start_epoch + 1)
        args.evaluate = chkpt_path

        device = torch.device("cuda:" + str(args.gpu)
                              if torch.cuda.is_available() else "cpu")
        model = checkpoint['model'].to(device)

        print_args(args)

        _, val_loader = create_dataloader(args, eval_mode=True)
        loss = get_loss_fn(args).to(device)
        evaluate_epoch(val_loader, model, loss, start_epoch)
        return  # End program

    ############ RESUME MODE ############
    elif args.resume:  # Resume mode
        print('\n==> Resume mode!')

        # Define paths
        chkpt_path = args.resume
        assert os.path.isfile(chkpt_path), \
            "- No checkpoint found at: {}".format(chkpt_path)

        # Experiment directory
        exp_dir = os.path.dirname(os.path.abspath(chkpt_path))
        sys.path.append(exp_dir)

        # Load the checkpoint
        print('- Loading checkpoint:', chkpt_path)
        checkpoint = torch.load(chkpt_path)
        args = checkpoint['args']
        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        print('- Checkpoint ({}) was loaded successfully!\n'.format(checkpoint['epoch']))

        # Compare the checkpoint args with the json file in case I wanted to change some args
        compare_args_w_json(args, exp_dir, start_epoch)
        args.resume = chkpt_path

        device = torch.device("cuda:" + str(args.gpu)
                              if torch.cuda.is_available() else "cpu")
        model = checkpoint['model'].to(device)
        optimizer = checkpoint['optimizer']

        print_args(args)

        train_loader, val_loader = create_dataloader(args, eval_mode=False)

    ############ NEW EXP MODE ############
    else:  # New experiment
        print('\n==> Starting a new experiment "{}" \n'.format(args.exp))

        # Check if the experiment exists
        ws_path = os.path.join('workspace/', args.workspace)
        exp = args.exp
        exp_dir = os.path.join(ws_path, exp)
        assert os.path.isdir(exp_dir), '- Experiment "{}" not found!'.format(exp)

        # Which device to use
        device = torch.device("cuda:" + str(args.gpu)
                              if torch.cuda.is_available() else "cpu")

        # Add the experiment's folder to python path
        sys.path.append(exp_dir)

        print_args(args)

        # Create dataloader
        train_loader, val_loader = create_dataloader(args, eval_mode=False)

        # Import the model
        f = importlib.import_module('network')
        model = f.CNN().to(device)
        print('\n==> Model "{}" was loaded successfully!'.format(model.__name__))

        # Optimize only parameters that require grad
        parameters = filter(lambda p: p.requires_grad, model.parameters())

        # Create optimizer
        if args.optimizer.lower() == 'sgd':
            optimizer = SGD(parameters, lr=args.lr, momentum=args.momentum,
                            weight_decay=args.weight_decay)
        elif args.optimizer.lower() == 'adam':
            optimizer = Adam(parameters, lr=args.lr,
                             weight_decay=args.weight_decay, amsgrad=True)

    ############ IF RESUME/NEW EXP ############
    # Error metrics that are set to the worst
    best_result = create_error_metric(args)
    best_result.set_to_worst()

    # Tensorboard
    tb = args.tb_log if hasattr(args, 'tb_log') else False
    tb_freq = args.tb_freq if hasattr(args, 'tb_freq') else 1000
    tb_writer = None
    if tb:
        tb_writer = SummaryWriter(os.path.join(
            exp_dir, 'tb_log',
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))

    # Create loss
    loss = get_loss_fn(args).to(device)

    # Define learning rate decay
    lr_decayer = lr_scheduler.StepLR(optimizer,
                                     step_size=args.lr_decay_step,
                                     gamma=args.lr_decay_factor,
                                     last_epoch=start_epoch - 1)

    # Create or open logging files
    train_csv = LogFile(os.path.join(exp_dir, 'train.csv'), args)
    test_csv = LogFile(os.path.join(exp_dir, 'test.csv'), args)
    best_txt = os.path.join(exp_dir, 'best.txt')

    save_args(exp_dir, args)  # Save args to JSON file

    ############ TRAINING LOOP ############
    for epoch in range(start_epoch, args.epochs):
        print('\n==> Training Epoch [{}] (lr={})'.format(
            epoch, optimizer.param_groups[0]['lr']))

        train_err_avg = train_epoch(train_loader, model, optimizer, loss, epoch)

        # Learning rate scheduler
        lr_decayer.step()

        train_csv.update_log(train_err_avg, epoch)

        # Save checkpoint in case evaluation crashes
        save_checkpoint({
            'args': args,
            'epoch': epoch,
            'model': model,
            'best_result': best_result,
            'optimizer': optimizer,
        }, False, epoch, exp_dir)

        # Evaluate the trained epoch
        test_err_avg, out_image = evaluate_epoch(val_loader, model, loss,
                                                 epoch)  # evaluate on validation set

        # Evaluate uncertainty
        ause, ause_fig = None, None
        if args.eval_uncert:
            if args.loss == 'masked_prob_loss_var':
                ause, ause_fig = eval_ause(model, val_loader, args, epoch,
                                           uncert_type='v')
            else:
                ause, ause_fig = eval_ause(model, val_loader, args, epoch,
                                           uncert_type='c')

        # Log to tensorboard if enabled
        if tb_writer is not None:
            avg_meter = test_err_avg.get_avg()
            tb_writer.add_scalar('Loss/val', avg_meter.loss, epoch)
            tb_writer.add_scalar('MAE/val', avg_meter.metrics['mae'], epoch)
            tb_writer.add_scalar('RMSE/val', avg_meter.metrics['rmse'], epoch)
            if ause is not None:
                tb_writer.add_scalar('AUSE/val', ause, epoch)
            tb_writer.add_images('Prediction',
                                 colored_depthmap_tensor(out_image[:, :1, :, :]), epoch)
            tb_writer.add_images('Input_Conf_Log_Scale',
                                 colored_depthmap_tensor(torch.log(out_image[:, 2:, :, :] + 1)), epoch)
            tb_writer.add_images('Output_Conf_Log_Scale',
                                 colored_depthmap_tensor(torch.log(out_image[:, 1:2, :, :] + 1)), epoch)
            if ause_fig is not None:  # guard: ause_fig only exists when eval_uncert ran
                tb_writer.add_figure('Sparsification_Plot', ause_fig, epoch)

        # Update log files
        test_csv.update_log(test_err_avg, epoch, ause)

        # Save best model
        # TODO: How to decide the best based on dataset?
        is_best = test_err_avg.metrics['rmse'] < best_result.metrics['rmse']
        if is_best:
            best_result = test_err_avg  # Save the new best locally
            test_err_avg.print_to_txt(best_txt, epoch)  # Print to a text file

        # Save it again if it is the best checkpoint
        save_checkpoint({
            'args': args,
            'epoch': epoch,
            'model': model,
            'best_result': best_result,
            'optimizer': optimizer,
        }, is_best, epoch, exp_dir)
def train_session(self, model_tts: ForwardTacotron, model_asr: Wav2Vec2ForCTC,
                  optimizer_tts: Optimizer, tts_session: ForwardSession,
                  asr_session: ASRSession, asr_trainer, optimizer_asr) -> None:
    asr_trainer_state = {'logs': []}

    current_step = model_tts.get_step()
    tts_training_steps = tts_session.max_step - current_step
    try:
        _, asr_current_step = get_last_checkpoint(
            './checkpoints/sme_speech_tts.asr_forward/', 'model_at')
        asr_training_steps = tts_session.max_step - asr_current_step
    except Exception:
        asr_current_step = 0
        asr_training_steps = tts_training_steps

    total_iters = len(tts_session.train_set)
    epochs = tts_training_steps // total_iters + 1

    simple_table([('TTS Steps', str(tts_training_steps // 1000) + 'k Steps'),
                  ('ASR Steps', str(asr_training_steps // 1000) + 'k Steps'),
                  ('Batch Size TTS', tts_session.bs),
                  ('Learning Rate', tts_session.lr)])

    for g in optimizer_tts.param_groups:
        g['lr'] = tts_session.lr

    m_loss_avg = Averager()
    dur_loss_avg = Averager()
    duration_avg = Averager()
    device = next(model_tts.parameters()).device  # use same device as model parameters
    warnings.filterwarnings('ignore', category=UserWarning)

    for e in range(1, epochs + 1):
        # TTS train loop for one epoch
        for i, (x, m, ids, x_lens, mel_lens, dur) in enumerate(tts_session.train_set, 1):
            start = time.time()
            model_tts.train()
            x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device), \
                x_lens.to(device), mel_lens.to(device)

            m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)
            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
            tts_s_loss = m1_loss + m2_loss + 0.1 * dur_loss

            optimizer_tts.zero_grad()
            # tts_s_loss.backward()
            torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                           hp.tts_clip_grad_norm)
            # optimizer_tts.step()

            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model_tts.get_step()
            k = step // 1000
            duration_avg.add(time.time() - start)
            speed = 1. / duration_avg.get()

            msg_tts = f'| TTS MODEL (supervised training ): ' \
                      f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

            if step % hp.forward_checkpoint_every == 0:
                ckpt_name = f'forward_step{k}K'
                save_checkpoint('forward', self.paths, model_tts, optimizer_tts,
                                name=ckpt_name, is_silent=True)

            if step % hp.forward_plot_every == 0:
                self.generate_plots(model_tts, tts_session)

            self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model_tts.get_step())
            self.writer.add_scalar('Duration_Loss/train', dur_loss, model_tts.get_step())
            self.writer.add_scalar('Params/batch_size', tts_session.bs, model_tts.get_step())
            self.writer.add_scalar('Params/learning_rate', tts_session.lr, model_tts.get_step())

            stream(msg_tts)

        # ASR supervised training loop
        for step, inputs in enumerate(asr_session.train_set):
            optimizer_asr.zero_grad()
            model_asr.to(device)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)
            model_asr.train()
            outputs = model_asr(**inputs)
            asr_s_loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
            msg_asr = f'| ASR MODEL (supervised training) : ' \
                      f'| Epoch: {e}/{epochs} ({step}/{len(asr_session.train_set)}) | Loss ASR: {asr_s_loss:#.4} ' \
                      f' ||||||||||||||||||||||'
            stream(msg_asr)

        # TTS eval supervised
        m_val_loss, dur_val_loss = self.evaluate(model_tts, tts_session.val_set)
        eval_tts_msg = f'| TTS MODEL (supervised eval ): ' \
                       f'| Epoch: {e}/{epochs} | Val Loss: {m_val_loss:#.4} ' \
                       f'| Dur Val Loss: {dur_val_loss:#.4} '
        stream(eval_tts_msg)
        tts_eval_loss = m_val_loss + dur_val_loss

        # ASR eval supervised
        print('\nEvaluating ASR model ...')
        asr_eval_loss = 0
        eval_wer = 0
        for step, inputs in enumerate(asr_session.test_set):
            asr_eval_loss_i, logits_a, labels_a = asr_trainer.prediction_step(
                model_asr, inputs, False)
            asr_eval_loss += asr_eval_loss_i
            logits_a.to('cpu')
            eval_wer_i = asr_trainer.compute_metrics(
                EvalPrediction(predictions=logits_a, label_ids=labels_a))
            eval_wer += eval_wer_i['wer']
        eval_wer = eval_wer / step
        asr_eval_loss = asr_eval_loss / step

        msg_asr_eval = f'| ASR MODEL (supervised eval) : Epoch {e}/{epochs} ' \
                       f'| Loss ASR: {asr_eval_loss:#.4} | WER: {eval_wer} ' \
                       f'|||||||||||||||||||||||||||||||||||||||||||||||||||||'
        stream(msg_asr_eval)

        # Dual transformation loop
        tts_u_loss, asr_u_loss = self.dual_transform(
            model_tts, model_asr, optimizer_tts, optimizer_asr,
            asr_session.test_set, m_loss_avg, dur_loss_avg, device,
            asr_current_step, e, epochs, duration_avg, total_iters,
            tts_s_loss, asr_s_loss, tts_session.lr, tts_session.path)

        step += 1
        asr_path = 'checkpoint-27364'
        modelasr_folder = './checkpoints/sme_speech_tts.asr_forward/'
        new_check = modelasr_folder + asr_path
        os.makedirs(new_check, exist_ok=True)

        save_checkpoint('forward', self.paths, model_tts, optimizer_tts,
                        is_silent=True)

        if "logs" not in asr_trainer_state:
            asr_trainer_state['logs'] = []
        asr_trainer_state['logs'].append({
            'step': step,
            'epoch': e,
            'asr_s_loss': int(asr_s_loss),
            'asr_u_loss': int(asr_u_loss),
            'tts_s_loss': int(tts_s_loss),
            'tts_u_loss': int(tts_u_loss),
            'tts_eval_loss': int(tts_eval_loss),
            'asr_eval_loss': int(asr_eval_loss),
            'eval_wer': eval_wer,
        })
        with open(f'{modelasr_folder + asr_path}/dt_trainer_state.json', 'w') as f:
            json.dump(asr_trainer_state, f)

        model_asr.save_pretrained(f'{new_check}')
        torch.save(optimizer_asr.state_dict(), f'{new_check}/optimizer.pt')

        print("Exiting due to cuda OOM!")
        exit(11)
def run(cur_gpu, hparams):
    if hparams.distributed_mode == 'gpus':
        dist.init_process_group(backend=hparams.dist_backend,
                                init_method=hparams.dist_url,
                                world_size=hparams.world_size,
                                rank=cur_gpu)

    if cur_gpu >= 0:
        torch.cuda.set_device(cur_gpu)
        model = getattr(models, hparams.model_name)(hparams, use_cuda=True,
                                                    use_fp16=hparams.fp16)
        model.cuda()
    else:
        model = getattr(models, hparams.model_name)(hparams)

    if hparams.fp16:
        model = convert_to_half(model)
    if hparams.distributed_mode == 'gpus':
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[cur_gpu],
                                                    output_device=cur_gpu,
                                                    find_unused_parameters=True)

    criterion = cross_entropy

    params, params_clone = get_parameters(model, clone=hparams.fp16)
    optimizer = optim.SGD(
        [{'params': params_clone if hparams.fp16 else params,
          'weight_decay': hparams.weight_decay}],
        lr=hparams.initial_learning_rate,
        momentum=hparams.momentum)
    lr_scheduler = get_lr_scheduler(hparams.lr_scheduler, optimizer, hparams)

    best_acc1 = 0
    best_acc5 = 0
    start_epoch = hparams.start_epoch
    if hparams.checkpoint and os.path.isfile(hparams.checkpoint):
        start_epoch, model, optimizer, lr_scheduler, best_acc1, best_acc5 = load_checkpoint(
            hparams.checkpoint, cur_gpu, model, optimizer, lr_scheduler)

    torch.backends.cudnn.benchmark = True

    train_loader, train_sampler = get_train_loader(
        hparams.data_dir, hparams.image_size, hparams.per_replica_batch_size,
        hparams.n_data_loading_workers, hparams.distributed_mode,
        hparams.world_size, cur_gpu)
    val_loader = get_val_loader(hparams.data_dir, hparams.image_size,
                                hparams.per_replica_batch_size,
                                hparams.n_data_loading_workers,
                                hparams.distributed_mode, hparams.world_size,
                                cur_gpu)

    if hparams.evaluate:
        return validate(cur_gpu, val_loader, model, criterion, 0, hparams)

    monitor = get_monitor()

    for epoch in range(start_epoch, hparams.epochs):
        if cur_gpu == -1 or cur_gpu == 0:
            print('Epoch %d\n' % (epoch + 1))
        monitor and monitor.before_epoch()
        if train_sampler:
            train_sampler.set_epoch(epoch)
        train(cur_gpu, train_loader, model, criterion, optimizer, lr_scheduler,
              params, params_clone, epoch, hparams)
        loss, acc1, acc5 = validate(cur_gpu, val_loader, model, criterion,
                                    epoch, hparams)
        monitor and monitor.after_epoch(loss, acc1, acc5)
        if hparams.save_model and cur_gpu in (-1, 0):
            is_best = acc1 > best_acc1
            best_acc1 = acc1 if is_best else best_acc1
            save_checkpoint(hparams.model_dir, epoch, model, optimizer,
                            lr_scheduler, best_acc1, best_acc5, is_best)

    if hparams.distributed_mode == 'gpus':
        dist.destroy_process_group()
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example, max_y, max_x):
    device = next(model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):
        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _, padded_att_guides) in enumerate(train_set, 1):
            x, m = x.to(device), m.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention, r = data_parallel_workaround(model, x, m)
            else:
                m1_hat, m2_hat, attention, r = model(x, m)

            # Load per-utterance attention guides, reduce them by the reduction
            # factor r and pad them to the predicted attention shape.
            reduced_guides = []
            att_guide_path = hp.attention_path
            for j, item_id in enumerate(ids):
                att = np.load(f'{att_guide_path}/{item_id}.npy')
                reduced = att[0::r]
                pred_attention = attention[j]
                n_frames = pred_attention.shape[0]
                n_phones = pred_attention.shape[-1]
                padded_guides = pad2d_nonzero(reduced, n_frames, n_phones)
                reduced_guides.append(padded_guides)

            reduced_guides = torch.tensor(reduced_guides)
            mask = torch.ne(reduced_guides, -1).type(torch.FloatTensor)

            # Note: `g` replaces the original loop variable `x`, which shadowed
            # the batch input.
            padded_guides = [pad2d_zero(g, n_frames, n_phones) for g in reduced_guides]
            padded_guides = torch.tensor(padded_guides)
            padded_guides = padded_guides.to(device)

            attention = attention.to(device)
            mask = mask.to(device)
            attention = attention * mask

            print("guide att shape", att.shape)
            print(att)
            print("reduced guide", padded_guides.shape)
            print("mask", mask.shape)
            print(mask)
            print(padded_guides.shape, attention.shape, mask.shape)
            print(attention)
            print(padded_guides)

            multiply = torch.pow((attention - padded_guides), 2)
            print(multiply)
            attention_loss = torch.sum(multiply)
            print(attention_loss)
            mask_sum1 = torch.sum(mask)
            attention_loss /= mask_sum1
            print(attention_loss)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)
            prev_loss = m1_loss + m2_loss
            print("prev loss", prev_loss)
            loss = m1_loss + m2_loss + attention_loss
            print("loss + att", loss)

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)
            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = f'taco_step{k}K'
                save_checkpoint('tts', paths, model, optimizer,
                                name=ckpt_name, is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / f'{step}')
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / f'{step}', 600)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
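# `pad2d_nonzero` / `pad2d_zero` above are assumed to pad a 2-D attention
# array to a fixed (n_rows, n_cols) shape, filling with -1 (so padding can be
# masked via `torch.ne(..., -1)`) or with 0. Minimal sketches -- hypothetical
# implementations, not the project's actual helpers:
def pad2d_nonzero(a, n_rows, n_cols):
    out = np.full((n_rows, n_cols), -1.0, dtype=np.float32)
    r, c = min(a.shape[0], n_rows), min(a.shape[1], n_cols)
    out[:r, :c] = a[:r, :c]
    return out


def pad2d_zero(a, n_rows, n_cols):
    out = np.zeros((n_rows, n_cols), dtype=np.float32)
    r, c = min(a.shape[0], n_rows), min(a.shape[1], n_cols)
    out[:r, :c] = a[:r, :c]
    return out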