def synthesis(text_input, args):
    """Synthesize speech for ``text_input`` with a FastSpeech acoustic model.

    The predicted mel spectrogram is vocoded twice — once with ClariNet and
    once with Griffin-Lim — and both waveforms are logged to tensorboard and
    written as .wav files under ``<args.output>/samples``.

    Args:
        text_input (str): text to synthesize.
        args: parsed CLI namespace; uses use_gpu, config, output, checkpoint,
            alpha, config_clarinet and checkpoint_clarinet.
    """
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)
    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)
    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     checkpoint_path=args.checkpoint)
    model.eval()

    # Batch of one utterance: token ids and 1-based positions, both (1, T).
    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)
    text = dg.to_variable(text)
    pos_text = dg.to_variable(pos_text)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    # ClariNet consumes the batched mel; exp() undoes the model's log scale.
    result = np.exp(mel_output_postnet.numpy())

    # Griffin-Lim path: (1, T, num_mels) -> (num_mels, T), back to linear
    # scale, then pseudo-invert the mel filterbank to approximate the linear
    # magnitude spectrogram (floored at 1e-10 to avoid zeros/negatives).
    mel_output_postnet = fluid.layers.transpose(
        fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
    mel_output_postnet = np.exp(mel_output_postnet.numpy())
    basis = librosa.filters.mel(cfg['audio']['sr'], cfg['audio']['n_fft'],
                                cfg['audio']['num_mels'])
    inv_basis = np.linalg.pinv(basis)
    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output_postnet))

    sample_dir = os.path.join(args.output, 'samples')
    if not os.path.exists(sample_dir):
        os.mkdir(sample_dir)

    # synthesis use clarinet
    wav_clarinet = synthesis_with_clarinet(
        args.config_clarinet, args.checkpoint_clarinet, result, place)
    writer.add_audio(text_input + '(clarinet)', wav_clarinet, 0,
                     cfg['audio']['sr'])
    write(os.path.join(sample_dir, 'clarinet.wav'), cfg['audio']['sr'],
          wav_clarinet)

    # synthesis use griffin-lim
    wav = librosa.core.griffinlim(
        spec**cfg['audio']['power'],
        hop_length=cfg['audio']['hop_length'],
        win_length=cfg['audio']['win_length'])
    writer.add_audio(text_input + '(griffin-lim)', wav, 0, cfg['audio']['sr'])
    # BUG FIX: output filename was misspelled 'grinffin-lim.wav'.
    write(os.path.join(sample_dir, 'griffin-lim.wav'), cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
def main(input_files, model_dir, output_dir, batch_size, implementation,
         data_config, audio_config, preload_mels=False):
    """Batch WaveNet inference: vocode mels with nv-wavenet and write wavs.

    Loads the latest checkpoint from ``model_dir``, runs batched inference
    over ``input_files`` and writes 16-bit PCM wavs plus tensorboard audio
    logs to ``output_dir``.

    Args:
        input_files: path to a file list; expanded via utils.files_to_list.
        model_dir: directory holding model checkpoints.
        output_dir: destination for wav files and tensorboard events.
        batch_size: number of utterances vocoded per wavenet.infer call.
        implementation: nv-wavenet kernel implementation selector.
        data_config / audio_config: config dicts for mel extraction.
        preload_mels: when True, inputs are precomputed .npy mels, not wavs.
    """
    model_filename = get_latest_checkpoint(model_dir)
    print("Model path: {}".format(model_filename))
    model = torch.load(model_filename)['model']
    # Export trained weights into the CUDA nv-wavenet inference engine.
    wavenet = nv_wavenet.NVWaveNet(**(model.export_weights()))
    print("Wavenet num layers: {}, max_dilation: {}".format(
        wavenet.num_layers, wavenet.max_dilation))
    writer = SummaryWriter(output_dir)
    mel_extractor = Mel2SampOnehot(audio_config=audio_config, **data_config)
    input_files = utils.files_to_list(input_files)
    audio_processor = AudioProcessor(audio_config)
    for j, files in enumerate(chunker(input_files, batch_size)):
        mels = []
        for i, file_path in enumerate(files):
            if preload_mels:
                # NOTE(review): here entries are indexed (file_path[0]) but the
                # else-branch uses file_path directly — confirm the element
                # type returned by utils.files_to_list for both modes.
                mel = np.load(file_path[0]).T
                mel = torch.from_numpy(mel)
                mel = utils.to_gpu(mel)
            else:
                audio, _ = utils.load_wav_to_torch(file_path)
                file_name = os.path.splitext(os.path.basename(file_path))[0]
                # Log the ground-truth audio, normalized to [-1, 1].
                writer.add_audio("eval_true/{}/{}".format(i, file_name),
                                 audio / utils.MAX_WAV_VALUE, 0, 22050)
                mel = mel_extractor.get_mel(audio)
                mel = mel.t().cuda()
            mels.append(torch.unsqueeze(mel, 0))
        mels = torch.cat(mels, 0)
        cond_input = model.get_cond_input(mels)
        audio_data = wavenet.infer(cond_input, implementation)
        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path[0]))[0]
            # Undo 8-bit mu-law companding back to float samples.
            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              256)
            print("Range of {}.wav before deemphasis : {} to {}".format(
                file_name, audio.min(), audio.max()))
            if mel_extractor.apply_preemphasis:
                audio = audio.astype("float32")
                audio = audio_processor.deemphasis(audio[None, :])
                audio = audio.numpy()[0]
                print("Range of {}.wav after deemphasis : {} to {}".format(
                    file_name, audio.min(), audio.max()))
            # tanh squashes any deemphasis overshoot back into [-1, 1]
            # (required by the assert below). NOTE(review): indentation of
            # this line relative to the if-block reconstructed — confirm.
            audio = np.tanh(audio)
            output_filepath = "{}.wav".format(file_name)
            output_filepath = os.path.join(output_dir, output_filepath)
            assert audio.dtype in [np.float64, np.float32]
            assert (np.abs(audio)).max() <= 1
            writer.add_audio(output_filepath, audio, 0, 22050)
            # Scale to 16-bit PCM for the wav container.
            audio = (audio * 32767).astype("int16")
            scipy.io.wavfile.write(output_filepath, 22050, audio)
def synthesis(text_input, args):
    """Synthesize ``text_input`` with FastSpeech and the selected vocoder.

    Args:
        text_input (str): text to synthesize.
        args: parsed CLI namespace; uses use_gpu, config, output, checkpoint,
            alpha, vocoder and checkpoint_vocoder.

    Raises:
        ValueError: if ``args.vocoder`` is neither 'griffin-lim' nor
            'waveflow'.
    """
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)
    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)
    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(
        model=model, checkpoint_path=args.checkpoint)
    model.eval()

    # Batch of one utterance: token ids and 1-based positions, both (1, T).
    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)
    text = dg.to_variable(text).astype(np.int64)
    pos_text = dg.to_variable(pos_text).astype(np.int64)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    if args.vocoder == 'griffin-lim':
        #synthesis use griffin-lim
        wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
    elif args.vocoder == 'waveflow':
        wav = synthesis_with_waveflow(mel_output_postnet, args,
                                      args.checkpoint_vocoder, place)
    else:
        # BUG FIX: the original printed the message and then crashed with a
        # NameError on `wav`; fail fast with a real exception instead.
        # (Also fixes the 'recevied' typo.)
        raise ValueError(
            'vocoder error, we only support griffinlim and waveflow, but received %s.'
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(
            os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
def tb_train2():
    """Demo: exercise the main tensorboardX SummaryWriter APIs.

    Logs scalars every step and, every 10th step, an image grid, a 2-second
    cosine tone, text, parameter histograms and a PR curve; finally embeds
    100 MNIST digits and exports all scalars to JSON.
    """
    import torchvision.utils as vutils
    import torchvision.models as models
    from torchvision import datasets
    resnet18 = models.resnet18(False)
    writer = SummaryWriter()
    sample_rate = 44100
    freqs = [262, 294, 330, 349, 392, 440, 440, 440, 440, 440, 440]
    for n_iter in range(100):
        dummy_s1 = torch.rand(1)
        dummy_s2 = torch.rand(1)
        # data grouping by `slash`
        writer.add_scalar('data/scalar1', dummy_s1[0], n_iter)
        writer.add_scalar('data/scalar2', dummy_s2[0], n_iter)
        writer.add_scalars('data/scalar_group',
                           {'xsinx': n_iter * np.sin(n_iter),
                            'xcosx': n_iter * np.cos(n_iter),
                            'arctanx': np.arctan(n_iter)}, n_iter)
        dummy_img = torch.rand(32, 3, 64, 64)  # output from network
        if n_iter % 10 == 0:
            x = vutils.make_grid(dummy_img, normalize=True, scale_each=True)
            writer.add_image('Image', x, n_iter)
            dummy_audio = torch.zeros(sample_rate * 2)
            # BUG FIX: iterate the whole audio buffer. The original looped
            # over range(x.size(0)) — the grid tensor's first dim (3) — so
            # only 3 samples of the 2-second clip were ever filled.
            for i in range(dummy_audio.size(0)):
                # amplitude of sound should in [-1, 1]
                dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi *
                                        float(i) / float(sample_rate))
            writer.add_audio('myAudio', dummy_audio, n_iter,
                             sample_rate=sample_rate)
            writer.add_text('Text', 'text logged at step:' + str(n_iter),
                            n_iter)
            for name, param in resnet18.named_parameters():
                writer.add_histogram(name, param.clone().cpu().data.numpy(),
                                     n_iter)
            # needs tensorboard 0.4RC or later
            writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                                np.random.rand(100), n_iter)
    dataset = datasets.MNIST('mnist', train=False, download=True)
    images = dataset.test_data[:100].float()
    label = dataset.test_labels[:100]
    features = images.view(100, 784)
    writer.add_embedding(features, metadata=label,
                         label_img=images.unsqueeze(1))
    # export scalar data to JSON for external processing
    writer.export_scalars_to_json("./all_scalars.json")
    writer.close()
def test_dataset_loader():
    """Smoke-test FilesAudioDataset through a distributed DataLoader.

    Builds a tiny ("teeny") hparams set, loads one batch, and logs each
    utterance to tensorboard before and after the audio pre/post-processing
    round trip, then stops after the first batch.

    NOTE(review): relies on module-level `t` (torch alias) and `dist`; the
    barrier placement (before iteration and after the first batch) keeps the
    distributed workers in lockstep — do not reorder.
    """
    from tqdm import tqdm
    from torch.utils.data import DataLoader
    from torch.utils.data.distributed import DistributedSampler
    from jukebox.utils.audio_utils import audio_preprocess, audio_postprocess
    from jukebox.hparams import setup_hparams
    from jukebox.data.files_dataset import FilesAudioDataset
    hps = setup_hparams("teeny", {})
    hps.sr = 22050  # 44100
    hps.hop_length = 512
    hps.labels = False
    hps.channels = 2
    hps.aug_shift = False
    hps.bs = 2
    hps.nworkers = 2  # Getting 20 it/s with 2 workers, 10 it/s with 1 worker
    print(hps)
    dataset = hps.dataset
    root = hps.root
    from tensorboardX import SummaryWriter
    # Map sample rate to a short tag used in the log-directory path.
    sr = {22050: '22k', 44100: '44k', 48000: '48k'}[hps.sr]
    writer = SummaryWriter(f'{root}/{dataset}/logs/{sr}/logs')
    dataset = FilesAudioDataset(hps)
    print("Length of dataset", len(dataset))

    # Torch Loader
    collate_fn = lambda batch: t.stack([t.from_numpy(b) for b in batch], 0)
    sampler = DistributedSampler(dataset)
    train_loader = DataLoader(dataset, batch_size=hps.bs,
                              num_workers=hps.nworkers, pin_memory=False,
                              sampler=sampler, drop_last=True,
                              collate_fn=collate_fn)

    dist.barrier()
    sampler.set_epoch(0)
    for i, x in enumerate(tqdm(train_loader)):
        x = x.to('cuda', non_blocking=True)
        # Raw batch as loaded from disk.
        for j, aud in enumerate(x):
            writer.add_audio('in_' + str(i * hps.bs + j), aud, 1, hps.sr)
        print("Wrote in")
        # Round-trip through the codec-style pre/post processing.
        x = audio_preprocess(x, hps)
        x = audio_postprocess(x, hps)
        for j, aud in enumerate(x):
            writer.add_audio('out_' + str(i * hps.bs + j), aud, 1, hps.sr)
        print("Wrote out")
        dist.barrier()
        break
class Logger(object):
    """Minimal tensorboard facade forwarding to a SummaryWriter.

    Each ``*_summary`` method is a thin, behavior-preserving delegate to the
    corresponding ``add_*`` call on the underlying writer.
    """

    def __init__(self, logdir='./log'):
        # The writer owns the event-file lifecycle.
        self.writer = SummaryWriter(logdir)

    def scalar_summary(self, tag, value, step):
        """Log a single scalar `value` under `tag` at `step`."""
        self.writer.add_scalar(tag, value, step)

    def scalars_summary(self, tag, dictionary, step):
        """Log a group of named scalars under a common `tag`."""
        self.writer.add_scalars(tag, dictionary, step)

    def text_summary(self, tag, value, step):
        """Log a text snippet under `tag` at `step`."""
        self.writer.add_text(tag, value, step)

    def audio_summary(self, tag, value, step, sr):
        """Log an audio clip under `tag` at `step` with sample rate `sr`."""
        self.writer.add_audio(tag, value, step, sample_rate=sr)
class MetricCounter:
    """Accumulates Tacotron-style losses and mirrors their means to tensorboard.

    Tracks three running lists (linear_loss, mel_loss, total_loss) and the
    best mean total loss seen so far.
    """

    def __init__(self, exp_name):
        self.writer = SummaryWriter(logdir=exp_name)
        self.metrics = defaultdict(list)
        self.best_metric = float('inf')

    def clear(self):
        """Drop every accumulated metric value."""
        self.metrics = defaultdict(list)

    def add_losses(self, linear, mel, total):
        """Record one (linear, mel, total) loss observation."""
        self.metrics["linear_loss"].append(linear)
        self.metrics["mel_loss"].append(mel)
        self.metrics["total_loss"].append(total)

    def loss_message(self):
        """Return 'name=mean; ...' for the three losses, 5 decimal places."""
        parts = []
        for key in ("linear_loss", "mel_loss", "total_loss"):
            parts.append('{}={:.5f}'.format(key, np.mean(self.metrics[key])))
        return '; '.join(parts)

    def write_to_tensorboard(self, epoch_num, validation=False, epoch=False):
        """Log each loss mean under a Train/Validation + Epoch/Iter tag."""
        prefix = ('Validation' if validation else 'Train') + \
                 ('Epoch' if epoch else 'Iter')
        for key in ("linear_loss", "mel_loss", "total_loss"):
            self.writer.add_scalar(tag=(prefix + '_' + key),
                                   scalar_value=np.mean(self.metrics[key]),
                                   global_step=epoch_num)

    def write_audio_to_tensorboard(self, exp_name, outputs, step_num,
                                   sample_rate, validation=False):
        """Log one audio clip (`validation` is accepted but unused)."""
        self.writer.add_audio(exp_name, outputs, step_num,
                              sample_rate=sample_rate)

    def update_best_model(self):
        """Return True (and remember the new best) iff mean total_loss improved."""
        candidate = np.mean(self.metrics['total_loss'])
        if candidate >= self.best_metric:
            return False
        self.best_metric = candidate
        return True
class Logger(object):
    """Tensorboard logger that can also run arbitrary jobs on a thread pool.

    Background jobs are submitted via :meth:`add_async`; their callbacks fire
    when :meth:`process_async` observes the corresponding future as done.
    Usable as a context manager (closes writer and pool on exit).
    """

    def __init__(self, run_dir, **kwargs):
        self.writer = SummaryWriter(run_dir, **kwargs)
        self.async_executor = ThreadPoolExecutor(max_workers=4)
        # future -> callback, consumed by process_async.
        self.futures = dict()

    def add_scalar(self, name, scalar, global_step):
        """Forward a scalar to the underlying writer."""
        self.writer.add_scalar(name, scalar, global_step)

    def add_audio(self, name, audio, global_step, sr=22050):
        """Forward an audio clip to the underlying writer."""
        self.writer.add_audio(name, audio, global_step, sample_rate=sr)

    def add_image(self, name, image, global_step):
        """Forward an image to the underlying writer."""
        self.writer.add_image(name, image, global_step)

    def add_async(self, fn, cb, *args, **kwargs):
        """Run ``fn(*args, **kwargs)`` on the pool; ``cb(result)`` fires later."""
        pending = self.async_executor.submit(fn, *args, **kwargs)
        self.futures[pending] = cb

    def process_async(self):
        """Invoke callbacks for every finished job; timed-out jobs stay queued."""
        finished = [fut for fut in self.futures if fut.done()]
        for fut in finished:
            callback = self.futures[fut]
            try:
                outcome = fut.result()
            except TimeoutError:
                print('TimeoutError, no need to be too upset')
            else:
                # Only successful jobs leave the queue; other exceptions
                # propagate to the caller, matching the original behavior.
                del self.futures[fut]
                callback(outcome)

    def close(self):
        """Drain the pool, flush remaining callbacks and close the writer."""
        self.async_executor.shutdown(wait=True)
        self.process_async()
        self.writer.close()

    def __enter__(self):
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        self.close()
class Logger(object):
    """Lazily-initialized tensorboard facade.

    :meth:`setup` must be called exactly once before any logging method;
    logging before setup, or calling setup twice, raises ``RuntimeError``.
    """

    def __init__(self):
        # Created on demand by setup(); None means "not yet configured".
        self.summary_writer = None

    def setup(self, *args, **kwargs):
        """Create the underlying SummaryWriter (callable only once)."""
        if self.summary_writer is not None:
            raise RuntimeError("set_up can only be called once")
        self.summary_writer = SummaryWriter(*args, **kwargs)

    def _writer(self):
        # Shared guard: verify setup happened, then expose the writer.
        self.ensure_ready()
        return self.summary_writer

    def add_audio(self, *args, **kwargs):
        """Forward to SummaryWriter.add_audio; requires prior setup()."""
        self._writer().add_audio(*args, **kwargs)

    def add_embedding(self, *args, **kwargs):
        """Forward to SummaryWriter.add_embedding; requires prior setup()."""
        self._writer().add_embedding(*args, **kwargs)

    def ensure_ready(self):
        """Raise RuntimeError unless setup() has been called."""
        if self.summary_writer is None:
            raise RuntimeError("set_up has not been run")
class Logger(object):
    """Typed tensorboard logger: one `log` entry point for scalar/audio/image.

    `LOG_TYPE` (defined elsewhere in this module) enumerates the accepted
    values of ``log_type``.
    """

    def __init__(self, log_dir):
        self.log_dir = log_dir
        self.writer = SummaryWriter(log_dir)

    def log(self, n_iter, report, log_type="scalar", sr=None, text=False):
        """Write every (tag, value) pair in ``report`` at step ``n_iter``.

        Args:
            n_iter: global step.
            report: dict mapping tag -> value.
            log_type: one of LOG_TYPE ("scalar", "audio", "image").
            sr: sample rate; required when log_type == "audio".
            text: when True and log_type == "scalar", also print to stdout.

        Raises:
            ValueError: for an unknown log_type, or audio without a sample
                rate.
        """
        if log_type not in LOG_TYPE:
            # BUG FIX: `raise ("...")` raised a plain string, which is itself
            # a TypeError at runtime; raise a real exception instead.
            raise ValueError("Wrong data type for logger.")
        if log_type == "scalar":
            if text:
                self._print_scalars(n_iter, report)
            for k, v in report.items():
                self.writer.add_scalar("scalar/{}".format(k), v, n_iter)
        elif log_type == "audio":
            if sr is None:
                # BUG FIX: same string-raise defect as above.
                raise ValueError("Sample rate is required for saving audio data.")
            for k, v in report.items():
                self.writer.add_audio(k, v, n_iter, sample_rate=sr)
        elif log_type == "image":
            for k, v in report.items():
                self.writer.add_image(k, v, n_iter)

    def _print_scalars(self, n_iter, report):
        # Console mirror of a scalar report, 4-decimal precision.
        print("---------------------------")
        print("n_iter : {}".format(n_iter))
        for k, v in report.items():
            print("{} : {:.4f}".format(k, v))
        print("---------------------------")

    def write(self):
        """Dump every scalar logged so far to <log_dir>/log.json."""
        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
        self.writer.export_scalars_to_json(
            os.path.join(self.log_dir, "log.json"))

    def flush(self):
        # NOTE(review): reaches into tensorboardX internals (file_writer);
        # older SummaryWriter versions expose no public flush().
        self.writer.file_writer.flush()

    def close(self):
        self.writer.close()
# NOTE(review): fragment of a tensorboardX demo loop — `writer`, `n_iter`,
# `sample_rate`, `freqs`, `vutils`, `vgg16` and `datasets` come from
# enclosing code not visible here; indentation reconstructed.
writer.add_scalars(
    'data/scalar_group', {
        "xsinx": n_iter * np.sin(n_iter),
        "xcosx": n_iter * np.cos(n_iter),
        "arctanx": np.arctan(n_iter)
    }, n_iter)
x = torch.rand(32, 3, 64, 64)  # output from network
if n_iter % 10 == 0:
    x = vutils.make_grid(x, normalize=True, scale_each=True)
    writer.add_image('Image', x, n_iter)
# Reuse `x` as a 2-second mono audio buffer filled with a cosine tone.
x = torch.zeros(sample_rate * 2)
for i in range(x.size(0)):
    x[i] = np.cos(
        freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate))
    # sound amplitude should in [-1, 1]
writer.add_audio('myAudio', x, n_iter, sample_rate=sample_rate)
writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
# Histogram of every parameter tensor in the (externally defined) vgg16.
for name, param in vgg16.named_parameters():
    writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)
writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                    np.random.rand(100), n_iter)
# needs tensorboard 0.4RC or later
dataset = datasets.MNIST('mnist', train=False, download=True)
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]
features = images.view(100, 784)
writer.add_embedding(features, metadata=label,
                     label_img=images.unsqueeze(1))
# export scalar data to JSON for external processing
writer.export_scalars_to_json("./all_scalars.json")
def train_and_evaluate(dataset, hparams, logdir, checkpoint=None):
    """Full training driver: fit for hparams.num_epochs, checkpoint, then
    generate one audio sample with the best model and log it to tensorboard.

    Args:
        dataset: dataset identifier handed to util.fetch_dataloader.
        hparams: hyperparameter object (num_epochs, batch_size, resume,
            sample_size, sample_rate, ...).
        logdir: directory for tensorboard events and checkpoints.
        checkpoint: optional checkpoint path to load; with hparams.resume
            the optimizer state and counters are restored too.
    """
    log.info("Fetch model...")
    model = util.fetch_model(hparams)
    log.info("Fetch dataloader...")
    dataloader = util.fetch_dataloader(dataset, model, hparams)
    log.info("Fetch optimizer...")
    optimizer = util.fetch_optimizer(model, hparams)
    global_step = 0
    best_metric = 0.0
    best_model = model
    writer = SummaryWriter(logdir)

    # load model or resume from checkpoint if possible
    if checkpoint:
        state = util.load_checkpoint(checkpoint)
        if hparams.resume:
            # Full resume: restore counters and optimizer state as well.
            log.info(
                'Resuming training from checkpoint: {}'.format(checkpoint))
            best_metric = state['best_metric']
            global_step = state['global_step']
            optimizer.load_state_dict(state['optim_dict'])
        log.info('Loading model from checkpoint: {}'.format(checkpoint))
        model.load_state_dict(state['state_dict'])

    log.info("Start training...")
    run_tic = time.time()
    for epoch in range(hparams.num_epochs):
        log.info("Epoch {}/{}".format(epoch + 1, hparams.num_epochs))
        global_step = train(dataloader['train'], model, optimizer,
                            global_step, hparams, writer)
        metric = val(dataloader['val'], model, global_step, hparams, writer)
        is_best = False
        # Higher metric is better here (>= also refreshes on ties).
        if metric >= best_metric:
            log.info('Found new best! Metric: {}'.format(metric))
            is_best = True
            best_metric = metric
            best_model = model
        # Save weights
        log.info('Saving checkpoint at global step {}'.format(global_step))
        util.save_checkpoint(
            {
                'global_step': global_step,
                'best_metric': best_metric,
                'metric': metric,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=logdir)
    run_tac = time.time()

    log.info("Generating a sample sound with the best model...")
    gen_tic = time.time()
    audio = best_model.generate(hparams.sample_size)
    gen_tac = time.time()

    # write audio to the tensorboard
    log.info("{} epochs with batchsize:{}, total time passed:{} ".format(
        hparams.num_epochs, hparams.batch_size, run_tac - run_tic))
    log.info("Sample size: {}, generation time: {}".format(
        hparams.sample_size, gen_tac - gen_tic))
    writer.add_audio('final/wav', audio, global_step, hparams.sample_rate)
# NOTE(review): torn fragment of a tensorboardX demo — the opening of the
# add_scalars dict and the argument list of the final export call are cut
# off at the chunk boundaries; indentation reconstructed.
        "xcosx": n_iter * np.cos(n_iter),
        "arctanx": np.arctan(n_iter)
    }, n_iter)
x = torch.rand(32, 3, 64, 64)  # output from network
if n_iter % 10 == 0:
    x = vutils.make_grid(x, normalize=True, scale_each=True)
    writer.add_image('Image', x, n_iter)  # Tensor
    # writer.add_image('astronaut', skimage.data.astronaut(), n_iter) # numpy
    # writer.add_image('imread',
    #     skimage.io.imread('screenshots/audio.png'), n_iter) # numpy
# Reuse `x` as a 2-second mono audio buffer filled with a cosine tone.
x = torch.zeros(sample_rate * 2)
for i in range(x.size(0)):
    # sound amplitude should in [-1, 1]
    x[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) /
                  float(sample_rate))
# No sample_rate argument: relies on add_audio's default rate.
writer.add_audio('myAudio', x, n_iter)
writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
writer.add_text('markdown Text', '''a|b\n-|-\nc|d''', n_iter)
# Histograms for non-batchnorm parameters only.
for name, param in resnet18.named_parameters():
    if 'bn' not in name:
        writer.add_histogram(name, param, n_iter)
writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                    np.random.rand(100),
                    n_iter)  # needs tensorboard 0.4RC or later
writer.add_pr_curve_raw('prcurve with raw data', true_positive_counts,
                        false_positive_counts, true_negative_counts,
                        false_negative_counts, precision, recall, n_iter)
# export scalar data to JSON for external processing
# NOTE(review): this call is truncated in the source; its argument list
# continues beyond this chunk. (Original trailing comment, translated:
# "run under the PyTorch_Tutorial/Code directory".)
writer.export_scalars_to_json(
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, waveglow_config, dist_config,
          data_config, train_config, **kwargs):
    """Train a WaveGlow vocoder, with optional DDP, apex fp16 and tensorboard.

    Checkpoints every `iters_per_checkpoint` iterations (and once at the
    starting iteration); at each checkpoint a sample is vocoded from the
    current batch's first mel and logged next to the ground truth.

    Args:
        num_gpus / rank / group_name: distributed setup; >1 GPU enables DDP.
        output_directory: checkpoint directory (rank 0 creates it).
        epochs, learning_rate, sigma, batch_size, seed: usual hyperparams.
        iters_per_checkpoint: checkpoint/logging period in iterations.
        fp16_run: enable apex amp O1 mixed precision.
        checkpoint_path: resume path; "" starts from scratch.
        with_tensorboard: enable SummaryWriter logging (rank 0 only).
        waveglow_config / dist_config / data_config / train_config: config
            dicts for model, distributed init, dataset and dataloader.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    # =====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    # =====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    # =====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1
    # Remember where we resumed so a checkpoint is written immediately.
    iteration_start = iteration

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(
        trainset,
        num_workers=train_config.get('dataloader_num_workers', 8),
        shuffle=train_config.get('dataloader_shuffle', True),
        sampler=train_sampler,
        batch_size=batch_size,
        pin_memory=train_config.get('dataloader_pin_memory', False),
        drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(
            os.path.join(os.path.dirname(output_directory), 'tensorboard'),
            filename_suffix='.tensorboard')
        # Record the absolute path of every training file next to the run.
        # NOTE(review): indentation reconstructed — this block appears to run
        # only on rank 0 when tensorboard is enabled; confirm intent.
        with open(
                Path(output_directory).parent.joinpath('metadata',
                                                       'train.txt'),
                'wt',
                encoding='utf8') as fout:
            for line in trainset.audio_files:
                fpath = os.path.abspath(line)
                fout.write(f'{fpath}\n')

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(
                tqdm(train_loader, desc=f"Epoch-{epoch}", ncols=100)):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            # print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            # Checkpoint periodically, and once right after (re)starting.
            if (iteration % iters_per_checkpoint == 0) or (
                    iteration == iteration_start):
                if rank == 0:
                    checkpoint_path = "{}/waveglow-{:06d}.pt".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate,
                                    iteration, checkpoint_path,
                                    waveglow_config=waveglow_config)
                    info_path = os.path.join(output_directory, 'info.yml')
                    checkpoint_info = {
                        'name': os.path.basename(checkpoint_path),
                        'iteration': iteration,
                        'loss': reduced_loss
                    }
                    # Prune old checkpoints, keeping the most recent 5.
                    keep_n_checkpoints(info_path, checkpoint_info, 5)

                    if with_tensorboard:
                        # outputs[0].shape: torch.Size([1, 8, 1000])
                        with torch.no_grad():
                            d = model.infer(mel.data[0].unsqueeze(0),
                                            sigma=sigma)
                            d = d.cpu().squeeze()
                            # Min-max rescale the sample into (-0.99, 0.99).
                            pred_audio = (d - d.min()) * 1.98 / (
                                d.max() - d.min()) - 0.99
                        logger.add_audio(
                            "generated/iteration-{}.wav".format(iteration),
                            pred_audio,
                            iteration,
                            sample_rate=trainset.sampling_rate,
                        )
                        true_audio = audio.data[0].squeeze()
                        logger.add_audio(
                            "original/iteration-{}.wav".format(iteration),
                            true_audio,
                            iteration,
                            sample_rate=trainset.sampling_rate,
                        )
                        # Inspect the spectrograms to visually gauge the
                        # quality of the generated speech.
                        mel_output = trainset.get_mel(pred_audio.cpu())
                        logger.add_image(
                            "generated/iteration-{}.png".format(iteration),
                            plot_spectrogram_to_numpy(
                                mel_output.data.cpu().numpy()),
                            iteration,
                            dataformats='HWC')
                        mel_input = mel.data[0]
                        logger.add_image(
                            "original/iteration-{}.png".format(iteration),
                            plot_spectrogram_to_numpy(
                                mel_input.data.cpu().numpy()),
                            iteration,
                            dataformats='HWC')
            iteration += 1
class Trainer:
    """ Generalized training helper class.

    This class focuses remove repetitive sources in general training pipeline.
    And almost things has similar patterns to train some models, but, in major,
    forwarding process is mainly different in most cases.
    So, if engineer extends this class as their own cases, he/she just override
    forward function.

    Args:
        model: a main model to be saved and to be forwarded
        optimizer: optimizer module
        train_dataset: dataset on train phase
        valid_dataset: dataset on validation phase
        max_step: maximum iteration step
        valid_max_step: maximum iteration steps on each validation time.
        save_interval: save and validate interval (in iteration)
        log_interval: log interval (in iteration)
        save_dir: base directory to save checkpoints and logs
        save_prefix: a prefix to categorize each experiment
        grad_clip: scalars to clamp gradients
        grad_norm: maximum norm of gradients to be clipped
        pretrained_path: specific file path of checkpoint
        sr: sampling rate
        scheduler: learning rate scheduler

    Examples::

        class MyTrainer(Trainer):

            def forward(self, input: torch.tensor, target: torch.tensor,
                        is_logging: bool):
                # forward model
                out = self.model(input)

                # calc your own loss
                loss = calc_loss(out, target)

                # build meta for logging
                meta = {
                    'loss': (loss.item(), LogType.SCALAR),
                    'out': (out[0], LogType.PLOT)
                }
                return loss, meta
    """

    def __init__(self,
                 model: nn.Module,
                 optimizer: torch.optim.Optimizer,
                 train_dataset,
                 valid_dataset,
                 max_step: int,
                 valid_max_step: int,
                 save_interval: int,
                 log_interval: int,
                 save_dir: str,
                 save_prefix: str = 'save',
                 grad_clip: float = 0.0,
                 grad_norm: float = 0.0,
                 pretrained_path: str = None,
                 sr: int = None,
                 scheduler: torch.optim.lr_scheduler._LRScheduler = None):
        # save project info
        self.pretrained_trained = pretrained_path
        # model
        self.model = model
        self.optimizer = optimizer
        self.scheduler = scheduler
        # log how many parameters in the model
        n_params = sum(p.numel() for p in self.model.parameters()
                       if p.requires_grad)
        log('Model {} was loaded. Total {} params.'.format(
            self.model.__class__.__name__, n_params))
        # adopt repeating function on datasets (turns them into endless
        # iterators consumed with next() in train/validate)
        self.train_dataset = self.repeat(train_dataset)
        self.valid_dataset = self.repeat(valid_dataset)
        # save parameters
        self.step = 0
        if sr:
            self.sr = sr
        else:
            self.sr = SAMPLE_RATE
        self.max_step = max_step
        self.save_interval = save_interval
        self.log_interval = log_interval
        self.save_dir = save_dir
        self.save_prefix = save_prefix
        self.grad_clip = grad_clip
        self.grad_norm = grad_norm
        self.valid_max_step = valid_max_step
        # make dirs
        self.log_dir = os.path.join(save_dir, 'logs', self.save_prefix)
        self.model_dir = os.path.join(save_dir, 'models')
        os.makedirs(self.log_dir, exist_ok=True)
        os.makedirs(self.model_dir, exist_ok=True)
        self.writer = SummaryWriter(log_dir=self.log_dir, flush_secs=10)
        # load previous checkpoint
        # set seed (load() may restore a seed from the checkpoint)
        self.seed = None
        self.load()
        if not self.seed:
            self.seed = np.random.randint(np.iinfo(np.int32).max)
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed(self.seed)
        # load pretrained model (only when starting fresh)
        if self.step == 0 and pretrained_path:
            self.load_pretrained_model()
        # valid loss
        self.best_valid_loss = np.finfo(np.float32).max
        self.cur_best_valid_loss = self.best_valid_loss
        self.save_valid_loss = np.finfo(np.float32).max

    @abc.abstractmethod
    def forward(self, *inputs,
                is_logging: bool = False) -> Tuple[torch.Tensor, Dict]:
        """
        :param inputs: Loaded Data Points from Speech Loader
        :param is_logging: log or not
        :return: Loss Tensor, Log Dictionary
        """
        raise NotImplemented

    def run(self) -> float:
        """Main loop: train every step, validate/save every save_interval.

        Returns the best validation loss observed (KeyboardInterrupt stops
        training gracefully and still returns it).
        """
        try:
            # training loop
            for i in range(self.step + 1, self.max_step + 1):
                # update step
                self.step = i
                # logging
                if i % self.save_interval == 1:
                    log('------------- TRAIN step : %d -------------' % i)
                # do training step
                if self.scheduler is not None:
                    self.scheduler.step(i)
                self.model.train()
                self.train(i)
                # save model
                if i % self.save_interval == 0:
                    log('------------- VALID step : %d -------------' % i)
                    # valid
                    self.model.eval()
                    self.validate(i)
                    # save model checkpoint file
                    self.save(i)
        except KeyboardInterrupt:
            log('Train is canceled !!')
        return self.best_valid_loss

    def clip_grad(self):
        # Elementwise clamp first (grad_clip), then global norm (grad_norm);
        # either is skipped when its threshold is 0/falsy.
        if self.grad_clip:
            for p in self.model.parameters():
                if p.grad is not None:
                    p.grad = p.grad.clamp(-self.grad_clip, self.grad_clip)
        if self.grad_norm:
            torch.nn.utils.clip_grad_norm_(
                [p for p in self.model.parameters() if p.requires_grad],
                self.grad_norm)

    def train(self, step: int) -> torch.Tensor:
        """One optimization step on the next training batch."""
        # update model
        self.optimizer.zero_grad()
        # flag for logging
        log_flag = step % self.log_interval == 0
        # forward model
        loss, meta = self.forward(*to_device(next(self.train_dataset)),
                                  log_flag)
        # check loss nan (x != x is only true for NaN); skip the update
        if loss != loss:
            log('{} cur step NAN is occured'.format(step))
            return
        loss.backward()
        self.clip_grad()
        self.optimizer.step()
        # logging
        if log_flag:
            # console logging
            self.console_log('train', meta, step)
            # tensorboard logging
            self.tensorboard_log('train', meta, step)

    def validate(self, step: int):
        """Run valid_max_step batches without gradients and log mean stats."""
        loss = 0.
        stat = defaultdict(float)
        for i in range(self.valid_max_step):
            # forward model
            with torch.no_grad():
                batch_loss, meta = self.forward(
                    *to_device(next(self.valid_dataset)), True)
                loss += batch_loss
            # update stat
            for key, (value, log_type) in meta.items():
                if log_type == LogType.SCALAR:
                    stat[key] += value
            # console logging of this step
            if (i + 1) % self.log_interval == 0:
                self.console_log('valid', meta, i + 1)
        # tensorboard logs only the non-scalar meta of the last batch;
        # scalar means are logged below.
        meta_non_scalar = {
            key: (value, log_type)
            for key, (value, log_type) in meta.items()
            if not log_type == LogType.SCALAR
        }
        self.tensorboard_log('valid', meta_non_scalar, step)
        # averaging stat
        loss /= self.valid_max_step
        for key in stat.keys():
            stat[key] = stat[key] / self.valid_max_step
        # update best valid loss
        if loss < self.best_valid_loss:
            self.best_valid_loss = loss
        # console logging of total stat
        msg = 'step {} / total stat'.format(step)
        for key, value in sorted(stat.items()):
            msg += '\t{}: {:.6f}'.format(key, value)
        log(msg)
        # tensor board logging of scalar stat
        for key, value in stat.items():
            self.writer.add_scalar('valid/{}'.format(key),
                                   value,
                                   global_step=step)

    @property
    def save_name(self):
        # Checkpoints are grouped by experiment prefix and model class name;
        # unwrap DataParallel so the name matches the underlying module.
        if isinstance(self.model, nn.parallel.DataParallel):
            module = self.model.module
        else:
            module = self.model
        return self.save_prefix + '/' + module.__class__.__name__

    def load(self, load_optim: bool = True):
        """Restore the newest checkpoint (by ctime) if any exists."""
        # make name
        save_name = self.save_name
        # save path
        save_path = os.path.join(self.model_dir, save_name)
        # get latest file
        check_files = glob.glob(os.path.join(save_path, '*'))
        if check_files:
            # load latest state dict
            latest_file = max(check_files, key=os.path.getctime)
            state_dict = torch.load(latest_file)
            if 'seed' in state_dict:
                self.seed = state_dict['seed']
            # load model
            if isinstance(self.model, nn.DataParallel):
                self.model.module.load_state_dict(
                    get_loadable_checkpoint(state_dict['model']))
            else:
                self.model.load_state_dict(
                    get_loadable_checkpoint(state_dict['model']))
            if load_optim:
                self.optimizer.load_state_dict(state_dict['optim'])
            if self.scheduler is not None:
                self.scheduler.load_state_dict(state_dict['scheduler'])
            self.step = state_dict['step']
            log('checkpoint \'{}\' is loaded. previous step={}'.format(
                latest_file, self.step))
        else:
            log('No any checkpoint in {}. Loading network skipped.'.format(
                save_path))

    def save(self, step: int):
        """Write a step checkpoint, plus a '.best' copy when loss improved."""
        # state dict
        state_dict = get_loadable_checkpoint(self.model.state_dict())
        # train
        state_dict = {
            'step': step,
            'model': state_dict,
            'optim': self.optimizer.state_dict(),
            'pretrained_step': step,
            'seed': self.seed
        }
        if self.scheduler is not None:
            state_dict.update({'scheduler': self.scheduler.state_dict()})
        # save for training
        save_name = self.save_name
        save_path = os.path.join(self.model_dir, save_name)
        os.makedirs(save_path, exist_ok=True)
        torch.save(state_dict,
                   os.path.join(save_path, 'step_{:06d}.chkpt'.format(step)))
        # save best (cur_best_valid_loss lags best_valid_loss until saved)
        if self.best_valid_loss != self.cur_best_valid_loss:
            save_path = os.path.join(self.model_dir, save_name + '.best.chkpt')
            torch.save(state_dict, save_path)
            self.cur_best_valid_loss = self.best_valid_loss
        # logging
        log('step %d / saved model.' % step)

    def load_pretrained_model(self):
        """Load weights (only) from self.pretrained_trained."""
        assert os.path.exists(
            self.pretrained_trained), 'You must define pretrained path!'
        self.model.load_state_dict(
            get_loadable_checkpoint(
                torch.load(self.pretrained_trained)['model']))

    def console_log(self, tag: str, meta: Dict[str, Any], step: int):
        # console logging (scalars only)
        msg = '{}\t{:06d} it'.format(tag, step)
        for key, (value, log_type) in sorted(meta.items()):
            if log_type == LogType.SCALAR:
                msg += '\t{}: {:.6f}'.format(key, value)
        log(msg)

    def tensorboard_log(self, tag: str, meta: Dict[str, Any], step: int):
        # Dispatch each meta entry to the matching SummaryWriter call.
        for key, (value, log_type) in meta.items():
            if log_type == LogType.IMAGE:
                self.writer.add_image('{}/{}'.format(tag, key),
                                      imshow_to_buf(to_numpy(value)),
                                      global_step=step)
            elif log_type == LogType.AUDIO:
                self.writer.add_audio('{}/{}'.format(tag, key),
                                      to_numpy(value),
                                      global_step=step,
                                      sample_rate=self.sr)
            elif log_type == LogType.SCALAR:
                self.writer.add_scalar('{}/{}'.format(tag, key),
                                       value,
                                       global_step=step)
            elif log_type == LogType.PLOT:
                self.writer.add_image('{}/{}'.format(tag, key),
                                      plot_to_buf(to_numpy(value)),
                                      global_step=step)

    @staticmethod
    def repeat(iterable):
        # Endless cycle over the dataset (restarts it when exhausted).
        while True:
            for x in iterable:
                yield x
# NOTE(review): fragment of a tensorboardX demo loop — `writer`, `n_iter`,
# `dummy_s2`, `sample_rate`, `freqs`, `vutils`, `resnet18` and `datasets`
# come from enclosing code not visible here; indentation reconstructed.
writer.add_scalar('data/scalar2', dummy_s2[0], n_iter)
writer.add_scalars('data/scalar_group',
                   {'xsinx': n_iter * np.sin(n_iter),
                    'xcosx': n_iter * np.cos(n_iter),
                    'arctanx': np.arctan(n_iter)}, n_iter)
dummy_img = torch.rand(32, 3, 64, 64)  # output from network
if n_iter % 10 == 0:
    x = vutils.make_grid(dummy_img, normalize=True, scale_each=True)
    writer.add_image('Image', x, n_iter)
    # 2-second cosine tone; note the loop bound is the grid's first dim.
    dummy_audio = torch.zeros(sample_rate * 2)
    for i in range(x.size(0)):
        # amplitude of sound should in [-1, 1]
        dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) /
                                float(sample_rate))
    writer.add_audio('myAudio', dummy_audio, n_iter,
                     sample_rate=sample_rate)
    writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter)
    for name, param in resnet18.named_parameters():
        writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter)
    # needs tensorboard 0.4RC or later
    writer.add_pr_curve('xoxo', np.random.randint(2, size=100),
                        np.random.rand(100), n_iter)
dataset = datasets.MNIST('mnist', train=False, download=True)
images = dataset.test_data[:100].float()
label = dataset.test_labels[:100]
features = images.view(100, 784)
writer.add_embedding(features, metadata=label,
                     label_img=images.unsqueeze(1))
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard, num_workers=2):
    """Train a WaveGlow vocoder, with optional multi-GPU (NCCL-style) and apex fp16 support.

    Args:
        num_gpus: number of GPUs; >1 enables the distributed code paths.
        rank: this process's rank; only rank 0 writes checkpoints/tensorboard.
        group_name: distributed process-group name.
        output_directory: where checkpoints and tensorboard logs are written.
        epochs, learning_rate, sigma, iters_per_checkpoint, batch_size, seed,
        fp16_run, checkpoint_path, with_tensorboard: usual hyperparameters;
        an empty checkpoint_path means "start from scratch".
        num_workers: DataLoader worker count.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======
    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1
    # HACK: setup separate training and eval sets
    # NOTE: data_config is mutated in place; order of the lines below matters.
    training_files = data_config['training_files']
    eval_files = data_config['eval_files']
    del data_config['training_files']
    del data_config['eval_files']
    data_config['audio_files'] = training_files
    trainset = Mel2Samp(**data_config)
    data_config['audio_files'] = eval_files
    evalset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    eval_sampler = DistributedSampler(evalset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    print("Creating dataloaders with " + str(num_workers) + " workers")
    # NOTE(review): with num_gpus > 1 these pass both shuffle=True and a
    # non-None sampler; torch's DataLoader raises ValueError for that
    # combination — confirm against the torch version in use.
    train_loader = DataLoader(trainset,
                              num_workers=num_workers,
                              shuffle=True,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)
    eval_loader = DataLoader(evalset,
                             num_workers=num_workers,
                             shuffle=True,
                             sampler=eval_sampler,
                             batch_size=batch_size,
                             pin_memory=False,
                             drop_last=True)
    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)
    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger_train = SummaryWriter(
            os.path.join(output_directory, 'logs', 'train'))
        logger_eval = SummaryWriter(
            os.path.join(output_directory, 'logs', 'eval'))
    # Resume at the epoch the loaded iteration falls in.
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        model.train()
        with tqdm(total=len(train_loader)) as train_pbar:
            for i, batch in enumerate(train_loader):
                model.zero_grad()
                mel, audio = batch
                mel = torch.autograd.Variable(mel.cuda())
                audio = torch.autograd.Variable(audio.cuda())
                outputs = model((mel, audio))
                loss = criterion(outputs)
                if num_gpus > 1:
                    # average the loss across workers for logging
                    reduced_loss = reduce_tensor(loss.data, num_gpus).item()
                else:
                    reduced_loss = loss.item()
                if fp16_run:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                optimizer.step()
                train_pbar.set_description(
                    "Epoch {} Iter {} Loss {:.3f}".format(
                        epoch, iteration, reduced_loss))
                if with_tensorboard and rank == 0 and iteration % 10 == 0:
                    logger_train.add_scalar('loss', reduced_loss,
                                            i + len(train_loader) * epoch)
                    # adding logging for GPU utilization and memory usage
                    gpu_memory_used, gpu_utilization = get_gpu_stats()
                    k = 'gpu' + str(0)
                    logger_train.add_scalar(k + '/memory', gpu_memory_used,
                                            iteration)
                    logger_train.add_scalar(k + '/load', gpu_utilization,
                                            iteration)
                    logger_train.flush()
                if (iteration % iters_per_checkpoint == 0):
                    if rank == 0:
                        # NOTE(review): this rebinds the checkpoint_path
                        # parameter; the original resume path is lost after
                        # the first save.
                        checkpoint_path = "{}/waveglow_{}".format(
                            output_directory, iteration)
                        save_checkpoint(model, optimizer, learning_rate,
                                        iteration, checkpoint_path)
                iteration += 1
                train_pbar.update(1)
        # Eval
        model.eval()
        torch.cuda.empty_cache()
        with torch.no_grad():
            tensorboard_mel, tensorboard_audio = None, None
            loss_accum = []
            with tqdm(total=len(eval_loader)) as eval_pbar:
                for i, batch in enumerate(eval_loader):
                    model.zero_grad()
                    mel, audio = batch
                    mel = torch.autograd.Variable(mel.cuda())
                    audio = torch.autograd.Variable(audio.cuda())
                    outputs = model((mel, audio))
                    loss = criterion(outputs).item()
                    loss_accum.append(loss)
                    eval_pbar.set_description("Epoch {} Eval {:.3f}".format(
                        epoch, loss))
                    outputs = None  # free graph references early
                    # use the first batch for tensorboard audio samples
                    if i == 0:
                        tensorboard_mel = mel
                        tensorboard_audio = audio
                    eval_pbar.update(1)
            if with_tensorboard and rank == 0:
                loss_avg = statistics.mean(loss_accum)
                tqdm.write("Epoch {} Eval AVG {}".format(epoch, loss_avg))
                logger_eval.add_scalar('loss', loss_avg, iteration)
                # log audio samples to tensorboard
                # (assumes eval batch_size >= 5 — TODO confirm)
                tensorboard_audio_generated = model.infer(tensorboard_mel)
                for i in range(0, 5):
                    ta = tensorboard_audio[i].cpu().numpy()
                    tag = tensorboard_audio_generated[i].cpu().numpy()
                    logger_eval.add_audio(
                        "sample " + str(i) + "/orig",
                        ta,
                        epoch,
                        sample_rate=data_config['sampling_rate'])
                    logger_eval.add_audio(
                        "sample " + str(i) + "/gen",
                        tag,
                        epoch,
                        sample_rate=data_config['sampling_rate'])
                logger_eval.flush()
"Generative adversarial network or variational auto-encoder.", "Please call Stella.", "Some have accepted this as a miracle without any physical explanation.", ] for idx, sent in enumerate(sentences): wav, attn = eval_model( dv3, sent, replace_pronounciation_prob, min_level_db, ref_level_db, power, n_iter, win_length, hop_length, preemphasis) wav_path = os.path.join( state_dir, "waveform", "eval_sample_{:09d}.wav".format(global_step)) sf.write(wav_path, wav, sample_rate) writer.add_audio( "eval_sample_{}".format(idx), wav, global_step, sample_rate=sample_rate) attn_path = os.path.join( state_dir, "alignments", "eval_sample_attn_{:09d}.png".format(global_step)) plot_alignment(attn, attn_path) writer.add_image( "eval_sample_attn{}".format(idx), cm.viridis(attn), global_step, dataformats="HWC") # save checkpoint if global_step % save_interval == 0: io.save_parameters(ckpt_dir, global_step, dv3, optim)
def train(args, hp, hp_str, logger, vocoder):
    """Train a FeedForwardTransformer (FastSpeech-style) TTS model.

    Args:
        args: CLI namespace (uses .name, .outdir, .checkpoint_path).
        hp: hyperparameter object (hp.train / hp.model / hp.data / hp.audio).
        hp_str: serialized hyperparameters, stored in checkpoints for
            compatibility checks on resume.
        logger: logging.Logger for progress/warnings.
        vocoder: vocoder model used to render validation audio samples.

    Returns:
        None. Also returns None early if a given checkpoint path is missing.
    """
    os.makedirs(os.path.join(hp.train.chkpt_dir, args.name), exist_ok=True)
    os.makedirs(os.path.join(args.outdir, args.name), exist_ok=True)
    os.makedirs(os.path.join(args.outdir, args.name, "assets"), exist_ok=True)
    device = torch.device("cuda" if hp.train.ngpu > 0 else "cpu")
    dataloader = loader.get_tts_dataset(hp.data.data_dir, hp.train.batch_size, hp)
    validloader = loader.get_tts_dataset(hp.data.data_dir, 1, hp, True)
    idim = len(valid_symbols)      # input symbol-vocabulary size
    odim = hp.audio.num_mels       # output mel-spectrogram channels
    model = fastspeech.FeedForwardTransformer(idim, odim, hp)
    # set torch device
    model = model.to(device)
    print("Model is loaded ...")
    githash = get_commit_hash()
    if args.checkpoint_path is not None:
        if os.path.exists(args.checkpoint_path):
            logger.info("Resuming from checkpoint: %s" % args.checkpoint_path)
            checkpoint = torch.load(args.checkpoint_path)
            model.load_state_dict(checkpoint["model"])
            optimizer = get_std_opt(
                model,
                hp.model.adim,
                hp.model.transformer_warmup_steps,
                hp.model.transformer_lr,
            )
            optimizer.load_state_dict(checkpoint["optim"])
            global_step = checkpoint["step"]
            if hp_str != checkpoint["hp_str"]:
                logger.warning("New hparams is different from checkpoint. Will use new.")
            if githash != checkpoint["githash"]:
                logger.warning("Code might be different: git hash is different.")
                logger.warning("%s -> %s" % (checkpoint["githash"], githash))
        else:
            # NOTE(review): message contains a typo ("exixts"); left as-is.
            print("Checkpoint does not exixts")
            global_step = 0
            return None
    else:
        print("New Training")
        global_step = 0
        optimizer = get_std_opt(
            model,
            hp.model.adim,
            hp.model.transformer_warmup_steps,
            hp.model.transformer_lr,
        )
    print("Batch Size :", hp.train.batch_size)
    num_params(model)
    os.makedirs(os.path.join(hp.train.log_dir, args.name), exist_ok=True)
    writer = SummaryWriter(os.path.join(hp.train.log_dir, args.name))
    model.train()
    forward_count = 0  # micro-batches accumulated since the last optimizer step
    # print(model)
    for epoch in range(hp.train.epochs):
        start = time.time()
        running_loss = 0
        j = 0  # number of micro-batches seen this epoch
        pbar = tqdm.tqdm(dataloader, desc="Loading train data")
        for data in pbar:
            global_step += 1
            x, input_length, y, _, out_length, _, dur, e, p = data
            # x : [batch , num_char], input_length : [batch], y : [batch, T_in, num_mel]
            # # stop_token : [batch, T_in], out_length : [batch]
            loss, report_dict = model(
                x.cuda(),
                input_length.cuda(),
                y.cuda(),
                out_length.cuda(),
                dur.cuda(),
                e.cuda(),
                p.cuda(),
            )
            # scale loss so gradients average over accum_grad micro-batches
            loss = loss.mean() / hp.train.accum_grad
            running_loss += loss.item()
            loss.backward()
            # update parameters
            forward_count += 1
            j = j + 1
            if forward_count != hp.train.accum_grad:
                continue  # keep accumulating gradients
            forward_count = 0
            step = global_step
            # compute the gradient norm to check if it is normal or not
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       hp.train.grad_clip)
            logging.debug("grad norm={}".format(grad_norm))
            if math.isnan(grad_norm):
                logging.warning("grad norm is nan. Do not update model.")
            else:
                optimizer.step()
            optimizer.zero_grad()
            if step % hp.train.summary_interval == 0:
                pbar.set_description("Average Loss %.04f Loss %.04f | step %d"
                                     % (running_loss / j, loss.item(), step))
                for r in report_dict:
                    for k, v in r.items():
                        if k is not None and v is not None:
                            # unwrap cupy scalars to host values before logging
                            if "cupy" in str(type(v)):
                                v = v.get()
                            if "cupy" in str(type(k)):
                                k = k.get()
                            writer.add_scalar("main/{}".format(k), v, step)
            if step % hp.train.validation_step == 0:
                # NOTE(review): iterates the whole validation loader and logs
                # images/audio per item — potentially expensive; confirm intended.
                for valid in validloader:
                    x_, input_length_, y_, _, out_length_, ids_, dur_, e_, p_ = valid
                    model.eval()
                    with torch.no_grad():
                        loss_, report_dict_ = model(
                            x_.cuda(),
                            input_length_.cuda(),
                            y_.cuda(),
                            out_length_.cuda(),
                            dur_.cuda(),
                            e_.cuda(),
                            p_.cuda(),
                        )
                        mels_ = model.inference(x_[-1].cuda())  # [T, num_mel]
                    model.train()
                    for r in report_dict_:
                        for k, v in r.items():
                            if k is not None and v is not None:
                                if "cupy" in str(type(v)):
                                    v = v.get()
                                if "cupy" in str(type(k)):
                                    k = k.get()
                                writer.add_scalar("validation/{}".format(k), v,
                                                  step)
                    mels_ = mels_.T  # Out: [num_mels, T]
                    writer.add_image(
                        "melspectrogram_target_{}".format(ids_[-1]),
                        plot_spectrogram_to_numpy(
                            y_[-1].T.data.cpu().numpy()[:, :out_length_[-1]]),
                        step,
                        dataformats="HWC",
                    )
                    writer.add_image(
                        "melspectrogram_prediction_{}".format(ids_[-1]),
                        plot_spectrogram_to_numpy(mels_.data.cpu().numpy()),
                        step,
                        dataformats="HWC",
                    )
                    # print(mels.unsqueeze(0).shape)
                    audio = generate_audio(
                        mels_.unsqueeze(0), vocoder
                    )  # selecting the last data point to match mel generated above
                    audio = audio.cpu().float().numpy()
                    audio = audio / (audio.max() - audio.min()
                                     )  # get values between -1 and 1
                    writer.add_audio(
                        tag="generated_audio_{}".format(ids_[-1]),
                        snd_tensor=torch.Tensor(audio),
                        global_step=step,
                        sample_rate=hp.audio.sample_rate,
                    )
                    _, target = read_wav_np(
                        hp.data.wav_dir + f"{ids_[-1]}.wav",
                        sample_rate=hp.audio.sample_rate,
                    )
                    writer.add_audio(
                        tag=" target_audio_{}".format(ids_[-1]),
                        snd_tensor=torch.Tensor(target),
                        global_step=step,
                        sample_rate=hp.audio.sample_rate,
                    )
            ##
            if step % hp.train.save_interval == 0:
                avg_p, avg_e, avg_d = evaluate(hp, validloader, model)
                writer.add_scalar("evaluation/Pitch Loss", avg_p, step)
                writer.add_scalar("evaluation/Energy Loss", avg_e, step)
                writer.add_scalar("evaluation/Dur Loss", avg_d, step)
                save_path = os.path.join(
                    hp.train.chkpt_dir, args.name,
                    "{}_fastspeech_{}_{}k_steps.pyt".format(
                        args.name, githash, step // 1000),
                )
                torch.save(
                    {
                        "model": model.state_dict(),
                        "optim": optimizer.state_dict(),
                        "step": step,
                        "hp_str": hp_str,
                        "githash": githash,
                    },
                    save_path,
                )
                logger.info("Saved checkpoint to: %s" % save_path)
        print("Time taken for epoch {} is {} sec\n".format(
            epoch + 1, int(time.time() - start)))
writer.add_scalar('data/scalar2', dummy_s2[0], n_iter) writer.add_scalars('data/scalar_group', {'xsinx': n_iter * np.sin(n_iter), 'xcosx': n_iter * np.cos(n_iter), 'arctanx': np.arctan(n_iter)}, n_iter) dummy_img = torch.rand(32, 3, 64, 64) # output from network if n_iter % 10 == 0: x = vutils.make_grid(dummy_img, normalize=True, scale_each=True) writer.add_image('Image', x, n_iter) # 后面的覆盖前面的 dummy_audio = torch.zeros(sample_rate * 2) for i in range(x.size(0)): # amplitude of sound should in [-1, 1] dummy_audio[i] = np.cos(freqs[n_iter // 10] * np.pi * float(i) / float(sample_rate)) writer.add_audio('myAudio', dummy_audio, n_iter, sample_rate=sample_rate) # 后面的覆盖前面的 writer.add_text('Text', 'text logged at step:' + str(n_iter), n_iter) # 后面没有覆盖前面,why ? # 文本就不会覆盖??? for name, param in resnet18.named_parameters(): writer.add_histogram(name, param.clone().cpu().data.numpy(), n_iter) # 把每个iter的 参数全部保存了,因为没有重名! # 同时默认创建 distribution 视图 # needs tensorboard 0.4RC or later writer.add_pr_curve('xoxo', np.random.randint(2, size=100), np.random.rand(100), n_iter) # 好像后面也没有覆盖前面的!why ? dataset = datasets.MNIST('mnist', train=False, download=True) images = dataset.test_data[:100].float()
class PPTS_Solver(Solver):
    """Solver for the PPTS model: maps phoneme-posterior sequences (from the
    PPR stage) to linear magnitude spectrograms for one target speaker.

    mode='train' builds an optimizer, train/eval loaders and a tensorboard
    writer; mode='test' builds only the test loader. In both modes the
    checkpoint directory is derived from the speaker id and `load_ckpt()` is
    attempted at construction time.
    """

    def __init__(self, config, args, mode='train'):
        super(PPTS_Solver, self).__init__(config, args)
        self.phn_hat_dir = config['path']['ppr']['output_dir']
        self.phn_dim = config['text']['phn_dim']
        self.n_fft = config['audio']['n_fft']

        self.lr = config['model']['ppts']['lr']
        self.optimizer_type = config['model']['ppts']['type']
        self.betas = [float(x) for x in config['model']['ppts']['betas'].split(',')]
        self.weight_decay = config['model']['ppts']['weight_decay']

        self.spk_id = args.spk_id
        # BUG FIX: the original guard was `self.spk_id == ('' or None)`, which
        # evaluates to `self.spk_id == None` and never rejects an empty string.
        # `not self.spk_id` rejects both '' and None, as the message intends.
        if not self.spk_id:
            print("[Error] A spk_id must be given to init a PPTS solver")
            exit()

        self.mode = mode
        self.model, self.criterion = self.build_model()
        if mode == 'train':
            self.optimizer = self.build_optimizer()
            self.train_loader = self.get_dataset(self.train_meta_path)
            self.eval_loader = self.get_dataset(self.eval_meta_path)
            self.log_dir = os.path.join(config['path']['ppts']['log_dir'], self.spk_id)
            self.writer = SummaryWriter(self.log_dir)
        elif mode == 'test':
            self.test_loader = self.get_dataset(self.test_meta_path)
        # save_dir is needed by both save_ckpt (train) and load_ckpt (all modes),
        # so it is set unconditionally.
        self.save_dir = os.path.join(config['path']['ppts']['save_dir'], self.spk_id)
        self.ppts_output_dir = os.path.join(config['path']['ppts']['output_dir'], self.spk_id)

        # attempt to load or set gs and epoch to 0
        self.load_ckpt()

    def get_dataset(self, meta_path):
        """Build a DataLoader over the speaker's PPR outputs listed in *meta_path*."""
        dataset = PPTS_VCTKDataset(
            feat_dir=self.feat_dir,
            meta_path=meta_path,
            dict_path=self.dict_path,
            phn_hat_dir=self.phn_hat_dir,
            spk_id=self.spk_id,
            mode=self.mode
        )
        dataloader = DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=True if self.mode == 'train' else False,
            num_workers=self.num_workers,
            collate_fn=dataset._collate_fn,
            pin_memory=True
        )
        return dataloader

    def build_model(self):
        """Construct the PPTS network and its L1 training criterion."""
        ppts = PPTS(
            input_dim=self.phn_dim,
            output_dim=(self.n_fft // 2) + 1,  # one bin per rFFT frequency
            dropout_rate=0.5,
            prenet_hidden_dims=[256, 128],
            K=16,
            conv1d_bank_hidden_dim=128,
            conv1d_projections_hidden_dim=256,
            gru_dim=256
        )
        ppts = ppts.to(self.device)
        criterion = torch.nn.L1Loss()
        return ppts, criterion

    def build_optimizer(self):
        """Instantiate the optimizer class named in the config (e.g. 'Adam')."""
        optimizer = getattr(torch.optim, self.optimizer_type)
        optimizer = optimizer(
            self.model.parameters(),
            lr=self.lr,
            betas=self.betas,
            weight_decay=self.weight_decay
        )
        return optimizer

    def save_ckpt(self):
        """Write model/optimizer/step/epoch to save_dir and update the 'checkpoint' index file."""
        if not os.path.exists(self.save_dir):
            os.makedirs(self.save_dir)
        checkpoint_path = os.path.join(
            self.save_dir, "model.ckpt-{}.pt".format(self.global_step)
        )
        torch.save({
            "model": self.model.state_dict(),
            "optimizer": self.optimizer.state_dict(),
            "global_step": self.global_step,
            "epoch": self.epoch
        }, checkpoint_path)
        print("Checkpoint model.ckpt-{}.pt saved.".format(self.global_step))
        with open(os.path.join(self.save_dir, "checkpoint"), "w") as f:
            f.write("model.ckpt-{}".format(self.global_step))
        return

    def load_ckpt(self):
        """Restore the checkpoint named in save_dir/'checkpoint', or start fresh at step/epoch 0."""
        checkpoint_list = os.path.join(self.save_dir, 'checkpoint')
        if os.path.exists(checkpoint_list):
            checkpoint_filename = open(checkpoint_list).readline().strip()
            checkpoint_path = os.path.join(self.save_dir,
                                           "{}.pt".format(checkpoint_filename))
            if self.use_gpu:
                checkpoint = torch.load(checkpoint_path)
            else:
                checkpoint = torch.load(checkpoint_path,
                                        map_location=torch.device('cpu'))
            self.model.load_state_dict(checkpoint['model'])
            if self.mode == 'train':
                self.optimizer.load_state_dict(checkpoint['optimizer'])
            self.global_step = checkpoint['global_step']
            self.epoch = checkpoint['epoch']
            print("Checkpoint model.ckpt-{}.pt loaded.".format(self.global_step))
        else:
            self.global_step = 0
            self.epoch = 0
            print("Start training with new parameters.")
        return

    def train(self):
        """Run one training epoch; logs epoch loss plus sample spectrogram/audio to tensorboard."""
        epoch_loss = 0.0
        self.model.train()
        for idx, (_, phn_hat_batch, mag_batch) in enumerate(self.train_loader):
            phn_hat_batch, mag_batch = phn_hat_batch.to(self.device), mag_batch.to(self.device)
            # Forward
            self.optimizer.zero_grad()
            mag_hat = self.model(phn_hat_batch)
            #loss = self.criterion(mag_hat, mag_batch)
            # extra weight on the first 200 frequency bins
            loss = 0.5 * self.criterion(mag_hat, mag_batch) + \
                0.5 * self.criterion(mag_hat[:, :, :200], mag_batch[:, :, :200])
            epoch_loss += loss.item()
            # Logging
            # Because of number of batch is too few, only log at epoch level
            '''
            if self.global_step % self.log_interval == 0:
                print(
                    '[GS=%3d, epoch=%d, idx=%3d] loss: %.6f' % \
                    (self.global_step+1, self.epoch+1, idx+1, loss.item())
                )
            if self.global_step % self.summ_interval == 0:
                self.writer.add_scalar('train/training_loss', loss.item(), self.global_step)
            '''
            # Backward
            loss.backward()
            self.optimizer.step()
            # Saving or not
            self.global_step += 1
            if self.global_step % self.ckpt_interval == 0:
                self.save_ckpt()
        epoch_loss /= (idx + 1)
        print('[epoch %d] training_loss: %.6f' % (self.epoch, epoch_loss))
        self.writer.add_scalar('train/epoch_training_loss', epoch_loss, self.epoch)
        # Log the last batch's input posterior and target/predicted magnitudes.
        self.writer.add_image(
            'train/phn_hat',
            torch.t(phn_hat_batch[0]).detach().cpu().numpy(),
            self.epoch, dataformats='HW'
        )
        self.writer.add_image(
            'train/mag_gt',
            torch.t(mag_batch[0]).detach().cpu().numpy()[::-1, :],
            self.epoch, dataformats='HW'
        )
        self.writer.add_image(
            'train/mag_hat',
            torch.t(mag_hat[0]).detach().cpu().numpy()[::-1, :],
            self.epoch, dataformats='HW'
        )
        self.writer.add_audio(
            'train/audio_gt',
            self.ap.inv_spectrogram(mag_batch[0].detach().cpu().numpy()),
            self.epoch, sample_rate=self.ap.sr
        )
        self.writer.add_audio(
            'train/audio_hat',
            self.ap.inv_spectrogram(mag_hat[0].detach().cpu().numpy()),
            self.epoch, sample_rate=self.ap.sr
        )
        self.epoch += 1
        return

    def eval(self):
        """Evaluate on the eval loader and log loss plus sample spectrograms/audio."""
        eval_loss = 0.0
        self.model.eval()
        with torch.no_grad():
            for idx, (_, phn_hat_batch, mag_batch) in enumerate(self.eval_loader):
                phn_hat_batch, mag_batch = phn_hat_batch.to(self.device), mag_batch.to(self.device)
                mag_hat = self.model(phn_hat_batch)
                loss = self.criterion(mag_hat, mag_batch)
                eval_loss += loss.item()
                # NOTE(review): idx % 100 == 0 is true at idx == 0, so this
                # breaks after the very first batch — confirm whether
                # `idx == 100` was intended. Behavior preserved.
                if idx % 100 == 0:
                    break
        eval_loss /= (idx + 1)
        print('[eval %d] eval_loss: %.6f' % (self.epoch, eval_loss))
        self.writer.add_scalar('eval/eval_loss', eval_loss, self.epoch)
        self.writer.add_image(
            'eval/mag_gt',
            torch.t(mag_batch[0]).detach().cpu().numpy()[::-1, :],
            self.epoch, dataformats='HW'
        )
        self.writer.add_image(
            'eval/mag_hat',
            torch.t(mag_hat[0]).detach().cpu().numpy()[::-1, :],
            self.epoch, dataformats='HW'
        )
        self.writer.add_audio(
            'eval/audio_gt',
            self.ap.inv_spectrogram(mag_batch[0].detach().cpu().numpy()),
            self.epoch, sample_rate=self.ap.sr
        )
        self.writer.add_audio(
            'eval/audio_hat',
            self.ap.inv_spectrogram(mag_hat[0].detach().cpu().numpy()),
            self.epoch, sample_rate=self.ap.sr
        )
        return
class Runner():
    ''' Handler for complete pre-training progress of upstream models '''

    def __init__(self, args, config, dataloader, ckpdir):
        # Pick GPU only when both requested via args.gpu and actually available.
        self.device = torch.device('cuda') if (
            args.gpu and torch.cuda.is_available()) else torch.device('cpu')
        if torch.cuda.is_available():
            print('[Runner] - CUDA is available!')
        self.model_kept = []        # rolling list of saved checkpoint paths (pruned to max_keep)
        self.global_step = 1
        self.log = SummaryWriter(ckpdir)
        self.args = args
        self.config = config
        self.dataloader = dataloader
        self.ckpdir = ckpdir
        # optimizer
        self.learning_rate = float(config['optimizer']['learning_rate'])
        self.warmup_proportion = config['optimizer']['warmup_proportion']
        self.gradient_accumulation_steps = config['optimizer'][
            'gradient_accumulation_steps']
        self.gradient_clipping = config['optimizer']['gradient_clipping']
        # Training details
        self.apex = config['runner']['apex']
        self.total_steps = config['runner']['total_steps']
        self.log_step = config['runner']['log_step']
        self.save_step = config['runner']['save_step']
        self.duo_feature = config['runner']['duo_feature']
        self.max_keep = config['runner']['max_keep']
        # model
        self.transformer_config = config['transformer']
        self.dr = config['transformer']['downsample_rate']
        # optional variants default to False when absent from the config
        self.dual_transformer = config['transformer'][
            'dual_transformer'] if 'dual_transformer' in config[
                'transformer'] else False
        self.wave_transformer = config['transformer'][
            'wave_transformer'] if 'wave_transformer' in config[
                'transformer'] else False
        if 'online' in config:
            print(f'[Runner] - Using features extracted on-the-fly')
            # infer input/output dims from the dataset's preprocessor output
            self.input_dim, self.output_dim = [
                feat.size(-1)
                for feat in self.dataloader.dataset.preprocessor()
            ]
        else:
            if self.wave_transformer:
                raise ValueError(
                    'Wave transformer needs to be run with online feature extraction!'
                )
            print(f'[Runner] - Using features pre-extracted and saved')
            self.input_dim = self.transformer_config['input_dim']
            self.output_dim = 1025 if self.duo_feature else None  # output dim is the same as input dim if not using duo features

    def set_model(self):
        """Build the (dual/wave/plain) Transformer masked-acoustic model and its optimizer."""
        # build the Transformer model with speech prediction head
        if self.dual_transformer:
            print('[Runner] - Initializing Dual Transformer model...')
            model_config = DualTransformerConfig(self.config)
            self.model = DualTransformerForMaskedAcousticModel(
                model_config, self.input_dim, self.output_dim).to(self.device)
        else:
            if self.wave_transformer:
                print('[Runner] - Initializing Wave Transformer model...')
            else:
                print('[Runner] - Initializing Transformer model...')
            model_config = TransformerConfig(self.config)
            self.model = TransformerForMaskedAcousticModel(
                model_config, self.input_dim, self.output_dim).to(self.device)
        self.model.train()
        if self.args.multi_gpu:
            self.model = torch.nn.DataParallel(self.model)
            print('[Runner] - Multi-GPU training Enabled: ' +
                  str(torch.cuda.device_count()))
        print('[Runner] - Number of parameters: ' + str(
            sum(p.numel()
                for p in self.model.parameters() if p.requires_grad)))
        # Setup optimizer — no weight decay on biases and LayerNorm parameters
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        if 'type' not in self.config['optimizer']:
            self.config['optimizer']['type'] = 'adam'  # default optimizer type
        print('[Runner] - Optimizer: ' + ('apex Fused Adam' if self.
              apex else str(self.config['optimizer']['type'])))
        if self.apex:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=self.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            # loss_scale == 0 means dynamic loss scaling
            if self.config['optimizer']['loss_scale'] == 0:
                self.optimizer = FP16_Optimizer(optimizer,
                                                dynamic_loss_scale=True)
            else:
                self.optimizer = FP16_Optimizer(
                    optimizer,
                    static_loss_scale=self.config['optimizer']['loss_scale'])
            self.warmup_linear = WarmupLinearSchedule(
                warmup=self.warmup_proportion, t_total=self.total_steps)
        elif self.config['optimizer']['type'] == 'adam':
            self.optimizer = BertAdam(optimizer_grouped_parameters,
                                      lr=self.learning_rate,
                                      warmup=self.warmup_proportion,
                                      t_total=self.total_steps,
                                      schedule='warmup_linear')
        elif self.config['optimizer']['type'] == 'lamb' or self.config[
                'optimizer']['type'] == 'adamW':
            # Lamb doubles as AdamW when adam=True
            self.optimizer = Lamb(
                optimizer_grouped_parameters,
                lr=self.learning_rate,
                warmup=self.warmup_proportion,
                t_total=self.total_steps,
                schedule='warmup_linear',
                adam=True
                if self.config['optimizer']['type'] == 'adamW' else False,
                correct_bias=True
                if self.config['optimizer']['type'] == 'adamW' else False)
        else:
            raise NotImplementedError()

    def save_model(self, name='states', to_path=None):
        """Save sub-module state dicts + optimizer + step; prune old checkpoints beyond max_keep."""
        if self.dual_transformer:
            # collect whichever sub-modules this dual-model variant actually has
            all_states = {
                'SpecHead':
                self.model.SpecHead.state_dict() if not self.args.multi_gpu
                else self.model.module.SpecHead.state_dict()
            }
            if hasattr(self.model, 'SpecTransformer'):
                all_states['SpecTransformer'] = self.model.SpecTransformer.state_dict(
                ) if not self.args.multi_gpu else self.model.module.SpecTransformer.state_dict(
                )
            if hasattr(self.model, 'SPE'):
                all_states[
                    'SPE'] = self.model.SPE if not self.args.multi_gpu else self.model.module.SPE
            if hasattr(self.model, 'PhoneticTransformer'):
                all_states[
                    'PhoneticTransformer'] = self.model.PhoneticTransformer.Transformer.state_dict(
                    ) if not self.args.multi_gpu else self.model.module.PhoneticTransformer.Transformer.state_dict(
                    )
                if hasattr(self.model.PhoneticTransformer, 'PhoneRecognizer'):
                    all_states[
                        'PhoneticLayer'] = self.model.PhoneticTransformer.PhoneRecognizer.state_dict(
                        ) if not self.args.multi_gpu else self.model.module.PhoneticTransformer.PhoneRecognizer.state_dict(
                        )
            if hasattr(self.model, 'SpeakerTransformer'):
                all_states[
                    'SpeakerTransformer'] = self.model.SpeakerTransformer.Transformer.state_dict(
                    ) if not self.args.multi_gpu else self.model.module.SpeakerTransformer.Transformer.state_dict(
                    )
                if hasattr(self.model.SpeakerTransformer, 'SpeakerRecognizer'):
                    all_states[
                        'SpeakerLayer'] = self.model.SpeakerTransformer.SpeakerRecognizer.state_dict(
                        ) if not self.args.multi_gpu else self.model.module.SpeakerTransformer.SpeakerRecognizer.state_dict(
                        )
        else:
            all_states = {
                'SpecHead':
                self.model.SpecHead.state_dict() if not self.args.multi_gpu
                else self.model.module.SpecHead.state_dict(),
                'Transformer':
                self.model.Transformer.state_dict() if not self.args.multi_gpu
                else self.model.module.Transformer.state_dict(),
            }
        all_states['Optimizer'] = self.optimizer.state_dict()
        all_states['Global_step'] = self.global_step
        all_states['Settings'] = {'Config': self.config, 'Paras': self.args}
        if to_path is None:
            new_model_path = '{}/{}-{}.ckpt'.format(self.ckpdir, name,
                                                    self.global_step)
        else:
            new_model_path = to_path
        torch.save(all_states, new_model_path)
        self.model_kept.append(new_model_path)
        # keep at most max_keep checkpoints on disk (oldest removed first)
        if len(self.model_kept) >= self.max_keep:
            os.remove(self.model_kept[0])
            self.model_kept.pop(0)

    def up_sample_frames(self, spec, return_first=False):
        """Invert frame stacking: trade time resolution back for feature dim (factor self.dr)."""
        if len(spec.shape) != 3:
            spec = spec.unsqueeze(0)
        assert (len(spec.shape) == 3
                ), 'Input should have acoustic feature of shape BxTxD'
        # spec shape: [batch_size, sequence_length // downsample_rate, output_dim * downsample_rate]
        spec_flatten = spec.view(spec.shape[0], spec.shape[1] * self.dr,
                                 spec.shape[2] // self.dr)
        if return_first:
            return spec_flatten[0]
        return spec_flatten  # spec_flatten shape: [batch_size, sequence_length * downsample_rate, output_dim // downsample_rate]

    def down_sample_frames(self, spec):
        """Stack self.dr consecutive frames into one, dropping the remainder frames."""
        left_over = spec.shape[1] % self.dr
        if left_over != 0:
            spec = spec[:, :-left_over, :]
        spec_stacked = spec.view(spec.shape[0], spec.shape[1] // self.dr,
                                 spec.shape[2] * self.dr)
        return spec_stacked

    def process_data(self, spec):
        """Process training data for the masked acoustic model"""
        with torch.no_grad():
            assert (
                len(spec) == 5
            ), 'dataloader should return (spec_masked, pos_enc, mask_label, attn_mask, spec_stacked)'
            # Unpack and Hack bucket: Bucketing should cause acoustic feature to have shape 1xBxTxD'
            spec_masked = spec[0].squeeze(0)
            pos_enc = spec[1].squeeze(0)
            mask_label = spec[2].squeeze(0)
            attn_mask = spec[3].squeeze(0)
            spec_stacked = spec[4].squeeze(0)
            spec_masked = spec_masked.to(device=self.device)
            if pos_enc.dim() == 3:
                # pos_enc: (batch_size, seq_len, hidden_size)
                # GPU memory need (batch_size * seq_len * hidden_size)
                pos_enc = pos_enc.float().to(device=self.device)
            elif pos_enc.dim() == 2:
                # pos_enc: (seq_len, hidden_size)
                # GPU memory only need (seq_len * hidden_size) even after expanded
                pos_enc = pos_enc.float().to(device=self.device).expand(
                    spec_masked.size(0), *pos_enc.size())
            mask_label = mask_label.bool().to(device=self.device)
            attn_mask = attn_mask.float().to(device=self.device)
            spec_stacked = spec_stacked.to(device=self.device)
            return spec_masked, pos_enc, mask_label, attn_mask, spec_stacked  # (x, pos_enc, mask_label, attention_mask. y)

    def process_dual_data(self, spec):
        """Process training data for the dual masked acoustic model"""
        with torch.no_grad():
            assert (
                len(spec) == 6
            ), 'dataloader should return (time_masked, freq_masked, pos_enc, mask_label, attn_mask, spec_stacked)'
            # Unpack and Hack bucket: Bucketing should cause acoustic feature to have shape 1xBxTxD'
            time_masked = spec[0].squeeze(0)
            freq_masked = spec[1].squeeze(0)
            pos_enc = spec[2].squeeze(0)
            mask_label = spec[3].squeeze(0)
            attn_mask = spec[4].squeeze(0)
            spec_stacked = spec[5].squeeze(0)
            time_masked = time_masked.to(device=self.device)
            freq_masked = freq_masked.to(device=self.device)
            if pos_enc.dim() == 3:
                # pos_enc: (batch_size, seq_len, hidden_size)
                # GPU memory need (batch_size * seq_len * hidden_size)
                pos_enc = pos_enc.float().to(device=self.device)
            elif pos_enc.dim() == 2:
                # pos_enc: (seq_len, hidden_size)
                # GPU memory only need (seq_len * hidden_size) even after expanded
                pos_enc = pos_enc.float().to(device=self.device).expand(
                    time_masked.size(0), *pos_enc.size())
            mask_label = mask_label.bool().to(device=self.device)
            attn_mask = attn_mask.float().to(device=self.device)
            spec_stacked = spec_stacked.to(device=self.device)
            return time_masked, freq_masked, pos_enc, mask_label, attn_mask, spec_stacked  # (x, pos_enc, mask_label, attention_mask. y)

    def train(self):
        ''' Self-Supervised Pre-Training of Transformer Model'''
        pbar = tqdm(total=self.total_steps)
        while self.global_step <= self.total_steps:
            progress = tqdm(self.dataloader, desc="Iteration")
            step = 0
            loss_val = 0
            for batch in progress:
                batch_is_valid, *batch = batch
                try:
                    if self.global_step > self.total_steps:
                        break
                    if not batch_is_valid:
                        continue
                    step += 1
                    if self.dual_transformer:
                        time_masked, freq_masked, pos_enc, mask_label, attn_mask, spec_stacked = self.process_dual_data(
                            batch)
                        loss, pred_spec = self.model(time_masked, freq_masked,
                                                     pos_enc, mask_label,
                                                     attn_mask, spec_stacked)
                    else:
                        spec_masked, pos_enc, mask_label, attn_mask, spec_stacked = self.process_data(
                            batch)
                        loss, pred_spec = self.model(spec_masked, pos_enc,
                                                     mask_label, attn_mask,
                                                     spec_stacked)
                    # Accumulate Loss
                    if self.gradient_accumulation_steps > 1:
                        loss = loss / self.gradient_accumulation_steps
                    if self.apex and self.args.multi_gpu:
                        raise NotImplementedError
                    elif self.apex:
                        self.optimizer.backward(loss)
                    elif self.args.multi_gpu:
                        loss = loss.sum()  # sum per-GPU losses from DataParallel
                        loss.backward()
                    else:
                        loss.backward()
                    loss_val += loss.item()
                    # Update
                    if (step + 1) % self.gradient_accumulation_steps == 0:
                        if self.apex:
                            # modify learning rate with special warm up BERT uses
                            # if conifg.apex is False, BertAdam is used and handles this automatically
                            lr_this_step = self.learning_rate * self.warmup_linear.get_lr(
                                self.global_step, self.warmup_proportion)
                            for param_group in self.optimizer.param_groups:
                                param_group['lr'] = lr_this_step
                        # Step
                        grad_norm = torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.gradient_clipping)
                        if math.isnan(grad_norm):
                            print(
                                '[Runner] - Error : grad norm is NaN @ step ' +
                                str(self.global_step))
                        else:
                            self.optimizer.step()
                        self.optimizer.zero_grad()
                        if self.global_step % self.log_step == 0:
                            # Log
                            self.log.add_scalar('lr',
                                                self.optimizer.get_lr()[0],
                                                self.global_step)
                            self.log.add_scalar('loss', (loss_val),
                                                self.global_step)
                            self.log.add_scalar('gradient norm', grad_norm,
                                                self.global_step)
                            progress.set_description("Loss %.4f" % (loss_val))
                        if self.global_step % self.save_step == 0:
                            self.save_model('states')
                            # tensorboard log
                            if self.dual_transformer:
                                spec_masked = time_masked
                            spec_list = [spec_masked, pred_spec, spec_stacked]
                            name_list = ['mask_spec', 'pred_spec', 'true_spec']
                            if self.dual_transformer:
                                spec_list.insert(1, freq_masked)
                                name_list.insert(1, 'mask_freq')
                                name_list[0] = 'mask_time'
                            for i in range(len(spec_list)):
                                # wave-transformer input is raw audio: log it as audio, not an image
                                if i == 0 and self.wave_transformer:
                                    self.log.add_audio(
                                        name_list[0],
                                        spec_list[0][0].data.cpu().numpy(),
                                        self.global_step,
                                        self.config['online']['sample_rate'])
                                    continue
                                spec = self.up_sample_frames(spec_list[i][0],
                                                             return_first=True)
                                spec = plot_spectrogram_to_numpy(
                                    spec.data.cpu().numpy())
                                self.log.add_image(name_list[i], spec,
                                                   self.global_step)
                        loss_val = 0
                        pbar.update(1)
                        self.global_step += 1
                except RuntimeError as e:
                    # skip OOM batches instead of crashing the whole run
                    if 'CUDA out of memory' in str(e):
                        print('CUDA out of memory at step: ', self.global_step)
                        torch.cuda.empty_cache()
                        self.optimizer.zero_grad()
                    else:
                        raise
        pbar.close()
        self.log.close()
def synthesis(text_input, args):
    """Synthesize speech for ``text_input`` with TransformerTTS + Vocoder.

    Loads both models from checkpoints, decodes the mel spectrogram
    autoregressively for ``args.max_len`` steps, converts the predicted
    magnitude spectrogram to a waveform via Griffin-Lim, logs attention
    maps and audio to tensorboard, and writes ``test.wav`` to
    ``args.sample_path``.
    """
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')
    writer = SummaryWriter(path)

    with dg.guard(place):
        # Separate name scopes so the two models' parameter names don't clash.
        with fluid.unique_name.guard():
            model = TransformerTTS(cfg)
            model.set_dict(
                load_checkpoint(
                    str(args.transformer_step),
                    os.path.join(args.checkpoint_path, "transformer")))
            model.eval()

        with fluid.unique_name.guard():
            model_vocoder = Vocoder(cfg, args.batch_size)
            model_vocoder.set_dict(
                load_checkpoint(
                    str(args.vocoder_step),
                    os.path.join(args.checkpoint_path, "vocoder")))
            model_vocoder.eval()

        # init input: text ids plus a zero "go" mel frame (1, 1, 80).
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
        mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

        # Autoregressive decoding: one new mel frame appended per step.
        pbar = tqdm(range(args.max_len))
        for i in pbar:
            # Upper-triangular mask forbids attending to future frames;
            # large negative value zeroes those positions after softmax.
            dec_slf_mask = get_triu_tensor(
                mel_input.numpy(), mel_input.numpy()).astype(np.float32)
            dec_slf_mask = fluid.layers.cast(
                dg.to_variable(dec_slf_mask != 0), np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

        # CBHG vocoder maps the final mel prediction to linear magnitudes.
        mag_pred = model_vocoder(postnet_pred)

        # Griffin-Lim reconstruction settings mirror the training config.
        _ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)
        wav = _ljspeech_processor.inv_spectrogram(
            fluid.layers.transpose(
                fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())

        # NOTE(review): global_step stays 0, so every image series shares
        # the "_0_0" suffix in its tag; the per-image index is i * 4 + j.
        global_step = 0
        for i, prob in enumerate(attn_probs):
            for j in range(4):  # presumably 4 attention heads — TODO confirm
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        for i, prob in enumerate(attn_enc):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_enc_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        for i, prob in enumerate(attn_dec):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_dec_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])

        if not os.path.exists(args.sample_path):
            os.mkdir(args.sample_path)
        write(
            os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
            wav)
    writer.close()
class TensorboardLogger(object):
    """Thin convenience wrapper around a tensorboard ``SummaryWriter``.

    Prefixes every tag with ``model_name`` plus a scope (TrainIterStats,
    EvalFigures, ...) so several models can share one log directory.
    """

    def __init__(self, log_dir, model_name):
        self.model_name = model_name
        self.writer = SummaryWriter(log_dir)
        self.train_stats = {}
        self.eval_stats = {}

    def tb_model_weights(self, model, step):
        """Log per-parameter statistics, histograms and gradients.

        Scalar (single-element) parameters get one ``value`` entry;
        everything else gets max/min/mean/std scalars plus histograms.
        """
        layer_num = 1
        for name, param in model.named_parameters():
            if param.numel() == 1:
                self.writer.add_scalar(
                    "layer{}-{}/value".format(layer_num, name),
                    param.max(), step)
            else:
                self.writer.add_scalar(
                    "layer{}-{}/max".format(layer_num, name),
                    param.max(), step)
                self.writer.add_scalar(
                    "layer{}-{}/min".format(layer_num, name),
                    param.min(), step)
                self.writer.add_scalar(
                    "layer{}-{}/mean".format(layer_num, name),
                    param.mean(), step)
                self.writer.add_scalar(
                    "layer{}-{}/std".format(layer_num, name),
                    param.std(), step)
                self.writer.add_histogram(
                    "layer{}-{}/param".format(layer_num, name), param, step)
                # FIX: param.grad is None until the first backward pass;
                # add_histogram(None) would raise, so skip it in that case.
                if param.grad is not None:
                    self.writer.add_histogram(
                        "layer{}-{}/grad".format(layer_num, name),
                        param.grad, step)
            layer_num += 1

    def dict_to_tb_scalar(self, scope_name, stats, step):
        """Log every ``{key: value}`` in *stats* as ``scope_name/key``."""
        for key, value in stats.items():
            self.writer.add_scalar('{}/{}'.format(scope_name, key), value,
                                   step)

    def dict_to_tb_figure(self, scope_name, figures, step):
        """Log every matplotlib figure in *figures* as ``scope_name/key``."""
        for key, value in figures.items():
            self.writer.add_figure('{}/{}'.format(scope_name, key), value,
                                   step)

    def dict_to_tb_audios(self, scope_name, audios, step, sample_rate):
        """Log every waveform in *audios*; a bad clip is reported, not fatal.

        FIX: narrowed the bare ``except:`` to ``except Exception`` so that
        KeyboardInterrupt/SystemExit still propagate.
        """
        for key, value in audios.items():
            try:
                self.writer.add_audio('{}/{}'.format(scope_name, key), value,
                                      step, sample_rate=sample_rate)
            except Exception:
                traceback.print_exc()

    def tb_train_iter_stats(self, step, stats):
        self.dict_to_tb_scalar(f"{self.model_name}_TrainIterStats", stats,
                               step)

    def tb_train_epoch_stats(self, step, stats):
        self.dict_to_tb_scalar(f"{self.model_name}_TrainEpochStats", stats,
                               step)

    def tb_train_figures(self, step, figures):
        self.dict_to_tb_figure(f"{self.model_name}_TrainFigures", figures,
                               step)

    def tb_train_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios(f"{self.model_name}_TrainAudios", audios, step,
                               sample_rate)

    def tb_eval_stats(self, step, stats):
        self.dict_to_tb_scalar(f"{self.model_name}_EvalStats", stats, step)

    def tb_eval_figures(self, step, figures):
        self.dict_to_tb_figure(f"{self.model_name}_EvalFigures", figures,
                               step)

    def tb_eval_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios(f"{self.model_name}_EvalAudios", audios, step,
                               sample_rate)

    def tb_test_audios(self, step, audios, sample_rate):
        self.dict_to_tb_audios(f"{self.model_name}_TestAudios", audios, step,
                               sample_rate)

    def tb_test_figures(self, step, figures):
        self.dict_to_tb_figure(f"{self.model_name}_TestFigures", figures,
                               step)

    def tb_add_text(self, title, text, step):
        self.writer.add_text(title, text, step)
def do_summary(self, m_summary, sess, itr):
    """Run the validation set through ``m_summary``, reconstruct speech, and
    log reference / enhanced audio, spectrograms and quality metrics.

    On the first summary iteration (``itr == config.summary_step``) this also
    emits one-off text summaries (filename, configuration, model source) via
    the TF1 summary API and logs the reference clean/noisy audio.
    """
    valid_path = self.valid_path
    clean_speech = self.clean_speech
    clean_speech = utils.identity_trans(clean_speech)
    noisy_speech = self.noisy_speech
    noisy_speech = utils.identity_trans(noisy_speech)

    temp_dir = self.temp_dir
    name = self.name
    logs_dir = self.logs_dir

    writer = SummaryWriter(log_dir=self.logs_dir + '/summary')

    summary_dr = dr.DataReader(temp_dir, '', valid_path["norm_path"],
                               dist_num=config.dist_num, is_training=False,
                               is_shuffle=False)
    pred = []
    # Accumulate network output batch by batch until the reader signals
    # that the current file has been fully consumed.
    while True:
        summary_inputs, summary_labels = summary_dr.next_batch(config.batch_size)
        feed_dict = {m_summary.inputs: summary_inputs,
                     m_summary.labels: summary_labels,
                     m_summary.keep_prob: 1.0}
        pred_temp = sess.run(m_summary.pred, feed_dict=feed_dict)
        pred.append(pred_temp)
        if summary_dr.file_change_checker():
            phase = summary_dr.phase[0]
            # Trim predictions to the phase length, then de-normalize using
            # the stored noisy-speech statistics (the 1.18 factor is an
            # empirical scaling — TODO confirm its origin).
            lpsd = np.expand_dims(
                np.reshape(np.concatenate(pred, axis=0),
                           [-1, config.freq_size])[0:phase.shape[0], :],
                axis=2)
            mean, std = summary_dr.norm_process(valid_path["norm_path"] +
                                                '/norm_noisy.mat')
            lpsd = np.squeeze((lpsd * std * 1.18) + mean)  # denorm
            # Rebuild a time-domain waveform from log-power spectrum + phase.
            recon_speech = utils.get_recon(np.transpose(lpsd, (1, 0)),
                                           np.transpose(phase, (1, 0)),
                                           win_size=config.win_size,
                                           win_step=config.win_step,
                                           fs=config.fs)
            # plt.plot(recon_speech)
            # plt.show()
            # lab = np.reshape(np.asarray(lab), [-1, 1])
            summary_dr.reader_initialize()
            break

    # write summary
    # One-time block: only runs on the very first summary iteration.
    if itr == config.summary_step:
        writer.close()
        self.noisy_measure = utils.se_eval(clean_speech,
                                           np.squeeze(noisy_speech),
                                           float(config.fs))
        summary_fname = tf.summary.text(name + '_filename',
                                        tf.convert_to_tensor(self.noisy_dir))
        if name == 'train':
            # HTML-ish configuration dump shown in tensorboard's text tab.
            config_str = "<br>sampling frequency: %d</br>" \
                         "<br>window step: %d ms</br>" \
                         "<br>window size: %d ms</br>" \
                         "<br>fft size: %d</br>" \
                         "<br>learning rate: %f</br><br>learning rate decay: %.4f</br><br>learning" \
                         " rate decay frequency: %.4d</br>" \
                         "<br>dropout rate: %.4f</br><br>max epoch:" \
                         " %.4e</br><br>batch size: %d</br><br>model type: %s</br>" \
                         % (config.fs, (config.win_step/config.fs*1000),
                            (config.win_size/config.fs*1000), config.nfft,
                            config.lr, config.lrDecayRate, config.lrDecayFreq,
                            config.keep_prob, config.max_epoch,
                            config.batch_size, config.mode)
            summary_config = tf.summary.text(name + '_configuration',
                                             tf.convert_to_tensor(config_str))

            # Scrape the model's inference() source from disk so the exact
            # architecture is archived alongside the run.
            code_list = []
            read_flag = False
            with open('./lib/trnmodel.py', 'r') as f:
                while True:
                    line = f.readline()
                    if "def inference(self, inputs):" in line:
                        read_flag = True
                    if "return fm" in line:
                        code_list.append('<br>' + line.replace('\n', '') + '</br>')
                        break
                    if read_flag:
                        code_list.append('<br>' + line.replace('\n', '') + '</br>')
            code_list = "<pre>" + "".join(code_list) + "</pre>"
            summary_model = tf.summary.text('train_model',
                                            tf.convert_to_tensor(code_list))
            summary_op = tf.summary.merge([summary_fname, summary_config,
                                           summary_model])
        else:
            summary_op = tf.summary.merge([summary_fname])

        # NOTE(review): this opens a second tf.Session shadowing the `sess`
        # parameter — intentional for text summaries, but worth confirming.
        with tf.Session() as sess:
            summary_writer = tf.summary.FileWriter(logs_dir + '/summary/text')
            text = sess.run(summary_op)
            summary_writer.add_summary(text, 1)
            summary_writer.close()

        # Re-open the tensorboardX writer that was closed above.
        writer = SummaryWriter(log_dir=logs_dir + '/summary')
        # Reference audio, peak-normalized to [-1, 1].
        writer.add_audio(name + '_audio_ref' + '/clean',
                         clean_speech / np.max(np.abs(clean_speech)), itr,
                         sample_rate=config.fs)
        writer.add_audio(name + '_audio_ref' + '/noisy',
                         noisy_speech / np.max(np.abs(noisy_speech)), itr,
                         sample_rate=config.fs)
        clean_S = get_spectrogram(clean_speech)
        noisy_S = get_spectrogram(noisy_speech)
        writer.add_image(name + '_spectrogram_ref' + '/clean', clean_S, itr)  # image_shape = (C, H, W)
        writer.add_image(name + '_spectrogram_ref' + '/noisy', noisy_S, itr)  # image_shape = (C, H, W)

    # Per-call block: compare enhanced speech against the cached noisy
    # baseline (self.noisy_measure is set on the first summary iteration).
    enhanced_measure = utils.se_eval(clean_speech, recon_speech,
                                     float(config.fs))
    writer.add_scalars(name + '_speech_quality' + '/pesq',
                       {'enhanced': enhanced_measure['pesq'],
                        'ref': self.noisy_measure['pesq']}, itr)
    writer.add_scalars(name + '_speech_quality' + '/stoi',
                       {'enhanced': enhanced_measure['stoi'],
                        'ref': self.noisy_measure['stoi']}, itr)
    writer.add_scalars(name + '_speech_quality' + '/lsd',
                       {'enhanced': enhanced_measure['lsd'],
                        'ref': self.noisy_measure['lsd']}, itr)
    writer.add_scalars(name + '_speech_quality' + '/ssnr',
                       {'enhanced': enhanced_measure['ssnr'],
                        'ref': self.noisy_measure['ssnr']}, itr)

    writer.add_audio(name + '_audio_enhanced' + '/enhanced',
                     recon_speech / np.max(np.abs(recon_speech)), itr,
                     sample_rate=config.fs)
    enhanced_S = get_spectrogram(recon_speech)
    writer.add_image(name + '_spectrogram_enhanced' + '/enhanced', enhanced_S, itr)  # image_shape = (C, H, W)
    writer.close()
# save sampled audio at the beginning of each epoch if i == 0: fake_speech = generator(fixed_test_noise, z) fake_speech_data = fake_speech.data.cpu().numpy( ) # convert to numpy array fake_speech_data = emph.de_emphasis(fake_speech_data, emph_coeff=0.95) for idx in range(4): # select four samples generated_sample = fake_speech_data[idx] gen_fname = test_noise_filenames[idx] filepath = os.path.join( gen_data_path, '{}_e{}.wav'.format(gen_fname, epoch + 1)) # write to file wavfile.write(filepath, sample_rate, generated_sample.T) # write for tensorboard log tbwriter.add_audio(gen_fname, generated_sample.T, total_steps, sample_rate) # increment total steps total_steps += 1 # save the model parameters for each epoch g_path = os.path.join(models_path, 'generator-{}.pkl'.format(epoch + 1)) d_path = os.path.join(models_path, 'discriminator-{}.pkl'.format(epoch + 1)) torch.save(generator.state_dict(), g_path) torch.save(discriminator.state_dict(), d_path) tbwriter.close() print('Finished Training!')
def synthesis(text_input, args):
    """Synthesize ``text_input`` with TransformerTTS (eager mode).

    Decodes a mel spectrogram autoregressively, then produces two
    waveforms: one through the CBHG vocoder + inverse linear spectrogram,
    and one directly from the mel prediction via inverse mel spectrogram
    (Griffin-Lim).  Both are logged to tensorboard and written under
    ``args.output/samples``.
    """
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank)
             if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    with fluid.unique_name.guard():
        model_vocoder = Vocoder(cfg['train']['batch_size'],
                                cfg['vocoder']['hidden_size'],
                                cfg['audio']['num_mels'],
                                cfg['audio']['n_fft'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
        model_vocoder.eval()

    # init input: text ids plus a zero "go" mel frame of shape (1, 1, 80).
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

    # Autoregressive decoding: append the newest postnet frame each step.
    pbar = tqdm(range(args.max_len))
    for i in pbar:
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        mel_input = fluid.layers.concat(
            [mel_input, postnet_pred[:, -1:, :]], axis=1)

    # CBHG vocoder: mel prediction -> linear magnitude spectrogram.
    mag_pred = model_vocoder(postnet_pred)

    # Griffin-Lim settings mirror the training configuration.
    _ljspeech_processor = audio.AudioProcessor(
        sample_rate=cfg['audio']['sr'],
        num_mels=cfg['audio']['num_mels'],
        min_level_db=cfg['audio']['min_level_db'],
        ref_level_db=cfg['audio']['ref_level_db'],
        n_fft=cfg['audio']['n_fft'],
        win_length=cfg['audio']['win_length'],
        hop_length=cfg['audio']['hop_length'],
        power=cfg['audio']['power'],
        preemphasis=cfg['audio']['preemphasis'],
        signal_norm=True,
        symmetric_norm=False,
        max_norm=1.,
        mel_fmin=0,
        mel_fmax=None,
        clip_norm=True,
        griffin_lim_iters=60,
        do_trim_silence=False,
        sound_norm=False)

    # synthesis with cbhg
    wav = _ljspeech_processor.inv_spectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]),
                               [1, 0]).numpy())
    # NOTE(review): global_step stays 0, so all attention images share one
    # tag; the slider index is i * 4 + j (presumably 4 heads — confirm).
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step, x, i * 4 + j,
                             dataformats="HWC")

    writer.add_audio(text_input + '(cbhg)', wav, 0, cfg['audio']['sr'])

    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(os.path.join(args.output, 'samples'), 'cbhg.wav'),
          cfg['audio']['sr'], wav)

    # synthesis with griffin-lim (straight from the mel prediction,
    # bypassing the vocoder).
    wav = _ljspeech_processor.inv_melspectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(postnet_pred, [0]),
                               [1, 0]).numpy())
    writer.add_audio(text_input + '(griffin)', wav, 0, cfg['audio']['sr'])
    write(os.path.join(os.path.join(args.output, 'samples'), 'griffin.wav'),
          cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          iters_per_checkpoint, iters_per_eval, batch_size, seed,
          checkpoint_path, log_dir, ema_decay=0.9999):
    """Train a WaveNet vocoder, optionally distributed over ``num_gpus``.

    Maintains an exponential moving average of the weights, periodically
    checkpoints (rank 0 only), and periodically runs nv-wavenet inference
    on the validation set, logging predicted and ground-truth audio to
    tensorboard.  Relies on module-level config dicts (``dist_config``,
    ``wavenet_config``, ``train_data_config``, ...) and flags
    (``low_memory``, ``config``) defined elsewhere in the file.
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    # Un-chunked (variable-length) data needs the masked loss.
    if train_data_config["no_chunks"]:
        criterion = MaskedCrossEntropyLoss()
    else:
        criterion = CrossEntropyLoss()

    model = WaveNet(**wavenet_config).cuda()
    # Register every trainable parameter with the EMA tracker.
    ema = ExponentialMovingAverage(ema_decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # Halve the learning rate every 200k scheduler steps.
    scheduler = StepLR(optimizer, step_size=200000, gamma=0.5)

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path != "":
        model, optimizer, scheduler, iteration, ema = load_checkpoint(
            checkpoint_path, model, optimizer, scheduler, ema)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2SampOnehot(audio_config=audio_config, verbose=True,
                              **train_data_config)
    validset = Mel2SampOnehot(audio_config=audio_config, verbose=False,
                              **valid_data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    valid_sampler = DistributedSampler(validset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    print(train_data_config)
    if train_data_config["no_chunks"]:
        collate_fn = utils.collate_fn
    else:
        collate_fn = torch.utils.data.dataloader.default_collate
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              collate_fn=collate_fn,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=True,
                              drop_last=True)
    valid_loader = DataLoader(validset, num_workers=1, shuffle=False,
                              sampler=valid_sampler, batch_size=1,
                              pin_memory=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    model.train()
    # Resume at the epoch implied by the restored iteration count.
    epoch_offset = max(0, int(iteration / len(train_loader)))
    writer = SummaryWriter(log_dir)
    print("Checkpoints writing to: {}".format(log_dir))
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            if low_memory:
                torch.cuda.empty_cache()
            # NOTE(review): scheduler.step() before optimizer.step() is the
            # pre-1.1 PyTorch ordering; modern PyTorch warns about it.
            scheduler.step()
            model.zero_grad()

            if train_data_config["no_chunks"]:
                x, y, seq_lens = batch
                seq_lens = to_gpu(seq_lens)
            else:
                x, y = batch
            x = to_gpu(x).float()
            y = to_gpu(y)
            x = (x, y)  # auto-regressive takes outputs as inputs
            y_pred = model(x)
            if train_data_config["no_chunks"]:
                loss = criterion(y_pred, y, seq_lens)
            else:
                loss = criterion(y_pred, y)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus)[0]
            else:
                # NOTE(review): tensor.data[0] indexing is the 0.3-era idiom
                # (would raise on 0-dim tensors in modern PyTorch).
                reduced_loss = loss.data[0]
            loss.backward()
            optimizer.step()

            # Refresh the EMA shadow copies after each update.
            for name, param in model.named_parameters():
                if name in ema.shadow:
                    ema.update(name, param.data)

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if rank == 0:
                writer.add_scalar('loss', reduced_loss, iteration)
            if (iteration % iters_per_checkpoint == 0 and iteration):
                if rank == 0:
                    checkpoint_path = "{}/wavenet_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, scheduler,
                                    learning_rate, iteration,
                                    checkpoint_path, ema, wavenet_config)

            if (iteration % iters_per_eval == 0 and iteration > 0 and
                    not config["no_validation"]):
                if low_memory:
                    torch.cuda.empty_cache()
                if rank == 0:
                    # Export weights into the fast nv-wavenet CUDA kernel
                    # for autoregressive inference on the validation set.
                    model_eval = nv_wavenet.NVWaveNet(
                        **(model.export_weights()))
                    for j, valid_batch in enumerate(valid_loader):
                        mel, audio = valid_batch
                        mel = to_gpu(mel).float()
                        cond_input = model.get_cond_input(mel)
                        predicted_audio = model_eval.infer(
                            cond_input, nv_wavenet.Impl.AUTO)
                        # mu-law decode from 256 quantization levels;
                        # sample rate 22050 is hard-coded for logging.
                        predicted_audio = utils.mu_law_decode_numpy(
                            predicted_audio[0, :].cpu().numpy(), 256)
                        writer.add_audio(
                            "valid/predicted_audio_{}".format(j),
                            predicted_audio, iteration, 22050)
                        audio = utils.mu_law_decode_numpy(
                            audio[0, :].cpu().numpy(), 256)
                        writer.add_audio("valid_true/audio_{}".format(j),
                                         audio, iteration, 22050)
                if low_memory:
                    torch.cuda.empty_cache()
            iteration += 1
def main(args):
    """Train a Tacotron model on LJSpeech (legacy PyTorch <0.4 style).

    Sets up output folders, tensorboard, the dataset/dataloader and the
    model, optionally restores from a checkpoint, then runs the epoch
    loop: LR decay, forward, mel + priority-weighted linear L1 losses,
    gradient clipping, periodic checkpointing and tensorboard logging.
    """
    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name+'_tts')
    pickle.dump(c, open(tmp_path, "wb"))

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                              )
    dataloader = DataLoader(dataset, batch_size=c.batch_size,
                            shuffle=True, collate_fn=dataset.collate_fn,
                            drop_last=True, num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(
            args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        start_epoch = checkpoint['step'] // len(dataloader)
        # NOTE(review): this restored best_loss is unconditionally clobbered
        # by `best_loss = float('inf')` below — looks like a bug; resuming
        # never keeps the previous best.  TODO confirm and fix.
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    # Extra weight on frequencies below ~3 kHz in the linear loss.
    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                                 patience=c.lr_patience, verbose=True)
    epoch_time = 0
    best_loss = float('inf')
    for epoch in range(0, c.epochs):

        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)

        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1

            # setup lr: manual per-step decay applied to every param group.
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            #try:
            #    mel_input = np.concatenate((np.zeros(
            #        [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #        mel_input[:, 1:, :]), axis=1)
            #except:
            #    raise TypeError("not same dimension")

            # convert inputs to variables
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            # NOTE(review): volatile=True on the loss *target* is the
            # pre-0.4 no-grad marker; deprecated in modern PyTorch.
            linear_spec_var = Variable(linear_input, volatile=True)

            # sort sequence by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(
                     text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_spec_var,
                              input_lengths= torch.autograd.Variable(torch.cuda.LongTensor(sorted_lengths)))

            mel_loss = criterion(mel_output, mel_spec_var)
            #linear_loss = torch.abs(linear_output - linear_spec_var)
            #linear_loss = 0.5 * \
                #torch.mean(linear_loss) + 0.5 * \
                #torch.mean(linear_loss[:, :n_priority_freq, :])
            # Half plain L1, half L1 restricted to the low (priority) bands.
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                      linear_spec_var[: ,: ,:n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
                                               ('linear_loss', linear_loss.data[0]),
                                               ('mel_loss', mel_loss.data[0]),
                                               ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
                          current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                          current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:
                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations: predicted vs. ground-truth
                # linear spectrograms for the first batch element.
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()
                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio via Griffin-Lim from the predicted linear spec.
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except:
                    # Best-effort: malformed audio shouldn't kill training.
                    print("\n > Error at audio signal on TB!!")
                    print(audio_signal.max())
                    print(audio_signal.min())

        # average loss after the epoch
        avg_epoch_loss = np.mean(
            progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss,
                                    best_loss, OUT_PATH,
                                    current_step, epoch)

        #lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
def collect_to_tfevents( input_dir: Path, output_dir: Optional[Path], filename_suffix: str, audio_tag_format: str, diff_tag: str, iteration_format: str, remove_exist: bool, expected_wave_dir: Optional[Path], ): if output_dir is None: output_dir = input_dir if remove_exist: for p in output_dir.glob(f"*tfevents*{filename_suffix}"): p.unlink() flag_calc_diff = expected_wave_dir is not None summary_writer = SummaryWriter(logdir=str(output_dir), filename_suffix=filename_suffix) diffs: DefaultDict[int, List[float]] = defaultdict(list) for p in tqdm(sorted(input_dir.rglob("*"), key=_to_nums), desc=input_dir.stem): if p.is_dir(): continue if "tfevents" in p.name: continue rp = p.relative_to(input_dir) iteration = int(iteration_format.format(p=p, rp=rp)) # audio if p.suffix in [".wav"]: wave, sr = librosa.load(str(p), sr=None) summary_writer.add_audio( tag=audio_tag_format.format(p=p, rp=rp), snd_tensor=wave, sample_rate=sr, global_step=iteration, ) # diff if flag_calc_diff and p.name.endswith("_woc.wav"): wave_id = p.name[:-8] expected = expected_wave_dir.joinpath(f"{wave_id}.wav") diff = calc_mcd(path1=expected, path2=p) diffs[iteration].append(diff) if flag_calc_diff: for iteration, values in sorted(diffs.items()): summary_writer.add_scalar( tag=diff_tag, scalar_value=numpy.mean(values), global_step=iteration, ) summary_writer.close()