def _initialize(self):
    """Initialize with the necessary elements."""
    self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
                                            "step-162000")
    self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                 "vocoder", "step-2000000")
    self.waveflow_config_path = os.path.join(
        self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")

    tts_config_path = os.path.join(self.directory, "assets", "tts",
                                   "ljspeech.yaml")
    with open(tts_config_path) as f:
        self.tts_config = yaml.load(f, Loader=yaml.Loader)

    with fluid.dygraph.guard(fluid.CPUPlace()):
        self.tts_model = FastSpeechModel(
            self.tts_config['network'],
            num_mels=self.tts_config['audio']['num_mels'])
        io.load_parameters(
            model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)

        # Build vocoder.
        args = AttrDict()
        args.config = self.waveflow_config_path
        args.use_fp16 = False
        self.waveflow_config = io.add_yaml_config_to_args(args)
        self.waveflow = WaveFlowModule(self.waveflow_config)
        io.load_parameters(
            model=self.waveflow,
            checkpoint_path=self.waveflow_checkpoint_path)
def build(self, training=True):
    """Initialize the model.

    Args:
        training (bool, optional): Whether the model is built for training or
            inference. Defaults to True.

    Returns:
        int: the iteration of the loaded checkpoint.
    """
    config = self.config
    dataset = LJSpeech(config, self.nranks, self.rank)
    self.trainloader = dataset.trainloader
    self.validloader = dataset.validloader

    waveflow = WaveFlowModule(config)

    # Dry run once to create and initialize all necessary parameters.
    audio = dg.to_variable(np.random.randn(1, 16000).astype(self.dtype))
    mel = dg.to_variable(
        np.random.randn(1, config.mel_bands, 63).astype(self.dtype))
    waveflow(audio, mel)

    if training:
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=config.learning_rate,
            parameter_list=waveflow.parameters())

        # Load parameters.
        iteration = io.load_parameters(
            model=waveflow,
            optimizer=optimizer,
            checkpoint_dir=self.checkpoint_dir,
            iteration=config.iteration,
            checkpoint_path=config.checkpoint)
        print("Rank {}: checkpoint loaded.".format(self.rank))

        # Data parallelism.
        if self.parallel:
            strategy = dg.parallel.prepare_context()
            waveflow = dg.parallel.DataParallel(waveflow, strategy)

        self.waveflow = waveflow
        self.optimizer = optimizer
        self.criterion = WaveFlowLoss(config.sigma)
    else:
        # Load parameters.
        iteration = io.load_parameters(
            model=waveflow,
            checkpoint_dir=self.checkpoint_dir,
            iteration=config.iteration,
            checkpoint_path=config.checkpoint)
        print("Rank {}: checkpoint loaded.".format(self.rank))

        for layer in waveflow.sublayers():
            if isinstance(layer, weight_norm.WeightNormWrapper):
                layer.remove_weight_norm()

        self.waveflow = waveflow

    return iteration
def __init__(self, config_path, checkpoint_path):
    with open(config_path, 'rt') as f:
        config = ruamel.yaml.safe_load(f)
    ns = argparse.Namespace()
    for k, v in config.items():
        setattr(ns, k, v)
    ns.use_fp16 = False

    self.model = WaveFlowModule(ns)
    io.load_parameters(self.model, checkpoint_path=checkpoint_path)
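# --- Usage sketch (illustrative, not part of the original code) ---
# The paths below are hypothetical; the synthesize() call and the
# [batch, n_mels, time] mel layout follow the WaveFlowModule usage shown in
# synthesis_with_waveflow() later in this section; sigma=1.0 is an assumed
# sampling temperature.
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

with fluid.dygraph.guard(fluid.CPUPlace()):
    vocoder = WaveflowVocoder("waveflow_ljspeech.yaml", "step-2000000")
    vocoder.model.eval()
    # A random mel spectrogram stands in for real TTS output here.
    mel = dg.to_variable(np.random.randn(1, 80, 63).astype(np.float32))
    wav = vocoder.model.synthesize(mel, sigma=1.0)  # [batch, samples]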
def __init__(self):
    config_path = "waveflow_res128_ljspeech_ckpt_1.0/waveflow_ljspeech.yaml"
    with open(config_path, 'rt') as f:
        config = yaml.safe_load(f)
    ns = argparse.Namespace()
    for k, v in config.items():
        setattr(ns, k, v)
    ns.use_fp16 = False

    self.model = WaveFlowModule(ns)
    checkpoint_path = "waveflow_res128_ljspeech_ckpt_1.0/step-2000000"
    load_parameters(self.model, checkpoint_path=checkpoint_path)
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank)
             if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(
        model=model, checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text)
    pos_text = dg.to_variable(pos_text)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    # ClariNet consumes the mel output in linear scale.
    result = np.exp(mel_output_postnet.numpy())
    mel_output_postnet = fluid.layers.transpose(
        fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
    mel_output_postnet = np.exp(mel_output_postnet.numpy())
    # Approximate the linear spectrogram via the pseudo-inverse of the mel
    # filterbank, for the Griffin-Lim path.
    basis = librosa.filters.mel(cfg['audio']['sr'], cfg['audio']['n_fft'],
                                cfg['audio']['num_mels'])
    inv_basis = np.linalg.pinv(basis)
    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output_postnet))

    # Synthesis with ClariNet.
    wav_clarinet = synthesis_with_clarinet(
        args.config_clarinet, args.checkpoint_clarinet, result, place)
    writer.add_audio(text_input + '(clarinet)', wav_clarinet, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'clarinet.wav'),
        cfg['audio']['sr'], wav_clarinet)

    # Synthesis with Griffin-Lim.
    wav = librosa.core.griffinlim(
        spec**cfg['audio']['power'],
        hop_length=cfg['audio']['hop_length'],
        win_length=cfg['audio']['win_length'])
    writer.add_audio(text_input + '(griffin-lim)', wav, 0, cfg['audio']['sr'])
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'griffin-lim.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed!")
    writer.close()
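# --- Why the pseudo-inverse works above (illustrative sketch) ---
# The mel filterbank `basis` linearly compresses a magnitude spectrogram into
# mel bands, so np.linalg.pinv(basis) is its least-squares inverse: it maps
# mel bands back to an approximate linear-frequency spectrogram, which
# Griffin-Lim can then invert to a waveform. A synthetic round trip, using
# the same positional librosa.filters.mel call as the function above:
import numpy as np
import librosa

sr, n_fft, n_mels = 22050, 2048, 80
basis = librosa.filters.mel(sr, n_fft, n_mels)  # (n_mels, 1 + n_fft // 2)
spec = np.abs(np.random.randn(1 + n_fft // 2, 10))  # fake linear spectrogram
mel = basis @ spec  # forward: compress to mel bands
approx = np.maximum(1e-10, np.linalg.pinv(basis) @ mel)  # approximate inverse
print(spec.shape == approx.shape)  # True; values agree only approximately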
def main(args, config):
    model = create_model(config)
    loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
    model.eval()

    if args.vocoder == "waveflow":
        vocoder = WaveflowVocoder()
        vocoder.model.eval()
    elif args.vocoder == "griffin-lim":
        vocoder = GriffinLimVocoder(
            sharpening_factor=config["sharpening_factor"],
            sample_rate=config["sample_rate"],
            n_fft=config["n_fft"],
            win_length=config["win_length"],
            hop_length=config["hop_length"])
    else:
        raise ValueError("Other vocoders are not supported.")

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    monotonic_layers = [
        int(item.strip()) - 1 for item in args.monotonic_layers.split(',')
    ]
    with open(args.input, 'rt') as f:
        sentences = [line.strip() for line in f.readlines()]
    for i, sentence in enumerate(sentences):
        wav = synthesize(args, config, model, vocoder, sentence,
                         monotonic_layers)
        sf.write(
            os.path.join(args.output, "sentence{}.wav".format(i)),
            wav,
            samplerate=config["sample_rate"])
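# --- A hedged sketch of the CLI wiring main() above expects ---
# The flag names mirror the attributes accessed in the function
# (args.checkpoint, args.vocoder, args.monotonic_layers, args.input,
# args.output); the help strings and defaults are assumptions, not the
# project's actual parser.
import argparse

parser = argparse.ArgumentParser(
    description="synthesize sentences from a text file")
parser.add_argument("--checkpoint", type=str, help="TTS checkpoint to load")
parser.add_argument("--vocoder", type=str, default="waveflow",
                    choices=["waveflow", "griffin-lim"],
                    help="vocoder to use")
parser.add_argument("--monotonic_layers", type=str, default="4",
                    help="comma-separated 1-based indices of attention "
                         "layers forced to be monotonic at inference")
parser.add_argument("--input", type=str,
                    help="text file with one sentence per line")
parser.add_argument("--output", type=str,
                    help="directory to save wav files")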
def _initialize(self):
    """Initialize with the necessary elements."""
    self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
                                            "step-120000")
    self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                 "vocoder", "step-2000000")
    self.waveflow_config_path = os.path.join(
        self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")

    tts_config_path = os.path.join(self.directory, "assets", "tts",
                                   "ljspeech.yaml")
    with open(tts_config_path) as f:
        self.tts_config = yaml.load(f, Loader=yaml.Loader)

    # The max length of audio at synthesis time.
    self.max_len = 1000
    # The stop-token threshold that decides whether spectrum generation
    # should stop at a given time step.
    self.stop_threshold = 0.5

    with fluid.dygraph.guard(fluid.CPUPlace()):
        # Build TTS.
        with fluid.unique_name.guard():
            network_cfg = self.tts_config['network']
            self.tts_model = TransformerTTSModel(
                network_cfg['embedding_size'], network_cfg['hidden_size'],
                network_cfg['encoder_num_head'],
                network_cfg['encoder_n_layers'],
                self.tts_config['audio']['num_mels'],
                network_cfg['outputs_per_step'],
                network_cfg['decoder_num_head'],
                network_cfg['decoder_n_layers'])
            io.load_parameters(
                model=self.tts_model,
                checkpoint_path=self.tts_checkpoint_path)

        # Build vocoder.
        args = AttrDict()
        args.config = self.waveflow_config_path
        args.use_fp16 = False
        self.waveflow_config = io.add_yaml_config_to_args(args)
        self.waveflow = WaveFlowModule(self.waveflow_config)
        io.load_parameters(
            model=self.waveflow,
            checkpoint_path=self.waveflow_checkpoint_path)
def synthesis_with_waveflow(mel_output, args, checkpoint, place):
    fluid.enable_dygraph(place)
    args.config = args.config_vocoder
    args.use_fp16 = False
    config = io.add_yaml_config_to_args(args)

    mel_spectrogram = fluid.layers.transpose(mel_output, [0, 2, 1])

    # Build model.
    waveflow = WaveFlowModule(config)
    io.load_parameters(model=waveflow, checkpoint_path=checkpoint)
    for layer in waveflow.sublayers():
        if isinstance(layer, weight_norm.WeightNormWrapper):
            layer.remove_weight_norm()

    # Run model inference.
    wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
    return wav.numpy()[0]
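# --- Usage sketch for synthesis_with_waveflow (illustrative) ---
# mel_output is laid out [batch, time, n_mels], matching the TTS models in
# this section; the transpose inside the function converts it to
# [batch, n_mels, time] for WaveFlow. The paths and the random mel below are
# hypothetical stand-ins.
import argparse
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

place = fluid.CPUPlace()
fluid.enable_dygraph(place)
args = argparse.Namespace(config_vocoder="waveflow_ljspeech.yaml")
mel = dg.to_variable(np.random.randn(1, 63, 80).astype(np.float32))
wav = synthesis_with_waveflow(mel, args, "step-2000000", place)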
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank)
             if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(
        model=model, checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text).astype(np.int64)
    pos_text = dg.to_variable(pos_text).astype(np.int64)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    if args.vocoder == 'griffin-lim':
        # Synthesis with Griffin-Lim.
        wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
    elif args.vocoder == 'waveflow':
        wav = synthesis_with_waveflow(mel_output_postnet, args,
                                      args.checkpoint_vocoder, place)
    else:
        print(
            'vocoder error: only griffin-lim and waveflow are supported, '
            'but received %s.' % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(
            os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed!")
    writer.close()
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank)
             if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # Get the text data.
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])
        ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]

            # Initialize the input.
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            wav = ljspeech_processor.load_wav(
                os.path.join(args.data, 'wavs', fname + ".wav"))
            mel_input = ljspeech_processor.melspectrogram(wav).astype(
                np.float32)
            mel_input = np.transpose(mel_input, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            dec_slf_mask = get_triu_tensor(mel_input,
                                           mel_input).astype(np.float32)
            dec_slf_mask = np.expand_dims(dec_slf_mask, axis=0)
            dec_slf_mask = fluid.layers.cast(
                dg.to_variable(dec_slf_mask != 0),
                np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.txt', "wb") as f:
            pickle.dump(alignments, f)
n_loop = model_config["n_loop"]
n_layer = model_config["n_layer"]
residual_channels = model_config["residual_channels"]
output_dim = model_config["output_dim"]
loss_type = model_config["loss_type"]
log_scale_min = model_config["log_scale_min"]
decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim, n_mels,
                  filter_size, loss_type, log_scale_min)

model = ConditionalWavenet(encoder, decoder)
summary(model)

# Load model parameters.
checkpoint_dir = os.path.join(args.output, "checkpoints")
if args.checkpoint:
    iteration = io.load_parameters(model, checkpoint_path=args.checkpoint)
else:
    iteration = io.load_parameters(
        model, checkpoint_dir=checkpoint_dir, iteration=args.iteration)
assert iteration > 0, "A trained model is needed."

# WARNING: don't forget to remove weight norm to re-compute each wrapped
# layer's weight; removing weight norm also speeds up computation.
for layer in model.sublayers():
    if isinstance(layer, WeightNormWrapper):
        layer.remove_weight_norm()

train_loader = fluid.io.DataLoader.from_generator(
    capacity=10, return_list=True)
train_loader.set_batch_generator(train_cargo, place)
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])
    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        is_vocoder=True).reader()

    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            mag_pred = model(mel)
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss', loss.numpy(),
                                  global_step)

            # Save the checkpoint.
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(
                    os.path.join(args.output, 'checkpoints'), global_step,
                    model, optimizer)

    if local_rank == 0:
        writer.close()
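# --- Why NoamDecay gets these arguments (worked check) ---
# Paddle's dg.NoamDecay(d_model, warmup) computes
#   lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5),
# so passing d_model = 1 / (warmup * peak_lr**2), as the training scripts in
# this section do, makes the schedule ramp up linearly and peak at exactly
# peak_lr when step == warmup. A pure-Python check with example values (the
# real ones come from cfg['train']):
warmup, peak_lr = 4000, 0.001
d_model = 1 / (warmup * peak_lr**2)


def lr_at(step):
    return d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)


assert abs(lr_at(warmup) - peak_lr) < 1e-9
print(lr_at(1), lr_at(warmup), lr_at(10 * warmup))  # ramp, peak, decay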
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(dg.parallel.Env()
                            .dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        args.alignments_path,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        # Forward pass.
        result = model(
            character, pos_text, mel_pos=pos_mel, length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(),
                              global_step)
            writer.add_scalar('duration_loss', duration_loss.numpy(),
                              global_step)
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # Save the checkpoint.
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

    if local_rank == 0:
        writer.close()
def _initialize(self):
    """Initialize with the necessary elements."""
    self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
                                            "step-1780000")
    self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                 "vocoder", "step-2000000")
    self.waveflow_config_path = os.path.join(
        self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")

    tts_config_path = os.path.join(self.directory, "assets", "tts",
                                   "ljspeech.yaml")
    with open(tts_config_path) as f:
        self.tts_config = ruamel.yaml.safe_load(f)

    with fluid.dygraph.guard(fluid.CPUPlace()):
        char_embedding = dg.Embedding((en.n_vocab,
                                       self.tts_config["char_dim"]))
        multi_speaker = self.tts_config["n_speakers"] > 1
        speaker_embedding = dg.Embedding(
            (self.tts_config["n_speakers"], self.tts_config["speaker_dim"])) \
            if multi_speaker else None
        encoder = Encoder(
            self.tts_config["encoder_layers"],
            self.tts_config["char_dim"],
            self.tts_config["encoder_dim"],
            self.tts_config["kernel_size"],
            has_bias=multi_speaker,
            bias_dim=self.tts_config["speaker_dim"],
            keep_prob=1.0 - self.tts_config["dropout"])
        decoder = Decoder(
            self.tts_config["n_mels"],
            self.tts_config["reduction_factor"],
            list(self.tts_config["prenet_sizes"]) +
            [self.tts_config["char_dim"]],
            self.tts_config["decoder_layers"],
            self.tts_config["kernel_size"],
            self.tts_config["attention_dim"],
            position_encoding_weight=self.tts_config["position_weight"],
            omega=self.tts_config["position_rate"],
            has_bias=multi_speaker,
            bias_dim=self.tts_config["speaker_dim"],
            keep_prob=1.0 - self.tts_config["dropout"])
        postnet = PostNet(
            self.tts_config["postnet_layers"],
            self.tts_config["char_dim"],
            self.tts_config["postnet_dim"],
            self.tts_config["kernel_size"],
            self.tts_config["n_mels"],
            self.tts_config["reduction_factor"],
            has_bias=multi_speaker,
            bias_dim=self.tts_config["speaker_dim"],
            keep_prob=1.0 - self.tts_config["dropout"])
        self.tts_model = SpectraNet(char_embedding, speaker_embedding,
                                    encoder, decoder, postnet)
        io.load_parameters(
            model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)
        for name, layer in self.tts_model.named_sublayers():
            try:
                remove_weight_norm(layer)
            except ValueError:
                # This layer has no weight norm hook.
                pass

        self.waveflow = WaveflowVocoder(
            config_path=self.waveflow_config_path,
            checkpoint_path=self.waveflow_checkpoint_path)
        self.griffin = GriffinLimVocoder(
            sharpening_factor=self.tts_config["sharpening_factor"],
            sample_rate=self.tts_config["sample_rate"],
            n_fft=self.tts_config["n_fft"],
            win_length=self.tts_config["win_length"],
            hop_length=self.tts_config["hop_length"])
writer = None
sentences = [
    "Scientists at the CERN laboratory say they have discovered a new particle.",
    "There's a way to measure the acute emotional intelligence that has never gone out of style.",
    "President Trump met with other leaders at the Group of 20 conference.",
    "Generative adversarial network or variational auto-encoder.",
    "Please call Stella.",
    "Some have accepted this as a miracle without any physical explanation.",
]
evaluator = make_evaluator(config, sentences, eval_dir, writer)
state_saver = make_state_saver(config, state_dir, writer)

# Load the parameters and optimizer state, and update the number of
# iterations done so far.
if args.checkpoint is not None:
    iteration = load_parameters(
        model, optim, checkpoint_path=args.checkpoint)
else:
    iteration = load_parameters(
        model, optim, checkpoint_dir=ckpt_dir, iteration=args.iteration)

# =========================train=========================
train_config = config["train"]
max_iter = train_config["max_iteration"]
snap_interval = train_config["snap_interval"]
save_interval = train_config["save_interval"]
eval_interval = train_config["eval_interval"]
global_step = iteration + 1
# =========================link(dataloader, paddle)=========================
loader = fluid.io.DataLoader.from_generator(capacity=10, return_list=True)
loader.set_batch_generator(ljspeech_loader, places=place)

# tensorboard & checkpoint preparation
output_dir = args.output
ckpt_dir = os.path.join(output_dir, "checkpoints")
log_dir = os.path.join(output_dir, "log")
state_dir = os.path.join(output_dir, "states")
make_output_tree(output_dir)
writer = SummaryWriter(logdir=log_dir)

# Load the parameters and optimizer state, and update the number of
# iterations done so far.
if args.checkpoint is not None:
    iteration = io.load_parameters(
        dv3, optim, checkpoint_path=args.checkpoint)
else:
    iteration = io.load_parameters(
        dv3, optim, checkpoint_dir=ckpt_dir, iteration=args.iteration)

# =========================train=========================
max_iter = train_config["max_iteration"]
snap_interval = train_config["snap_interval"]
save_interval = train_config["save_interval"]
eval_interval = train_config["eval_interval"]
global_step = iteration + 1
iterator = iter(tqdm.tqdm(loader))
while global_step <= max_iter:
    try:
        batch = next(iterator)
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank)
             if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    with fluid.unique_name.guard():
        model_vocoder = Vocoder(cfg['train']['batch_size'],
                                cfg['vocoder']['hidden_size'],
                                cfg['audio']['num_mels'],
                                cfg['audio']['n_fft'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
        model_vocoder.eval()

    # Initialize the input.
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

    pbar = tqdm(range(args.max_len))
    for i in pbar:
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        mel_input = fluid.layers.concat(
            [mel_input, postnet_pred[:, -1:, :]], axis=1)
    mag_pred = model_vocoder(postnet_pred)

    _ljspeech_processor = audio.AudioProcessor(
        sample_rate=cfg['audio']['sr'],
        num_mels=cfg['audio']['num_mels'],
        min_level_db=cfg['audio']['min_level_db'],
        ref_level_db=cfg['audio']['ref_level_db'],
        n_fft=cfg['audio']['n_fft'],
        win_length=cfg['audio']['win_length'],
        hop_length=cfg['audio']['hop_length'],
        power=cfg['audio']['power'],
        preemphasis=cfg['audio']['preemphasis'],
        signal_norm=True,
        symmetric_norm=False,
        max_norm=1.,
        mel_fmin=0,
        mel_fmax=None,
        clip_norm=True,
        griffin_lim_iters=60,
        do_trim_silence=False,
        sound_norm=False)

    # Synthesis with the CBHG vocoder.
    wav = _ljspeech_processor.inv_spectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]),
                               [1, 0]).numpy())
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image(
                'Attention_%d_0' % global_step,
                x,
                i * 4 + j,
                dataformats="HWC")

    writer.add_audio(text_input + '(cbhg)', wav, 0, cfg['audio']['sr'])

    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'cbhg.wav'),
        cfg['audio']['sr'], wav)

    # Synthesis with Griffin-Lim.
    wav = _ljspeech_processor.inv_melspectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(postnet_pred, [0]),
                               [1, 0]).numpy())
    writer.add_audio(text_input + '(griffin)', wav, 0, cfg['audio']['sr'])
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'griffin.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed!")
    writer.close()
def synthesis_with_clarinet(config_path, checkpoint, mel_spectrogram, place):
    with open(config_path, 'rt') as f:
        config = yaml.safe_load(f)

    data_config = config["data"]
    n_mels = data_config["n_mels"]

    teacher_config = config["teacher"]
    n_loop = teacher_config["n_loop"]
    n_layer = teacher_config["n_layer"]
    filter_size = teacher_config["filter_size"]

    # Only batch size 1 is enabled for validation.
    with dg.guard(place):
        # Conditioner (upsampling net).
        conditioner_config = config["conditioner"]
        upsampling_factors = conditioner_config["upsampling_factors"]
        upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
        freeze(upsample_net)

        residual_channels = teacher_config["residual_channels"]
        loss_type = teacher_config["loss_type"]
        output_dim = teacher_config["output_dim"]
        log_scale_min = teacher_config["log_scale_min"]

        assert loss_type == "mog" and output_dim == 3, \
            "the teacher wavenet should be a wavenet with single gaussian output"
        teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim,
                          n_mels, filter_size, loss_type, log_scale_min)
        # Load & freeze upsample_net & teacher.
        freeze(teacher)

        student_config = config["student"]
        n_loops = student_config["n_loops"]
        n_layers = student_config["n_layers"]
        student_residual_channels = student_config["residual_channels"]
        student_filter_size = student_config["filter_size"]
        student_log_scale_min = student_config["log_scale_min"]
        student = ParallelWaveNet(n_loops, n_layers,
                                  student_residual_channels, n_mels,
                                  student_filter_size)

        stft_config = config["stft"]
        stft = STFT(
            n_fft=stft_config["n_fft"],
            hop_length=stft_config["hop_length"],
            win_length=stft_config["win_length"])

        lmd = config["loss"]["lmd"]
        model = Clarinet(upsample_net, teacher, student, stft,
                         student_log_scale_min, lmd)
        io.load_parameters(model=model, checkpoint_path=checkpoint)

        # NOTE: this relies on a module-level `args`; it is not a parameter
        # of this function.
        if not os.path.exists(args.output):
            os.makedirs(args.output)
        model.eval()

        # Rescale the mel spectrogram.
        min_level, ref_level = 1e-5, 20  # hard-coded
        mel_spectrogram = 20 * np.log10(
            np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        mel_spectrogram = dg.to_variable(mel_spectrogram)
        mel_spectrogram = fluid.layers.transpose(mel_spectrogram, [0, 2, 1])

        wav_var = model.synthesis(mel_spectrogram)
        wav_np = wav_var.numpy()[0]

    return wav_np
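# --- Worked example of the mel rescaling above ---
# The hard-coded normalization maps linear mel amplitudes into the [0, 1]
# range the ClariNet checkpoint expects: amplitude -> dB (floored at
# min_level), shift by ref_level, then (dB + 100) / 100 clipped to [0, 1].
# On a single value:
import numpy as np

min_level, ref_level = 1e-5, 20
amp = 0.01  # one linear mel amplitude
db = 20 * np.log10(max(min_level, amp)) - ref_level  # 20*(-2) - 20 = -60 dB
normalized = np.clip((db + 100) / 100, 0, 1)  # (-60 + 100) / 100 = 0.4
print(db, normalized)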
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank)
             if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    # Initialize the input.
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(
        dg.to_variable(pos_text).astype(np.int64), [0])

    for i in range(args.max_len):
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(
            dg.to_variable(pos_mel).astype(np.int64), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        if stop_preds.numpy()[0, -1] > args.stop_threshold:
            break
        mel_input = fluid.layers.concat(
            [mel_input, postnet_pred[:, -1:, :]], axis=1)

    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image(
                'Attention_%d_0' % global_step,
                x,
                i * 4 + j,
                dataformats="HWC")

    if args.vocoder == 'griffin-lim':
        # Synthesis with Griffin-Lim.
        wav = synthesis_with_griffinlim(postnet_pred, cfg['audio'])
    elif args.vocoder == 'waveflow':
        # Synthesis with WaveFlow.
        wav = synthesis_with_waveflow(postnet_pred, args,
                                      args.checkpoint_vocoder, place)
    else:
        print(
            'vocoder error: only griffin-lim and waveflow are supported, '
            'but received %s.' % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'),
                     args.vocoder + '.wav'), cfg['audio']['sr'], wav)
    print("Synthesis completed!")
    writer.close()
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank)
             if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # Get the text data.
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])
        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]

            # Initialize the input.
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            # Load the audio and compute its mel spectrogram.
            wav, _ = librosa.load(
                str(os.path.join(args.data, 'wavs', fname + ".wav")))
            spec = librosa.stft(
                y=wav,
                n_fft=cfg['audio']['n_fft'],
                win_length=cfg['audio']['win_length'],
                hop_length=cfg['audio']['hop_length'])
            mag = np.abs(spec)
            mel = librosa.filters.mel(
                sr=cfg['audio']['sr'],
                n_fft=cfg['audio']['n_fft'],
                n_mels=cfg['audio']['num_mels'],
                fmin=cfg['audio']['fmin'],
                fmax=cfg['audio']['fmax'])
            mel = np.matmul(mel, mag)
            mel = np.log(np.maximum(mel, 1e-5))

            mel_input = np.transpose(mel, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.pkl', "wb") as f:
            pickle.dump(alignments, f)
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
    model.train()

    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens,
            weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss', mel_loss.numpy(),
                              global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(), global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                  model._layers.encoder.alpha.numpy(),
                                  global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model._layers.decoder.alpha.numpy(),
                                  global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                  model.encoder.alpha.numpy(), global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                  model.decoder.alpha.numpy(), global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[
                                j * cfg['train']['batch_size'] // nranks]) *
                            255)
                        writer.add_image('Attention_%d_0' % global_step, x,
                                         i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[
                                j * cfg['train']['batch_size'] // nranks]) *
                            255)
                        writer.add_image('Attention_enc_%d_0' % global_step,
                                         x, i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[
                                j * cfg['train']['batch_size'] // nranks]) *
                            255)
                        writer.add_image('Attention_dec_%d_0' % global_step,
                                         x, i * 4 + j)

        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # Save the checkpoint.
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
    model.train()

    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader()

    for epoch in range(cfg['train']['max_epochs']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            character, mel, mel_input, pos_text, pos_mel = data

            global_step += 1

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                character, mel_input, pos_text, pos_mel)

            mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(mel_pred, mel)))
            post_mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(postnet_pred, mel)))
            loss = mel_loss + post_mel_loss

            # Note: when the stop token loss was used, learning did not work.
            if cfg['network']['stop_token']:
                label = (pos_mel == 0).astype(np.float32)
                stop_loss = cross_entropy(stop_preds, label)
                loss = loss + stop_loss

            if local_rank == 0:
                writer.add_scalars('training_loss', {
                    'mel_loss': mel_loss.numpy(),
                    'post_mel_loss': post_mel_loss.numpy()
                }, global_step)

                if cfg['network']['stop_token']:
                    writer.add_scalar('stop_loss', stop_loss.numpy(),
                                      global_step)

                if parallel:
                    writer.add_scalars('alphas', {
                        'encoder_alpha': model._layers.encoder.alpha.numpy(),
                        'decoder_alpha': model._layers.decoder.alpha.numpy(),
                    }, global_step)
                else:
                    writer.add_scalars('alphas', {
                        'encoder_alpha': model.encoder.alpha.numpy(),
                        'decoder_alpha': model.decoder.alpha.numpy(),
                    }, global_step)

                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

                if global_step % cfg['train']['image_interval'] == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image(
                                'Attention_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

                    for i, prob in enumerate(attn_enc):
                        for j in range(cfg['network']['encoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image(
                                'Attention_enc_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

                    for i, prob in enumerate(attn_dec):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image(
                                'Attention_dec_%d_0' % global_step,
                                x,
                                i * 4 + j,
                                dataformats="HWC")

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            # Save the checkpoint.
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(
                    os.path.join(args.output, 'checkpoints'), global_step,
                    model, optimizer)

    if local_rank == 0:
        writer.close()