def process_a_sentence(self, model, text):
    text = np.array(
        en.text_to_sequence(text, p=self.p_replace), dtype=np.int64)
    length = len(text)
    text_positions = np.arange(1, 1 + length)
    text = np.expand_dims(text, 0)
    text_positions = np.expand_dims(text_positions, 0)
    model.eval()

    if isinstance(model, dg.DataParallel):
        _model = model._layers
    else:
        _model = model
    mel_outputs, linear_outputs, alignments, done = _model.transduce(
        dg.to_variable(text), dg.to_variable(text_positions))

    linear_outputs_np = linear_outputs.numpy()[0].T  # (C, T)
    wav = spec_to_waveform(linear_outputs_np, self.min_level_db,
                           self.ref_level_db, self.power, self.n_iter,
                           self.win_length, self.hop_length, self.preemphasis)
    alignments_np = alignments.numpy()[0]  # batch_size = 1
    return wav, alignments_np
def __call__(self, examples):
    """Batch a list of examples into padded, stacked arrays.

    Returns (shape, dtype):
        text_seqs    (B, T_text)          int64
        text_lengths (B,)                 int64
        specs        (B, T_frame, C_spec) float32
        mels         (B, T_frame, C_mel)  float32
        num_frames   (B,)                 int64
    """
    text_seqs = []
    specs = []
    mels = []
    num_frames = np.array(
        [example[3] for example in examples], dtype=np.int64)
    max_frames = np.max(num_frames)

    for example in examples:
        text, spec, mel, _ = example
        text_seqs.append(en.text_to_sequence(text, self.p_pronunciation))
        specs.append(
            np.pad(spec, [(0, max_frames - spec.shape[0]), (0, 0)]))
        mels.append(np.pad(mel, [(0, max_frames - mel.shape[0]), (0, 0)]))

    specs = np.stack(specs)
    mels = np.stack(mels)

    text_lengths = np.array([len(seq) for seq in text_seqs], dtype=np.int64)
    max_length = np.max(text_lengths)
    text_seqs = np.array(
        [seq + [0] * (max_length - len(seq)) for seq in text_seqs],
        dtype=np.int64)
    return text_seqs, text_lengths, specs, mels, num_frames
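# A minimal, self-contained sketch of the padding/stacking step used by the
# batch function above. The array names and shapes here are illustrative, not
# part of the original code: variable-length spectrograms are zero-padded
# along the time axis to the longest example, then stacked into (B, T, C).
import numpy as np

dummy_mels = [
    np.ones((50, 80), dtype=np.float32),
    np.ones((72, 80), dtype=np.float32),
]
max_frames = max(m.shape[0] for m in dummy_mels)
batched = np.stack(
    [np.pad(m, [(0, max_frames - m.shape[0]), (0, 0)]) for m in dummy_mels])
print(batched.shape)  # (2, 72, 80)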
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(
        model=model, checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text)
    pos_text = dg.to_variable(pos_text)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    result = np.exp(mel_output_postnet.numpy())
    mel_output_postnet = fluid.layers.transpose(
        fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
    mel_output_postnet = np.exp(mel_output_postnet.numpy())
    basis = librosa.filters.mel(cfg['audio']['sr'], cfg['audio']['n_fft'],
                                cfg['audio']['num_mels'])
    inv_basis = np.linalg.pinv(basis)
    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output_postnet))

    # synthesis use clarinet
    wav_clarinet = synthesis_with_clarinet(args.config_clarinet,
                                           args.checkpoint_clarinet, result,
                                           place)
    writer.add_audio(text_input + '(clarinet)', wav_clarinet, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'clarinet.wav'),
        cfg['audio']['sr'], wav_clarinet)

    # synthesis use griffin-lim
    wav = librosa.core.griffinlim(
        spec**cfg['audio']['power'],
        hop_length=cfg['audio']['hop_length'],
        win_length=cfg['audio']['win_length'])
    writer.add_audio(text_input + '(griffin-lim)', wav, 0, cfg['audio']['sr'])
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'griffin-lim.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
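# A rough, standalone sketch of the mel-to-linear inversion used above: the mel
# filterbank is pseudo-inverted to map a mel spectrogram back to a linear
# magnitude spectrogram, which Griffin-Lim then converts to a waveform. The
# sample rate, FFT size, hop/window lengths, and the random input below are
# illustrative assumptions, not values from the original config.
import numpy as np
import librosa

sr, n_fft, num_mels, hop, win = 22050, 1024, 80, 256, 1024
mel = np.abs(np.random.randn(num_mels, 100)).astype(np.float32)  # fake (n_mels, T) magnitudes

basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=num_mels)
inv_basis = np.linalg.pinv(basis)
spec = np.maximum(1e-10, np.dot(inv_basis, mel))  # (n_fft // 2 + 1, T)
wav = librosa.griffinlim(spec, hop_length=hop, win_length=win)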
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
    print("[synthesize] {}".format(sentence))
    text = en.text_to_sequence(sentence, p=1.0)
    text = np.expand_dims(np.array(text, dtype="int64"), 0)
    lengths = np.array([text.size], dtype=np.int64)
    text_seqs = dg.to_variable(text)
    text_lengths = dg.to_variable(lengths)

    decoder_layers = config["decoder_layers"]
    force_monotonic_attention = [False] * decoder_layers
    for i in monotonic_layers:
        force_monotonic_attention[i] = True

    with dg.no_grad():
        outputs = model(
            text_seqs,
            text_lengths,
            speakers=None,
            force_monotonic_attention=force_monotonic_attention,
            window=(config["backward_step"], config["forward_step"]))
        decoded, refined, attentions = outputs
        if args.vocoder == "griffin-lim":
            wav_np = vocoder(refined.numpy()[0].T)
        else:
            wav = vocoder(F.transpose(refined, (0, 2, 1)))
            wav_np = wav.numpy()[0]
    return wav_np
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(
        model=model, checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text).astype(np.int64)
    pos_text = dg.to_variable(pos_text).astype(np.int64)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    if args.vocoder == 'griffin-lim':
        # synthesis use griffin-lim
        wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
    elif args.vocoder == 'waveflow':
        wav = synthesis_with_waveflow(mel_output_postnet, args,
                                      args.checkpoint_vocoder, place)
    else:
        print(
            'vocoder error, we only support griffin-lim and waveflow, but received %s.'
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(
            os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
        cfg['audio']['sr'], wav)

    print("Synthesis completed !!!")
    writer.close()
def __call__(self, in_data):
    fname, _, normalized_text = in_data

    # text processing
    mix_grapheme_phonemes = text_to_sequence(normalized_text,
                                             self.replace_pronounciation_prob)
    text_length = len(mix_grapheme_phonemes)
    # CAUTION: positions start from 1
    speaker_id = None

    # wave processing
    wav, _ = librosa.load(fname, sr=self.sample_rate)
    # preemphasis
    y = signal.lfilter([1., -self.preemphasis], [1.], wav)

    # STFT
    D = librosa.stft(
        y=y,
        n_fft=self.n_fft,
        win_length=self.win_length,
        hop_length=self.hop_length)
    S = np.abs(D)

    # to db and normalize to 0-1
    amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))  # 1e-5
    S_norm = 20 * np.log10(np.maximum(amplitude_min, S)) - self.ref_level_db
    S_norm = (S_norm - self.min_level_db) / (-self.min_level_db)
    S_norm = self.max_norm * S_norm
    if self.clip_norm:
        S_norm = np.clip(S_norm, 0, self.max_norm)

    # mel scale and to db and normalize to 0-1,
    # CAUTION: pass linear scale S, not db-scaled S
    S_mel = librosa.feature.melspectrogram(
        S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
    S_mel = 20 * np.log10(np.maximum(amplitude_min,
                                     S_mel)) - self.ref_level_db
    S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
    S_mel_norm = self.max_norm * S_mel_norm
    if self.clip_norm:
        S_mel_norm = np.clip(S_mel_norm, 0, self.max_norm)

    # num_frames
    n_frames = S_mel_norm.shape[-1]  # CAUTION: original number of frames

    return (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
            S_mel_norm, n_frames)
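# A small numeric check of the dB conversion and normalization applied above
# (the amplitudes and dB settings here are arbitrary examples): amplitudes are
# floored at 10 ** (min_level_db / 20), converted to dB, shifted by
# ref_level_db, and linearly mapped from [min_level_db, 0] dB to [0, 1].
import numpy as np

min_level_db, ref_level_db, max_norm = -100.0, 20.0, 1.0
amplitude_min = np.exp(min_level_db / 20 * np.log(10))  # == 10 ** (-100 / 20) == 1e-5
S = np.array([1e-6, 1e-2, 1.0])
S_db = 20 * np.log10(np.maximum(amplitude_min, S)) - ref_level_db
S_norm = max_norm * (S_db - min_level_db) / (-min_level_db)
print(S_norm)  # roughly [-0.2, 0.4, 0.8]; clipped to [0, max_norm] when clip_norm is set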
def eval_model(model, text, replace_pronounciation_prob, min_level_db,
               ref_level_db, power, n_iter, win_length, hop_length,
               preemphasis):
    """Generate a waveform from text using a DeepVoice 3 model."""
    text = np.array(
        en.text_to_sequence(text, p=replace_pronounciation_prob),
        dtype=np.int64)
    length = len(text)
    print("text sequence's length: {}".format(length))
    text_positions = np.arange(1, 1 + length)

    text = np.expand_dims(text, 0)
    text_positions = np.expand_dims(text_positions, 0)
    model.eval()
    mel_outputs, linear_outputs, alignments, done = model.transduce(
        dg.to_variable(text), dg.to_variable(text_positions))

    linear_outputs_np = linear_outputs.numpy()[0].T  # (C, T)
    wav = spec_to_waveform(linear_outputs_np, min_level_db, ref_level_db,
                           power, n_iter, win_length, hop_length, preemphasis)
    alignments_np = alignments.numpy()[0]  # batch_size = 1
    print("linear_outputs's shape: ", linear_outputs_np.shape)
    print("alignments' shape:", alignments.shape)
    return wav, alignments_np
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])
        ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
            wav = ljspeech_processor.load_wav(
                os.path.join(args.data, 'wavs', fname + ".wav"))
            mel_input = ljspeech_processor.melspectrogram(wav).astype(
                np.float32)
            mel_input = np.transpose(mel_input, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            dec_slf_mask = get_triu_tensor(mel_input,
                                           mel_input).astype(np.float32)
            dec_slf_mask = np.expand_dims(dec_slf_mask, axis=0)
            dec_slf_mask = fluid.layers.cast(
                dg.to_variable(dec_slf_mask != 0), np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.txt', "wb") as f:
            pickle.dump(alignments, f)
def _get_example(self, metadatum):
    wave_file, speaker, text = metadatum
    wav_path = self.wav_root.joinpath(speaker, wave_file)
    wav, sr = librosa.load(str(wav_path), sr=None)
    phoneme_seq = np.array(text_to_sequence(text))
    return wav, self.speaker_indices[speaker], phoneme_seq
def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    writer = SummaryWriter(path)

    with dg.guard(place):
        with fluid.unique_name.guard():
            model = TransformerTTS(cfg)
            model.set_dict(
                load_checkpoint(
                    str(args.transformer_step),
                    os.path.join(args.checkpoint_path, "transformer")))
            model.eval()

        with fluid.unique_name.guard():
            model_vocoder = Vocoder(cfg, args.batch_size)
            model_vocoder.set_dict(
                load_checkpoint(
                    str(args.vocoder_step),
                    os.path.join(args.checkpoint_path, "vocoder")))
            model_vocoder.eval()

        # init input
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
        mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

        pbar = tqdm(range(args.max_len))
        for i in pbar:
            dec_slf_mask = get_triu_tensor(
                mel_input.numpy(), mel_input.numpy()).astype(np.float32)
            dec_slf_mask = fluid.layers.cast(
                dg.to_variable(dec_slf_mask != 0), np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)
        mag_pred = model_vocoder(postnet_pred)

        _ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        wav = _ljspeech_processor.inv_spectrogram(
            fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]),
                                   [1, 0]).numpy())
        global_step = 0
        for i, prob in enumerate(attn_probs):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        for i, prob in enumerate(attn_enc):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_enc_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        for i, prob in enumerate(attn_dec):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_dec_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        if not os.path.exists(args.sample_path):
            os.mkdir(args.sample_path)
        write(
            os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
            wav)
    writer.close()
def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
    """
    Get the synthetic wavs from the texts.

    Args:
        texts(list): the input texts to be predicted.
        use_gpu(bool): whether to use gpu to predict or not.
        vocoder(str): the vocoder name, "griffin-lim" or "waveflow".

    Returns:
        wavs(list): the synthesized audio waveforms. You can use soundfile.write to save them.
        sample_rate(int): the audio sample rate.
    """
    if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
        use_gpu = False
        logger.warning(
            "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
        )
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()

    if texts and isinstance(texts, list):
        predicted_data = texts
    else:
        raise ValueError(
            "The input data is inconsistent with expectations.")

    wavs = []
    with fluid.dygraph.guard(place):
        self.tts_model.eval()
        self.waveflow.model.eval()
        monotonic_layers = [4]
        for text in predicted_data:
            # init input
            logger.info("Processing sentence: %s" % text)
            text = en.text_to_sequence(text, p=1.0)
            text = np.expand_dims(np.array(text, dtype="int64"), 0)
            lengths = np.array([text.size], dtype=np.int64)
            text_seqs = dg.to_variable(text)
            text_lengths = dg.to_variable(lengths)

            decoder_layers = self.tts_config["decoder_layers"]
            force_monotonic_attention = [False] * decoder_layers
            for i in monotonic_layers:
                force_monotonic_attention[i] = True

            outputs = self.tts_model(
                text_seqs,
                text_lengths,
                speakers=None,
                force_monotonic_attention=force_monotonic_attention,
                window=(self.tts_config["backward_step"],
                        self.tts_config["forward_step"]))
            decoded, refined, attentions = outputs
            if vocoder == 'griffin-lim':
                # synthesis use griffin-lim
                wav = self.griffin(refined.numpy()[0].T)
            elif vocoder == 'waveflow':
                # synthesis use waveflow
                wav = self.waveflow(
                    fluid.layers.transpose(refined, [0, 2, 1])).numpy()[0]
            else:
                raise ValueError(
                    'vocoder error, we only support griffin-lim and waveflow, but received %s.'
                    % vocoder)
            wavs.append(wav)
    return wavs, self.tts_config["sample_rate"]
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    with fluid.unique_name.guard():
        model_vocoder = Vocoder(cfg['train']['batch_size'],
                                cfg['vocoder']['hidden_size'],
                                cfg['audio']['num_mels'],
                                cfg['audio']['n_fft'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
        model_vocoder.eval()

    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

    pbar = tqdm(range(args.max_len))
    for i in pbar:
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        mel_input = fluid.layers.concat(
            [mel_input, postnet_pred[:, -1:, :]], axis=1)
    mag_pred = model_vocoder(postnet_pred)

    _ljspeech_processor = audio.AudioProcessor(
        sample_rate=cfg['audio']['sr'],
        num_mels=cfg['audio']['num_mels'],
        min_level_db=cfg['audio']['min_level_db'],
        ref_level_db=cfg['audio']['ref_level_db'],
        n_fft=cfg['audio']['n_fft'],
        win_length=cfg['audio']['win_length'],
        hop_length=cfg['audio']['hop_length'],
        power=cfg['audio']['power'],
        preemphasis=cfg['audio']['preemphasis'],
        signal_norm=True,
        symmetric_norm=False,
        max_norm=1.,
        mel_fmin=0,
        mel_fmax=None,
        clip_norm=True,
        griffin_lim_iters=60,
        do_trim_silence=False,
        sound_norm=False)

    # synthesis with cbhg
    wav = _ljspeech_processor.inv_spectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]),
                               [1, 0]).numpy())
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image(
                'Attention_%d_0' % global_step,
                x,
                i * 4 + j,
                dataformats="HWC")

    writer.add_audio(text_input + '(cbhg)', wav, 0, cfg['audio']['sr'])

    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'cbhg.wav'),
        cfg['audio']['sr'], wav)

    # synthesis with griffin-lim
    wav = _ljspeech_processor.inv_melspectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(postnet_pred, [0]),
                               [1, 0]).numpy())
    writer.add_audio(text_input + '(griffin)', wav, 0, cfg['audio']['sr'])
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'griffin.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    writer = SummaryWriter(path)

    with dg.guard(place):
        model = FastSpeech(cfg)
        model.set_dict(
            load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech")))
        model.eval()

        text = np.asarray(text_to_sequence(text_input))
        text = np.expand_dims(text, axis=0)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = np.expand_dims(pos_text, axis=0)
        enc_non_pad_mask = get_non_pad_mask(pos_text).astype(np.float32)
        enc_slf_attn_mask = get_attn_key_pad_mask(pos_text,
                                                  text).astype(np.float32)

        text = dg.to_variable(text)
        pos_text = dg.to_variable(pos_text)
        enc_non_pad_mask = dg.to_variable(enc_non_pad_mask)
        enc_slf_attn_mask = dg.to_variable(enc_slf_attn_mask)

        mel_output, mel_output_postnet = model(
            text,
            pos_text,
            alpha=args.alpha,
            enc_non_pad_mask=enc_non_pad_mask,
            enc_slf_attn_mask=enc_slf_attn_mask,
            dec_non_pad_mask=None,
            dec_slf_attn_mask=None)

        _ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        mel_output_postnet = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
        wav = _ljspeech_processor.inv_melspectrogram(
            mel_output_postnet.numpy())
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        print("Synthesis completed !!!")
        writer.close()
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(
        dg.to_variable(pos_text).astype(np.int64), [0])

    for i in range(args.max_len):
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(
            dg.to_variable(pos_mel).astype(np.int64), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        if stop_preds.numpy()[0, -1] > args.stop_threshold:
            break
        mel_input = fluid.layers.concat(
            [mel_input, postnet_pred[:, -1:, :]], axis=1)

    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image(
                'Attention_%d_0' % global_step,
                x,
                i * 4 + j,
                dataformats="HWC")

    if args.vocoder == 'griffin-lim':
        # synthesis use griffin-lim
        wav = synthesis_with_griffinlim(postnet_pred, cfg['audio'])
    elif args.vocoder == 'waveflow':
        # synthesis use waveflow
        wav = synthesis_with_waveflow(postnet_pred, args,
                                      args.checkpoint_vocoder, place)
    else:
        print(
            'vocoder error, we only support griffin-lim and waveflow, but received %s.'
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(
            os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])
        pbar = tqdm(range(len(table)))

        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            # load
            wav, _ = librosa.load(
                str(os.path.join(args.data, 'wavs', fname + ".wav")))

            spec = librosa.stft(
                y=wav,
                n_fft=cfg['audio']['n_fft'],
                win_length=cfg['audio']['win_length'],
                hop_length=cfg['audio']['hop_length'])
            mag = np.abs(spec)
            mel = librosa.filters.mel(
                sr=cfg['audio']['sr'],
                n_fft=cfg['audio']['n_fft'],
                n_mels=cfg['audio']['num_mels'],
                fmin=cfg['audio']['fmin'],
                fmax=cfg['audio']['fmax'])
            mel = np.matmul(mel, mag)
            mel = np.log(np.maximum(mel, 1e-5))

            mel_input = np.transpose(mel, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.pkl', "wb") as f:
            pickle.dump(alignments, f)
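# A small follow-up sketch: the alignments pickled above can be read back as an
# OrderedDict keyed by file name. The path "alignments.pkl" is an illustrative
# assumption for args.output + '.pkl'.
import pickle

with open("alignments.pkl", "rb") as f:  # hypothetical output path
    loaded_alignments = pickle.load(f)
for fname in list(loaded_alignments)[:3]:
    print(fname)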
def synthesize(self, texts, use_gpu=False, speed=1.0, vocoder="griffin-lim"):
    """
    Get the synthetic wavs from the texts.

    Args:
        texts(list): the input texts to be predicted.
        use_gpu(bool): whether to use gpu to predict or not. Default False.
        speed(float): controls the voice speed. Default 1.0.
        vocoder(str): the vocoder name, "griffin-lim" or "waveflow".

    Returns:
        wavs(list): the synthesized audio waveforms. You can use soundfile.write to save them.
        sample_rate(int): the audio sample rate.
    """
    if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
        use_gpu = False
        logger.warning(
            "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
        )
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    if texts and isinstance(texts, list):
        predicted_data = texts
    else:
        raise ValueError(
            "The input data is inconsistent with expectations.")

    wavs = []
    with fluid.dygraph.guard(place):
        self.tts_model.eval()
        self.waveflow.eval()
        for text in predicted_data:
            # init input
            logger.info("Processing sentence: %s" % text)
            text = np.asarray(text_to_sequence(text))
            text = np.expand_dims(text, axis=0)
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = np.expand_dims(pos_text, axis=0)

            text = dg.to_variable(text).astype(np.int64)
            pos_text = dg.to_variable(pos_text).astype(np.int64)

            _, mel_output_postnet = self.tts_model(
                text, pos_text, alpha=1 / speed)

            if vocoder == 'griffin-lim':
                # synthesis use griffin-lim
                wav = self.synthesis_with_griffinlim(mel_output_postnet,
                                                     self.tts_config['audio'])
            elif vocoder == 'waveflow':
                wav = self.synthesis_with_waveflow(mel_output_postnet,
                                                   self.waveflow_config.sigma)
            else:
                raise ValueError(
                    'vocoder error, we only support griffin-lim and waveflow, but received %s.'
                    % vocoder)
            wavs.append(wav)
    return wavs, self.tts_config['audio']['sr']
def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
    """
    Get the synthetic wavs from the texts.

    Args:
        texts(list): the input texts to be predicted.
        use_gpu(bool): whether to use gpu to predict or not.
        vocoder(str): the vocoder name, "griffin-lim" or "waveflow".

    Returns:
        wavs(list): the synthesized audio waveforms. You can use soundfile.write to save them.
        sample_rate(int): the audio sample rate.
    """
    if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
        use_gpu = False
        logger.warning(
            "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
        )
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    if texts and isinstance(texts, list):
        predicted_data = texts
    else:
        raise ValueError(
            "The input data is inconsistent with expectations.")

    wavs = []
    with fluid.dygraph.guard(place):
        self.tts_model.eval()
        self.waveflow.eval()
        for text in predicted_data:
            # init input
            logger.info("Processing sentence: %s" % text)
            text = np.asarray(text_to_sequence(text))
            text = fluid.layers.unsqueeze(
                dg.to_variable(text).astype(np.int64), [0])
            mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(
                dg.to_variable(pos_text).astype(np.int64), [0])

            for i in range(self.max_len):
                pos_mel = np.arange(1, mel_input.shape[1] + 1)
                pos_mel = fluid.layers.unsqueeze(
                    dg.to_variable(pos_mel).astype(np.int64), [0])
                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = self.tts_model(
                    text, mel_input, pos_text, pos_mel)
                if stop_preds.numpy()[0, -1] > self.stop_threshold:
                    break
                mel_input = fluid.layers.concat(
                    [mel_input, postnet_pred[:, -1:, :]], axis=1)

            if vocoder == 'griffin-lim':
                # synthesis use griffin-lim
                wav = self.synthesis_with_griffinlim(postnet_pred,
                                                     self.tts_config['audio'])
            elif vocoder == 'waveflow':
                # synthesis use waveflow
                wav = self.synthesis_with_waveflow(postnet_pred,
                                                   self.waveflow_config.sigma)
            else:
                raise ValueError(
                    'vocoder error, we only support griffin-lim and waveflow, but received %s.'
                    % vocoder)
            wavs.append(wav)
    return wavs, self.tts_config['audio']['sr']
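# A hedged usage sketch for the synthesize() method above, assuming it belongs
# to a PaddleHub-style TTS module. The module name "transformer_tts_ljspeech"
# and the output file names are illustrative assumptions, not taken from the
# original code; the wav is saved with soundfile.write as the docstring suggests.
import soundfile as sf
import paddlehub as hub

module = hub.Module(name="transformer_tts_ljspeech")  # hypothetical module name
wavs, sample_rate = module.synthesize(
    texts=["Simple as this proposition is, it is necessary to be stated."],
    use_gpu=False,
    vocoder="griffin-lim")
for i, wav in enumerate(wavs):
    sf.write("sample_{}.wav".format(i), wav, sample_rate)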