def run_synthesis(in_dir, out_dir, model_dir, hparams):
    """
    Generates ground truth-aligned (GTA) mels for vocoder training.

    The Tacotron weights are loaded from <model_dir>/<name of model_dir>.pt. Every batch of the
    dataset found in in_dir is fed through the model together with its target mels, and each
    predicted mel is saved in <out_dir>/mels_gta under the same file name as its target. The
    metadata entry of every processed utterance is written to <out_dir>/synthesized.txt.

    :param in_dir: directory containing "train.txt" and the "mels" and "embeds" subdirectories
    :param out_dir: directory that receives "mels_gta" and "synthesized.txt"; it is created if
    it does not exist yet
    :param model_dir: directory holding the weights file, which is named after the directory
    :param hparams: hyperparameters used to build the model, the dataset and the batches
    :raises ValueError: if CUDA is available and hparams.synthesis_batch_size is not a multiple
    of the number of GPUs
    """
    out_dir = Path(out_dir)
    synth_dir = out_dir.joinpath("mels_gta")
    # parents=True so that a missing out_dir does not abort the run
    synth_dir.mkdir(parents=True, exist_ok=True)
    print(hparams_debug_string(hparams))

    # Check for GPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
        if hparams.synthesis_batch_size % torch.cuda.device_count() != 0:
            raise ValueError("`hparams.synthesis_batch_size` must be evenly divisible by n_gpus!")
    else:
        device = torch.device("cpu")
    print("Synthesizer using device:", device)

    # Instantiate Tacotron model
    model = Tacotron(embed_dims=hparams.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hparams.tts_encoder_dims,
                     decoder_dims=hparams.tts_decoder_dims,
                     n_mels=hparams.num_mels,
                     fft_bins=hparams.num_mels,
                     postnet_dims=hparams.tts_postnet_dims,
                     encoder_K=hparams.tts_encoder_K,
                     lstm_dims=hparams.tts_lstm_dims,
                     postnet_K=hparams.tts_postnet_K,
                     num_highways=hparams.tts_num_highways,
                     dropout=0.,  # Use zero dropout for gta mels
                     stop_threshold=hparams.tts_stop_threshold,
                     speaker_embedding_size=hparams.speaker_embedding_size).to(device)

    # Load the weights
    model_dir = Path(model_dir)
    model_fpath = model_dir.joinpath(model_dir.stem).with_suffix(".pt")
    print("\nLoading weights at %s" % model_fpath)
    model.load(model_fpath)
    print("Tacotron weights loaded from step %d" % model.step)

    # Synthesize using same reduction factor as the model is currently trained
    r = np.int32(model.r)

    # Set model to eval mode. This does not turn off autograd by itself, which is why the
    # generation loop below additionally runs under torch.no_grad().
    model.eval()

    # Initialize the dataset
    in_dir = Path(in_dir)
    metadata_fpath = in_dir.joinpath("train.txt")
    mel_dir = in_dir.joinpath("mels")
    embed_dir = in_dir.joinpath("embeds")

    dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams)
    # num_workers must stay at 0: with worker processes, Windows fails with
    # "Can't pickle local object 'run_synthesis.<locals>.<lambda>'" because of the lambda
    # used as collate_fn.
    data_loader = DataLoader(dataset,
                             collate_fn=lambda batch: collate_synthesizer(batch, r, hparams),
                             batch_size=hparams.synthesis_batch_size,
                             num_workers=0,
                             shuffle=False,
                             pin_memory=True)

    # Only used to decide whether the batch is spread over several GPUs
    use_multi_gpu = device.type == "cuda" and torch.cuda.device_count() > 1

    # Generate GTA mels
    meta_out_fpath = out_dir.joinpath("synthesized.txt")
    with open(meta_out_fpath, "w") as file:
        # No gradients are needed here; skipping the autograd graph saves memory
        with torch.no_grad():
            for texts, mels, embeds, idx in tqdm(data_loader, total=len(data_loader)):
                texts = texts.to(device)
                mels = mels.to(device)
                embeds = embeds.to(device)

                # Parallelize model onto GPUS using workaround due to python bug
                if use_multi_gpu:
                    _, mels_out, _, _ = data_parallel_workaround(model, texts, mels, embeds)
                else:
                    _, mels_out, _, _ = model(texts, mels, embeds)

                for j, k in enumerate(idx):
                    entry = dataset.metadata[k]

                    # Note: outputs mel-spectrogram files and target ones have same names,
                    # just different folders
                    mel_filename = synth_dir.joinpath(entry[1])
                    mel_out = mels_out[j].detach().cpu().numpy().T

                    # Use the length of the ground truth mel to remove padding from the
                    # generated mels
                    mel_out = mel_out[:int(entry[4])]

                    # Write the spectrogram to disk
                    np.save(mel_filename, mel_out, allow_pickle=False)

                    # Write metadata into the synthesized file. No line break is appended
                    # here; presumably the last field still carries the one read from
                    # train.txt -- verify against SynthesizerDataset.
                    file.write("|".join(entry))
class Synthesizer:
    """
    Wrapper around a Tacotron model that turns texts and speaker embeddings into mel
    spectrograms. The model is created lazily, see load().
    """
    # Sample rate taken from the module-level hparams
    sample_rate = hparams.sample_rate
    # The module-level hparams, made reachable through the class
    hparams = hparams

    def __init__(self, model_fpath: Path, verbose=True):
        """
        The model isn't instantiated and loaded in memory until needed or until load() is called.

        :param model_fpath: path to the trained model file
        :param verbose: if False, prints less information when using the model
        """
        self.model_fpath = model_fpath
        self.verbose = verbose

        # Check for GPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        if self.verbose:
            print("Synthesizer using device:", self.device)

        # Tacotron model will be instantiated later on first use.
        self._model = None

    def is_loaded(self):
        """
        Whether the model is loaded in memory.
        """
        return self._model is not None

    def load(self):
        """
        Instantiates and loads the model given the weights file that was passed in the constructor.
        """
        self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
                               num_chars=len(symbols),
                               encoder_dims=hparams.tts_encoder_dims,
                               decoder_dims=hparams.tts_decoder_dims,
                               n_mels=hparams.num_mels,
                               fft_bins=hparams.num_mels,
                               postnet_dims=hparams.tts_postnet_dims,
                               encoder_K=hparams.tts_encoder_K,
                               lstm_dims=hparams.tts_lstm_dims,
                               postnet_K=hparams.tts_postnet_K,
                               num_highways=hparams.tts_num_highways,
                               dropout=hparams.tts_dropout,
                               stop_threshold=hparams.tts_stop_threshold,
                               speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)

        self._model.load(self.model_fpath)
        self._model.eval()

        if self.verbose:
            print("Loaded synthesizer \"%s\" trained to step %d" %
                  (self.model_fpath.name, self._model.state_dict()["step"]))

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a list of N speaker embeddings, a numpy array of shape (N, 256)
        holding one embedding per row, or a single embedding as a 1D numpy array
        :param return_alignments: if True, the alignments between the characters and the decoder
        output steps are returned as well. Only the alignments of the last processed batch are
        returned, and None if texts is empty.
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments. A spectrogram that lies
        entirely below the stop threshold is returned with zero frames.
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

            # Print some info about the model when it is loaded
            tts_k = self._model.get_step() // 1000
            simple_table([("Tacotron", str(tts_k) + "k"),
                          ("r", self._model.r)])

        # Preprocess text inputs
        inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]

        if isinstance(embeddings, np.ndarray) and embeddings.ndim == 2:
            # One embedding per row: split the rows so that they are batched like the texts
            embeddings = list(embeddings)
        elif not isinstance(embeddings, list):
            # A single embedding
            embeddings = [embeddings]

        # Batch inputs
        batch_size = hparams.synthesis_batch_size
        batched_inputs = [inputs[i:i + batch_size]
                          for i in range(0, len(inputs), batch_size)]
        batched_embeds = [embeddings[i:i + batch_size]
                          for i in range(0, len(embeddings), batch_size)]

        specs = []
        # Stays None when there is nothing to synthesize
        alignments = None
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            max_text_len = max(len(text) for text in batch)
            chars = np.stack([pad1d(text, max_text_len) for text in batch])

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference. No gradients are needed, so no autograd graph is built.
            with torch.no_grad():
                _, mels, alignments = self._model.generate(chars, speaker_embeddings)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram. The length check stops the loop
                # once every frame has been removed instead of indexing an empty array.
                while m.shape[-1] > 0 and np.max(m[:, -1]) < hparams.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        if self.verbose:
            print("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.

        :param fpath: path to the audio file
        :return: the waveform resampled to hparams.sample_rate and, if hparams.rescale is set,
        scaled so that its peak equals hparams.rescaling_max
        """
        # sr is passed by keyword: recent versions of librosa no longer accept it positionally
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            peak = np.abs(wav).max()
            # An all-zero waveform is left untouched rather than divided by zero
            if peak > 0:
                wav = wav / peak * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that
        were fed to the synthesizer when training.

        :param fpath_or_wav: either the path to an audio file, which is loaded with
        load_preprocess_wav(), or a waveform that is used as is
        :return: the mel spectrogram as a float32 numpy array
        """
        if isinstance(fpath_or_wav, (str, Path)):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
        with the same parameters present in hparams.py.
        """
        return audio.inv_mel_spectrogram(mel, hparams)