def _get_next_example(self):
    """Fetch a single training example from disk.

    Returns a tuple (input_data, mel_target, token_target, linear_target,
    embed_target, mel_length). Once every metadata entry has been served,
    the metadata list is reshuffled and iteration restarts from the top.
    """
    if self._train_offset >= len(self._train_meta):
        self._train_offset = 0
        np.random.shuffle(self._train_meta)

    meta = self._train_meta[self._train_offset]
    self._train_offset += 1

    # meta[6] holds the transcript text; meta[1..3] are file names relative
    # to the mel / linear / embed directories -- TODO confirm column layout
    sequence = text_to_sequence(meta[6], self._cleaner_names)
    input_data = np.asarray(sequence, dtype=np.int32)

    mel_target = np.load(os.path.join(self._mel_dir, meta[1]))
    linear_target = np.load(os.path.join(self._linear_dir, meta[2]))
    embed_target = np.load(os.path.join(self._embed_dir, meta[3]))

    # Stop-token targets: parallel sequence of zeros representing a
    # not-yet-finished utterance (one entry per frame except the last).
    token_target = np.asarray([0.] * (len(mel_target) - 1))

    return input_data, mel_target, token_target, linear_target, embed_target, len(mel_target)
def my_synthesize(self, speaker_embeds, texts):
    """Lightweight synthesis: run the model and return mel spectrograms directly.

    :param speaker_embeds: speaker embedding vectors fed to the model
    :param texts: list of text prompts to synthesize
    :return: (mels, alignments) -- mels is a list of transposed mel arrays,
        alignments are the attention matrices for the first (only) GPU group
    """
    # Prepare the inputs: clean the texts and encode them as int sequences.
    cleaner_names = [name.strip() for name in self._hparams.cleaners.split(",")]
    sequences = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    sequence_lengths = [len(sequence) for sequence in sequences]
    padded_inputs, max_seq_len = self._prepare_inputs(sequences)

    feed_dict = {
        self.inputs: padded_inputs,
        self.input_lengths: np.asarray(sequence_lengths, dtype=np.int32),
        self.split_infos: np.asarray([[max_seq_len, 0, 0, 0]], dtype=np.int32),
        self.speaker_embeddings: speaker_embeds,
    }

    # Forward pass. Outputs come back grouped per GPU; keep the first group.
    mels, alignments, stop_tokens = self.session.run(
        [self.mel_outputs, self.alignments, self.stop_token_prediction],
        feed_dict=feed_dict)
    mels = list(mels[0])
    alignments = alignments[0]
    stop_tokens = stop_tokens[0]

    # Trim each mel at the first predicted stop token, when one was emitted.
    for i in range(len(mels)):
        try:
            stop_at = list(np.round(stop_tokens[i])).index(1)
        except ValueError:
            # No stop token was generated: leave this output untrimmed.
            continue
        mels[i] = mels[i][:stop_at, :]

    return [mel.T for mel in mels], alignments
def __getitem__(self, index):
    """Return one (text, mel, embed, index) training sample.

    :param index: integer sample index. Occasionally a list of indices is
        passed in (not sure why this happens); in that case only the first
        element is used.
    """
    # BUG FIX: the original tested `index is list`, which is an identity
    # comparison against the `list` type object and is never true for an
    # actual list instance; isinstance() is the correct check.
    if isinstance(index, list):
        index = index[0]

    mel_path, embed_path = self.samples_fpaths[index]
    mel = np.load(mel_path).T.astype(np.float32)

    # Load the speaker embedding
    embed = np.load(embed_path)

    # Get the text, clean it, and convert the resulting list to int32
    text = text_to_sequence(self.samples_texts[index],
                            self.hparams.tts_cleaner_names)
    text = np.asarray(text).astype(np.int32)

    return text, mel.astype(np.float32), embed.astype(np.float32), index
def synthesize(self, texts, basenames, out_dir, log_dir, mel_filenames,
               embed_filenames):
    """Synthesize spectrograms for a batch of texts and save them to disk.

    :param texts: text prompts; length must be a multiple of tacotron_num_gpus
    :param basenames: base file names for the outputs (must not be None)
    :param out_dir: directory receiving the predicted mel .npy files
    :param log_dir: if not None, wavs and plots are also written under it
    :param mel_filenames: paths to ground-truth mels (used in GTA mode)
    :param embed_filenames: paths to speaker embedding .npy files
    :return: list of paths of the saved mel spectrogram files
    :raises NotImplementedError: if basenames is None
    """
    hparams = self._hparams
    cleaner_names = [x.strip() for x in hparams.cleaners.split(",")]
    assert 0 == len(texts) % self._hparams.tacotron_num_gpus

    seqs = [np.asarray(text_to_sequence(text, cleaner_names)) for text in texts]
    input_lengths = [len(seq) for seq in seqs]
    size_per_device = len(seqs) // self._hparams.tacotron_num_gpus

    # Pad inputs according to each GPU's max length
    input_seqs = None
    split_infos = []
    for i in range(self._hparams.tacotron_num_gpus):
        device_input = seqs[size_per_device * i:size_per_device * (i + 1)]
        device_input, max_seq_len = self._prepare_inputs(device_input)
        input_seqs = np.concatenate((input_seqs, device_input), axis=1) \
            if input_seqs is not None else device_input
        split_infos.append([max_seq_len, 0, 0, 0])

    feed_dict = {
        self.inputs: input_seqs,
        self.input_lengths: np.asarray(input_lengths, dtype=np.int32),
    }

    if self.gta:
        # Ground-truth-aligned mode: also feed the reference mel targets.
        np_targets = [np.load(mel_filename) for mel_filename in mel_filenames]
        target_lengths = [len(np_target) for np_target in np_targets]

        # Pad targets according to each GPU's max length
        target_seqs = None
        for i in range(self._hparams.tacotron_num_gpus):
            device_target = np_targets[size_per_device * i:
                                       size_per_device * (i + 1)]
            device_target, max_target_len = self._prepare_targets(
                device_target, self._hparams.outputs_per_step)
            target_seqs = np.concatenate((target_seqs, device_target), axis=1) \
                if target_seqs is not None else device_target
            # Not really used but setting it in case for future development
            split_infos[i][1] = max_target_len

        feed_dict[self.targets] = target_seqs
        assert len(np_targets) == len(texts)

    feed_dict[self.split_infos] = np.asarray(split_infos, dtype=np.int32)
    feed_dict[self.speaker_embeddings] = [np.load(f) for f in embed_filenames]

    if self.gta or not hparams.predict_linear:
        mels, alignments, stop_tokens = self.session.run(
            [self.mel_outputs, self.alignments, self.stop_token_prediction],
            feed_dict=feed_dict)

        # Linearize outputs (flatten the per-GPU grouping into 1D lists)
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments
                      for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens
                       for token in gpu_token]

        if not self.gta:
            # Natural batch synthesis: infer each mel's length from the
            # predicted stop tokens, then strip the batch-wise padding.
            target_lengths = self._get_output_lengths(stop_tokens)
            mels = [mel[:target_length, :]
                    for mel, target_length in zip(mels, target_lengths)]

        assert len(mels) == len(texts)
    else:
        linears, mels, alignments, stop_tokens = self.session.run(
            [self.linear_outputs, self.mel_outputs, self.alignments,
             self.stop_token_prediction],
            feed_dict=feed_dict)

        # Linearize outputs (flatten the per-GPU grouping into 1D lists)
        linears = [linear for gpu_linear in linears for linear in gpu_linear]
        mels = [mel for gpu_mels in mels for mel in gpu_mels]
        alignments = [align for gpu_aligns in alignments
                      for align in gpu_aligns]
        stop_tokens = [token for gpu_token in stop_tokens
                       for token in gpu_token]

        # Stop-token length detection is deliberately disabled on this path;
        # a large sentinel length leaves the outputs effectively untrimmed.
        # target_lengths = self._get_output_lengths(stop_tokens)
        # BUG FIX: the sentinel used to be the one-element list [9999], so
        # zip() truncated the whole batch to a single sample (and
        # target_lengths[i] raised IndexError below for i > 0); give every
        # sample its own sentinel length instead.
        target_lengths = [9999] * len(mels)

        # Take off the batch-wise padding
        mels = [mel[:target_length, :]
                for mel, target_length in zip(mels, target_lengths)]
        linears = [linear[:target_length, :]
                   for linear, target_length in zip(linears, target_lengths)]
        assert len(mels) == len(linears) == len(texts)

    if basenames is None:
        # BUG FIX: the original raised NotImplemented(), but NotImplemented
        # is a constant, not an exception class -- calling/raising it is a
        # TypeError in Python 3. NotImplementedError is the intended class.
        raise NotImplementedError()

    saved_mels_paths = []
    for i, mel in enumerate(mels):
        # Write the predicted spectrogram to disk.
        # Note: output mel files and target files share names, just in
        # different folders.
        mel_filename = os.path.join(out_dir,
                                    "mel-{}.npy".format(basenames[i]))
        np.save(mel_filename, mel, allow_pickle=False)
        saved_mels_paths.append(mel_filename)

        if log_dir is not None:
            # save wav (mel -> wav)
            wav = audio.inv_mel_spectrogram(mel.T, hparams)
            audio.save_wav(
                wav,
                os.path.join(log_dir,
                             "wavs/wav-{}-mel.wav".format(basenames[i])),
                sr=hparams.sample_rate)

            # save alignment plot
            plot.plot_alignment(
                alignments[i],
                os.path.join(log_dir,
                             "plots/alignment-{}.png".format(basenames[i])),
                title="{}".format(texts[i]),
                split_title=True,
                max_len=target_lengths[i])

            # save mel spectrogram plot
            plot.plot_spectrogram(
                mel,
                os.path.join(log_dir,
                             "plots/mel-{}.png".format(basenames[i])),
                title="{}".format(texts[i]),
                split_title=True)

            if hparams.predict_linear:
                # save wav (linear -> wav)
                wav = audio.inv_linear_spectrogram(linears[i].T, hparams)
                audio.save_wav(
                    wav,
                    os.path.join(
                        log_dir,
                        "wavs/wav-{}-linear.wav".format(basenames[i])),
                    sr=hparams.sample_rate)

                # save linear spectrogram plot
                plot.plot_spectrogram(
                    linears[i],
                    os.path.join(
                        log_dir,
                        "plots/linear-{}.png".format(basenames[i])),
                    title="{}".format(texts[i]),
                    split_title=True,
                    auto_aspect=True)

    return saved_mels_paths
def synthesize_spectrograms(self, texts: List[str],
                            embeddings: Union[np.ndarray, List[np.ndarray]],
                            return_alignments=False):
    """
    Synthesizes mel spectrograms from texts and speaker embeddings.

    :param texts: a list of N text prompts to be synthesized
    :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
    :param return_alignments: if True, a matrix representing the alignments between the
    characters
    and each decoder output step will be returned for each spectrogram
    :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
    sequence length of spectrogram i, and possibly the alignments.
    """
    # Load the model on the first request.
    if not self.is_loaded():
        self.load()

    # Print some info about the model when it is loaded
    tts_k = self._model.get_step() // 1000
    simple_table([("Tacotron", str(tts_k) + "k"), ("r", self._model.r)])

    # Preprocess text inputs
    inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names)
              for text in texts]
    if not isinstance(embeddings, list):
        embeddings = [embeddings]

    # Batch inputs
    batched_inputs = [inputs[i:i + hparams.synthesis_batch_size]
                      for i in range(0, len(inputs),
                                     hparams.synthesis_batch_size)]
    batched_embeds = [embeddings[i:i + hparams.synthesis_batch_size]
                      for i in range(0, len(embeddings),
                                     hparams.synthesis_batch_size)]

    specs = []
    for i, batch in enumerate(batched_inputs, 1):
        if self.verbose:
            print(f"\n| Generating {i}/{len(batched_inputs)}")

        # Pad texts so they are all the same length
        text_lens = [len(text) for text in batch]
        max_text_len = max(text_lens)
        chars = [pad1d(text, max_text_len) for text in batch]
        chars = np.stack(chars)

        # Stack speaker embeddings into 2D array for batch processing
        speaker_embeds = np.stack(batched_embeds[i - 1])

        # Convert to tensor
        chars = torch.tensor(chars).long().to(self.device)
        speaker_embeddings = torch.tensor(speaker_embeds).float().to(
            self.device)

        # Inference
        _, mels, alignments = self._model.generate(chars, speaker_embeddings)
        mels = mels.detach().cpu().numpy()
        for m in mels:
            # Trim silence from end of each spectrogram.
            # BUG FIX: guard against trimming the whole spectrogram away --
            # np.max on an empty slice raises ValueError for an all-silent
            # output.
            while m.shape[1] > 0 and \
                    np.max(m[:, -1]) < hparams.tts_stop_threshold:
                m = m[:, :-1]
            specs.append(m)

    if self.verbose:
        print("\n\nDone.\n")
    # NOTE(review): `alignments` here is only the last batch's alignments,
    # not one per spectrogram -- confirm whether callers expect all batches.
    return (specs, alignments) if return_alignments else specs