def __init__(self, device='cpu', *,
             dict_path="downloads/data/lang_1char/train_no_dev_units.txt",
             model_path="downloads/exp/train_no_dev_pytorch_train_pytorch_tacotron2.v3/results/model.last1.avg.best",
             vocoder_path="downloads/ljspeech.parallel_wavegan.v1/checkpoint-400000steps.pkl",
             vocoder_conf="downloads/ljspeech.parallel_wavegan.v1/config.yml"):
    """Load the text-to-feature model, the Parallel WaveGAN vocoder and the symbol table.

    Args:
        device: Anything accepted by ``torch.device`` (e.g. ``'cpu'``, ``'cuda'``).
        dict_path: Text file with one ``<symbol> <integer id>`` pair per line,
            separated by a single space.
        model_path: Checkpoint handed to ``get_model_conf`` and ``torch_load``.
        vocoder_path: Vocoder checkpoint; its ``["model"]["generator"]`` entry
            is loaded into the generator.
        vocoder_conf: YAML file whose ``generator_params`` mapping configures
            the generator.

    The path arguments are keyword-only and default to the previously
    hard-coded locations, so existing ``cls()`` / ``cls(device)`` calls behave
    exactly as before.
    """
    device = torch.device(device)

    # Text-to-feature model: rebuild the architecture from the stored training
    # arguments, then load the weights into it.
    idim, odim, train_args = get_model_conf(model_path)
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    torch_load(model_path, model)
    model = model.eval().to(device)
    inference_args = Namespace(threshold=0.5, minlenratio=0.0, maxlenratio=10.0)

    # Neural vocoder.
    # NOTE(security): yaml.Loader can construct arbitrary Python objects, so
    # the vocoder config must come from a trusted source.
    with open(vocoder_conf) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    vocoder = ParallelWaveGANGenerator(**config["generator_params"])
    # Always deserialize on CPU first so a GPU-saved checkpoint also loads on
    # CPU-only machines; the module is moved to the target device afterwards.
    vocoder.load_state_dict(
        torch.load(vocoder_path, map_location="cpu")["model"]["generator"])
    vocoder.remove_weight_norm()
    vocoder = vocoder.eval().to(device)

    # Symbol table. Blank lines (e.g. a trailing newline at the end of the
    # file) are skipped instead of breaking the two-field unpacking below.
    with open(dict_path) as f:
        entries = [line.replace("\n", "").split(" ") for line in f if line.strip()]
    char_to_id = {c: int(i) for c, i in entries}

    self.device = device
    self.char_to_id = char_to_id
    self.idim = idim
    self.model = model
    self.inference_args = inference_args
    self.config = config
    self.vocoder = vocoder
def main():
    """Run decoding process.

    Parses the command line, loads the generator checkpoint and its YAML
    configuration, then converts every feature matrix found in ``--dumpdir``
    or ``--scp`` into a 16-bit PCM wav file named ``<utt_id>_gen.wav`` inside
    ``--outdir``. The average real-time factor is logged at the end.

    Raises:
        ValueError: If both or neither of ``--scp`` / ``--dumpdir`` are given,
            or if the configured feature format is neither hdf5 nor npy.
    """
    parser = argparse.ArgumentParser(
        description="Decode dumped features with trained Parallel WaveGAN Generator.")
    parser.add_argument("--scp", default=None, type=str,
                        help="Kaldi-style feats.scp file.")
    parser.add_argument("--dumpdir", default=None, type=str,
                        help="Directory including feature files.")
    parser.add_argument("--outdir", default=None, type=str, required=True,
                        help="Direcotry to save generated speech.")
    parser.add_argument("--checkpoint", default=None, type=str, required=True,
                        help="Checkpoint file.")
    parser.add_argument("--config", default=None, type=str,
                        help="Yaml format configuration file.")
    parser.add_argument("--verbose", type=int, default=1,
                        help="logging level (higher is more logging)")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    if args.verbose <= 0:
        logging.warning("skip DEBUG/INFO messages")

    # check directory existence (exist_ok avoids the check-then-create race)
    os.makedirs(args.outdir, exist_ok=True)

    # load config; by default it is expected next to the checkpoint
    if args.config is None:
        dirname = os.path.dirname(args.checkpoint)
        args.config = os.path.join(dirname, "config.yml")
    # NOTE(security): yaml.Loader can construct arbitrary Python objects, so
    # the configuration file must come from a trusted source.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    # command line values override / extend the file configuration
    config.update(vars(args))

    # check arguments: exactly one feature source must be given
    if (args.scp is None) == (args.dumpdir is None):
        raise ValueError("Please specify either dumpdir or scp.")

    # get dataset
    if args.scp is None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*-feats.npy"
            mel_load_fn = np.load
        else:
            raise ValueError("support only hdf5 or npy format.")
        dataset = MelDataset(
            args.dumpdir,
            mel_query=mel_query,
            mel_load_fn=mel_load_fn,
            return_filename=True)
        logging.info(f"the number of features to be decoded = {len(dataset)}.")
    else:
        dataset = kaldiio.ReadHelper(f"scp:{args.scp}")
        logging.info(f"the feature loaded from {args.scp}.")

    # setup
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = ParallelWaveGANGenerator(**config["generator_params"])
    # Deserialize on CPU so GPU-saved checkpoints load on CPU-only machines.
    model.load_state_dict(
        torch.load(args.checkpoint, map_location="cpu")["model"]["generator"])
    model.remove_weight_norm()
    model = model.eval().to(device)
    logging.info(f"loaded model parameters from {args.checkpoint}.")

    # start generation
    context = config["generator_params"]["aux_context_window"]
    pad_size = (context, context)
    total_rtf = 0.0
    # Pre-bound so the final report cannot hit an unbound name when the
    # dataset yields nothing.
    num_utts = 0
    with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar:
        for num_utts, (feat_path, c) in enumerate(pbar, 1):
            # generate each utterance: one noise sample per output waveform sample
            z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device)
            # pad the first axis on both sides by repeating the edge rows
            c = np.pad(c, (pad_size, (0, 0)), "edge")
            c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device)
            start = time.time()
            y = model(z, c).view(-1).cpu().numpy()
            rtf = (time.time() - start) / (len(y) / config["sampling_rate"])
            pbar.set_postfix({"RTF": rtf})
            total_rtf += rtf

            # save as PCM 16 bit wav file
            utt_id = os.path.splitext(os.path.basename(feat_path))[0]
            sf.write(os.path.join(config["outdir"], f"{utt_id}_gen.wav"),
                     y, config["sampling_rate"], "PCM_16")

    if num_utts == 0:
        # Previously this path crashed with NameError on the loop variable.
        logging.warning("no features were found; nothing was generated.")
        return

    # report average RTF
    logging.info(f"finished generation of {num_utts} utterances (RTF = {total_rtf / num_utts:.03f}).")
model = model_class(idim, odim, train_args) torch_load(model_path, model) model = model.eval().to(device) inference_args = Namespace(**{ "threshold": 0.5, "minlenratio": 0.0, "maxlenratio": 10.0 }) # define neural vocoder import yaml from parallel_wavegan.models import ParallelWaveGANGenerator with open(vocoder_conf) as f: config = yaml.load(f, Loader=yaml.Loader) vocoder = ParallelWaveGANGenerator(**config["generator_params"]) vocoder.load_state_dict( torch.load(vocoder_path, map_location="cpu")["model"]["generator"]) vocoder.remove_weight_norm() vocoder = vocoder.eval().to(device) # define text frontend from pypinyin import pinyin, Style from pypinyin.style._utils import get_initials, get_finals with open(dict_path) as f: lines = f.readlines() lines = [line.replace("\n", "").split(" ") for line in lines] char_to_id = {c: int(i) for c, i in lines} def frontend(text): """Clean text and then convert to id sequence.""" text = pinyin(text, style=Style.TONE3)
class Synthesizer(object):
    """Sentence-by-sentence speech synthesis around a TTS model and an optional vocoder.

    ``config`` is an attribute-style object. The constructor reads
    ``use_cuda``, ``tts_checkpoint``, ``tts_config``, ``tts_speakers``,
    ``vocoder_checkpoint``/``vocoder_config``, ``wavernn_lib_path``/
    ``wavernn_file``/``wavernn_config`` and ``pwgan_lib_path``/``pwgan_file``/
    ``pwgan_config`` from it; ``tts()`` additionally reads
    ``is_wavernn_batched``.
    """

    def __init__(self, config):
        self.wavernn = None
        self.pwgan = None
        # tts() inspects this attribute, so it must exist even when no
        # vocoder checkpoint is configured (it used to raise AttributeError).
        self.vocoder_model = None
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not availabe on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.vocoder_checkpoint:
            self.load_vocoder(self.config.vocoder_checkpoint,
                              self.config.vocoder_config, self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_file,
                              self.config.wavernn_config,
                              self.config.use_cuda)
        if self.config.pwgan_file:
            self.load_pwgan(self.config.pwgan_lib_path, self.config.pwgan_file,
                            self.config.pwgan_config, self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        """Build the TTS model from ``tts_config`` and load ``tts_checkpoint`` into it.

        May rebind the module-level ``symbols`` / ``phonemes`` tables when the
        config carries a ``characters`` section.
        """
        # pylint: disable=global-statement
        global symbols, phonemes

        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)

        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)

        if 'characters' in self.tts_config.keys():
            symbols, phonemes = make_symbols(**self.tts_config.characters)

        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state on CPU first; moved to GPU below if requested
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])
            print(f" > model reduction factor: {cp['r']}")

    def load_vocoder(self, model_file, model_config, use_cuda):
        """Build the vocoder generator from ``model_config`` and load ``model_file``."""
        # The config used to be loaded a second time further down; one load
        # is enough.
        self.vocoder_config = load_config(model_config)
        self.vocoder_model = setup_generator(self.vocoder_config)
        self.vocoder_model.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0
        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        """Import WaveRNN from ``lib_path`` and load ``model_file`` into it."""
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        # The model is built on CPU; it used to be moved to the GPU
        # unconditionally here, which failed on CPU-only machines even with
        # use_cuda=False.
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )

        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def load_pwgan(self, lib_path, model_file, model_config, use_cuda):
        """Import ParallelWaveGAN (optionally from ``lib_path``) and load ``model_file``.

        Raises:
            RuntimeError: If ``parallel_wavegan`` cannot be imported.
        """
        if lib_path:
            # set this if ParallelWaveGAN is not installed globally
            sys.path.append(lib_path)
        try:
            #pylint: disable=import-outside-toplevel
            from parallel_wavegan.models import ParallelWaveGANGenerator
        except ImportError as e:
            raise RuntimeError(
                f"cannot import parallel-wavegan, either install it or set its directory using the --pwgan_lib_path command line argument: {e}"
            ) from e
        print(" > Loading PWGAN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        # NOTE(security): yaml.Loader can construct arbitrary Python objects,
        # so the config file must come from a trusted source.
        with open(model_config) as f:
            self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
        self.pwgan = ParallelWaveGANGenerator(
            **self.pwgan_config["generator_params"])
        self.pwgan.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"]["generator"])
        self.pwgan.remove_weight_norm()
        if use_cuda:
            self.pwgan.cuda()
        self.pwgan.eval()

    def save_wav(self, wav, path):
        """Write ``wav`` (any sequence of samples) to ``path`` via the audio processor."""
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    @staticmethod
    def split_into_sentences(text):
        """Split ``text`` into a list of stripped, non-empty sentences.

        Periods that the module-level patterns (``prefixes``, ``websites``,
        ``alphabets``, ``acronyms``, ``starters``, ``suffixes``) identify as
        non-terminal are protected with a ``<prd>`` placeholder before the
        text is cut at ``.``, ``?`` and ``!``.
        """
        text = " " + text + "  <stop>"
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        # Move sentence punctuation outside closing quotes so the split below
        # keeps the quote with its sentence.
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = list(filter(
            None, [s.strip() for s in sentences]))  # remove empty sentences
        return sentences

    def tts(self, text, speaker_id=None):
        """Synthesize ``text`` and return an ``io.BytesIO`` holding the saved audio.

        Each sentence is synthesized separately; 10000 zero samples are
        appended after every sentence.

        Raises:
            RuntimeError: If neither ``vocoder_model`` nor ``wavernn`` is loaded.
        """
        start_time = time.time()
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        speaker_id = id_to_torch(speaker_id)
        if speaker_id is not None and self.use_cuda:
            speaker_id = speaker_id.cuda()

        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config)
            inputs = numpy_to_torch(inputs, torch.long, cuda=self.use_cuda)
            inputs = inputs.unsqueeze(0)
            # synthesize voice
            _, postnet_output, _, _ = run_model_torch(
                self.tts_model, inputs, self.tts_config, False, speaker_id,
                None)
            # convert outputs to numpy
            if self.vocoder_model is not None:
                vocoder_input = postnet_output[0].transpose(0, 1).unsqueeze(0)
                wav = self.vocoder_model.inference(vocoder_input)
                # .cpu() is a no-op for tensors that already live on the CPU
                wav = wav.cpu().numpy().flatten()
            elif self.wavernn is not None:
                if self.tts_config.model == "Tacotron":
                    vocoder_input = torch.FloatTensor(
                        self.ap.out_linear_to_mel(
                            linear_spec=postnet_output.T).T).T.unsqueeze(0)
                else:
                    vocoder_input = postnet_output[0].transpose(
                        0, 1).unsqueeze(0)
                if self.use_cuda:
                    # Tensor.cuda() returns a copy; the result used to be
                    # discarded, leaving the input on the CPU.
                    vocoder_input = vocoder_input.cuda()
                wav = self.wavernn.generate(
                    vocoder_input,
                    batched=self.config.is_wavernn_batched,
                    target=11000,
                    overlap=550)
            else:
                # Previously this fell through to an unbound `wav` (NameError).
                raise RuntimeError(
                    "no vocoder is loaded: configure vocoder_checkpoint or wavernn_lib_path")
            # trim silence
            wav = trim_silence(wav, self.ap)

            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        # compute stats
        process_time = time.time() - start_time
        audio_time = len(wavs) / self.tts_config.audio['sample_rate']
        print(f" > Processing time: {process_time}")
        if audio_time > 0:
            # guard: text without any sentence yields no audio at all
            print(f" > Real-time factor: {process_time / audio_time}")
        return out
class TTS():
    """End-to-end text-to-speech: text -> id sequence -> features -> waveform.

    ``conf`` is a mapping with a ``"cuda"`` flag and a ``"model"`` key that
    selects an entry of the module-level ``MODEL_CONF``; that entry supplies
    ``model_path``, ``dict_path`` and ``trans_type`` (``"phn"`` or ``"char"``).
    Vocoder locations come from the module-level ``VOCODER_CONF``.
    """

    def __init__(self, conf):
        if conf["cuda"]:
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.conf = MODEL_CONF[conf["model"]]

        # define E2E-TTS model
        self.idim, odim, train_args = get_model_conf(self.conf["model_path"])
        model_class = dynamic_import(train_args.model_module)
        self.model = model_class(self.idim, odim, train_args)
        torch_load(self.conf["model_path"], self.model)
        self.model = self.model.eval().to(self.device)

        # load neural vocoder
        # NOTE(security): yaml.Loader can construct arbitrary Python objects,
        # so the vocoder config must come from a trusted source.
        with open(VOCODER_CONF["vocoder_conf"]) as f:
            self.vocoder_config = yaml.load(f, Loader=yaml.Loader)
        self.vocoder = ParallelWaveGANGenerator(
            **self.vocoder_config["generator_params"])
        # Deserialize on CPU so GPU-saved checkpoints load on CPU-only machines.
        self.vocoder.load_state_dict(
            torch.load(VOCODER_CONF["vocoder_path"],
                       map_location="cpu")["model"]["generator"])
        self.vocoder.remove_weight_norm()
        self.vocoder = self.vocoder.eval().to(self.device)

        # define character-to-id dictionary; blank lines are skipped instead
        # of breaking the two-field unpacking
        with open(self.conf["dict_path"]) as f:
            entries = [line.replace("\n", "").split(" ")
                       for line in f if line.strip()]
        self.char_to_id = {c: int(i) for c, i in entries}

        # The grapheme-to-phoneme converter is only needed for "phn" models
        # and is created on first use rather than on every request.
        self._g2p = None

    def _symbols_to_ids(self, symbols):
        """Map a sequence of symbols to a 1-D LongTensor of ids on ``self.device``.

        Whitespace symbols map to ``<space>``, symbols missing from the
        dictionary map to ``<unk>``, and ``self.idim - 1`` is appended as the
        end-of-sequence id.
        """
        idseq = []
        for c in symbols:
            if c.isspace():
                idseq.append(self.char_to_id["<space>"])
            elif c not in self.char_to_id:
                idseq.append(self.char_to_id["<unk>"])
            else:
                idseq.append(self.char_to_id[c])
        idseq.append(self.idim - 1)  # <eos>
        return torch.LongTensor(idseq).view(-1).to(self.device)

    def __frontend(self, text):
        """Clean text and then convert to id sequence.

        Raises:
            ValueError: If the model's ``trans_type`` is neither ``"phn"`` nor
                ``"char"`` (this used to surface as a NameError).
        """
        trans_type = self.conf["trans_type"]
        if trans_type not in ("phn", "char"):
            raise ValueError(f"unsupported trans_type: {trans_type!r}")

        text = custom_english_cleaners(text)
        if trans_type == "phn":
            if getattr(self, "_g2p", None) is None:
                self._g2p = G2p()
            # drop the word-separator tokens, keep one phoneme per element
            text = " ".join(s for s in self._g2p(text) if s != " ")
            print(f"Cleaned text: {text}")
            charseq = text.split(" ")
        else:
            print(f"Cleaned text: {text}")
            charseq = list(text)
        return self._symbols_to_ids(charseq)

    def synthesize(self, input_text):
        """Turn text into audio data.

        Args:
            input_text (str): the user input text

        Returns:
            tuple: ``(wav, sampling_rate)`` where ``wav`` is a 1D numpy.array
            holding a single (mono) channel and ``sampling_rate`` is the
            ``"sampling_rate"`` entry of the vocoder configuration.
        """
        with torch.no_grad():
            x = self.__frontend(input_text)
            inference_args = Namespace(
                threshold=0.5, minlenratio=0.0, maxlenratio=10.0)
            c, _, _ = self.model.inference(x, inference_args)
            # one noise sample per output waveform sample
            z = torch.randn(
                1, 1, c.size(0) * self.vocoder_config["hop_size"]).to(self.device)
            # pad the features on both sides by the vocoder's context window
            pad = torch.nn.ReplicationPad1d(
                self.vocoder_config["generator_params"]["aux_context_window"])
            c = pad(c.unsqueeze(0).transpose(2, 1))
            wav = self.vocoder(z, c).view(-1).cpu().numpy()

        return wav, self.vocoder_config["sampling_rate"]
class Synthesizer(object):
    """Sentence-by-sentence speech synthesis with an optional ParallelWaveGAN vocoder.

    ``config`` is an attribute-style object. The constructor reads
    ``use_cuda``, ``tts_checkpoint``, ``tts_config``, ``tts_speakers``,
    ``wavernn_lib_path``/``wavernn_file``/``wavernn_config`` and
    ``pwgan_file``/``pwgan_config`` from it.
    """

    def __init__(self, config):
        self.wavernn = None
        self.pwgan = None
        self.config = config
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not availabe on this machine."
        self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                      self.config.use_cuda)
        if self.config.wavernn_lib_path:
            self.load_wavernn(self.config.wavernn_lib_path,
                              self.config.wavernn_file,
                              self.config.wavernn_config,
                              self.config.use_cuda)
        if self.config.pwgan_file:
            self.load_pwgan(self.config.pwgan_file, self.config.pwgan_config,
                            self.config.use_cuda)

    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        """Build the TTS model from ``tts_config`` and load ``tts_checkpoint`` into it."""
        print(" > Loading TTS model ...")
        print(" | > model config: ", tts_config)
        print(" | > checkpoint file: ", tts_checkpoint)
        self.tts_config = load_config(tts_config)
        self.use_phonemes = self.tts_config.use_phonemes
        self.ap = AudioProcessor(**self.tts_config.audio)
        if self.use_phonemes:
            self.input_size = len(phonemes)
        else:
            self.input_size = len(symbols)
        # TODO: fix this for multi-speaker model - load speakers
        if self.config.tts_speakers is not None:
            self.tts_speakers = load_speaker_mapping(self.config.tts_speakers)
            num_speakers = len(self.tts_speakers)
        else:
            num_speakers = 0
        self.tts_model = setup_model(self.input_size,
                                     num_speakers=num_speakers,
                                     c=self.tts_config)
        # load model state on CPU first; moved to GPU below if requested
        cp = torch.load(tts_checkpoint, map_location=torch.device('cpu'))
        # load the model
        self.tts_model.load_state_dict(cp['model'])
        if use_cuda:
            self.tts_model.cuda()
        self.tts_model.eval()
        self.tts_model.decoder.max_decoder_steps = 3000
        if 'r' in cp:
            self.tts_model.decoder.set_r(cp['r'])

    def load_wavernn(self, lib_path, model_file, model_config, use_cuda):
        """Import WaveRNN from ``lib_path`` and load ``model_file`` into it."""
        # TODO: set a function in wavernn code base for model setup and call it here.
        sys.path.append(lib_path)  # set this if WaveRNN is not installed globally
        #pylint: disable=import-outside-toplevel
        from WaveRNN.models.wavernn import Model
        print(" > Loading WaveRNN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        self.wavernn_config = load_config(model_config)
        # This is the default architecture we use for our models.
        # You might need to update it
        # The model is built on CPU; it used to be moved to the GPU
        # unconditionally here, which failed on CPU-only machines even with
        # use_cuda=False.
        self.wavernn = Model(
            rnn_dims=512,
            fc_dims=512,
            mode=self.wavernn_config.mode,
            mulaw=self.wavernn_config.mulaw,
            pad=self.wavernn_config.pad,
            use_aux_net=self.wavernn_config.use_aux_net,
            use_upsample_net=self.wavernn_config.use_upsample_net,
            upsample_factors=self.wavernn_config.upsample_factors,
            feat_dims=80,
            compute_dims=128,
            res_out_dims=128,
            res_blocks=10,
            hop_length=self.ap.hop_length,
            sample_rate=self.ap.sample_rate,
        )

        # map_location is an argument of torch.load; passing it to
        # load_state_dict (as before) raises TypeError.
        check = torch.load(model_file, map_location="cpu")
        self.wavernn.load_state_dict(check['model'])
        if use_cuda:
            self.wavernn.cuda()
        self.wavernn.eval()

    def load_pwgan(self, model_file, model_config, use_cuda):
        """Load the ParallelWaveGAN generator and its audio processor."""
        #pylint: disable=import-outside-toplevel
        from parallel_wavegan.models import ParallelWaveGANGenerator
        from parallel_wavegan.utils.audio import AudioProcessor as AudioProcessorVocoder
        print(" > Loading PWGAN model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", model_file)
        # NOTE(security): yaml.Loader can construct arbitrary Python objects,
        # so the config file must come from a trusted source.
        with open(model_config) as f:
            self.pwgan_config = yaml.load(f, Loader=yaml.Loader)
        self.pwgan = ParallelWaveGANGenerator(
            **self.pwgan_config["generator_params"])
        self.pwgan.load_state_dict(
            torch.load(model_file, map_location="cpu")["model"]["generator"])
        self.pwgan.remove_weight_norm()
        self.pwgan_ap = AudioProcessorVocoder(**self.pwgan_config["audio"])
        if use_cuda:
            self.pwgan.cuda()
        self.pwgan.eval()

    def save_wav(self, wav, path):
        """Write ``wav`` (any sequence of samples) to ``path`` via the audio processor."""
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def split_into_sentences(self, text):
        """Split ``text`` into a list of stripped, non-empty sentences.

        Periods that the module-level patterns (``prefixes``, ``websites``,
        ``alphabets``, ``acronyms``, ``starters``, ``suffixes``) identify as
        non-terminal are protected with a ``<prd>`` placeholder before the
        text is cut at ``.``, ``?`` and ``!``.
        """
        text = " " + text + "  <stop>"
        text = text.replace("\n", " ")
        text = re.sub(prefixes, "\\1<prd>", text)
        text = re.sub(websites, "<prd>\\1", text)
        if "Ph.D" in text:
            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
        text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
        text = re.sub(
            alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
            "\\1<prd>\\2<prd>\\3<prd>", text)
        text = re.sub(alphabets + "[.]" + alphabets + "[.]",
                      "\\1<prd>\\2<prd>", text)
        text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2",
                      text)
        text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
        # Move sentence punctuation outside closing quotes so the split below
        # keeps the quote with its sentence.
        if "”" in text:
            text = text.replace(".”", "”.")
        if "\"" in text:
            text = text.replace(".\"", "\".")
        if "!" in text:
            text = text.replace("!\"", "\"!")
        if "?" in text:
            text = text.replace("?\"", "\"?")
        text = text.replace(".", ".<stop>")
        text = text.replace("?", "?<stop>")
        text = text.replace("!", "!<stop>")
        text = text.replace("<prd>", ".")
        sentences = text.split("<stop>")
        sentences = sentences[:-1]
        sentences = list(filter(None, [s.strip() for s in sentences]))
        return sentences

    def tts(self, text):
        """Synthesize ``text`` and return an ``io.BytesIO`` holding the saved audio.

        Each sentence is synthesized separately; 10000 zero samples are
        appended after every sentence. When no PWGAN vocoder is loaded the
        waveform is produced by ``inv_spectrogram``.
        """
        wavs = []
        sens = self.split_into_sentences(text)
        print(sens)
        if not sens:
            # no sentence boundary found: treat the whole text as one sentence
            sens = [text + '.']
        for sen in sens:
            # preprocess the given text
            inputs = text_to_seqvec(sen, self.tts_config, self.use_cuda)
            # synthesize voice
            decoder_output, postnet_output, alignments, _ = run_model(
                self.tts_model, inputs, self.tts_config, False, None, None)
            # convert outputs to numpy
            postnet_output, decoder_output, _ = parse_outputs(
                postnet_output, decoder_output, alignments)

            if self.pwgan:
                input_tensor = torch.FloatTensor(postnet_output.T).unsqueeze(0)
                if self.use_cuda:
                    # Tensor.cuda() returns a copy; the result used to be
                    # discarded, leaving the input on the CPU while the
                    # vocoder lives on the GPU.
                    input_tensor = input_tensor.cuda()
                wav = self.pwgan.inference(
                    input_tensor,
                    hop_size=self.pwgan_ap.hop_length).data.cpu().numpy()
            else:
                wav = inv_spectrogram(postnet_output, self.ap, self.tts_config)
            # trim silence
            wav = trim_silence(wav, self.ap)

            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)
        return out
model.load_state_dict(cp['model']) if use_cuda: model.cuda() model.eval() print(cp['step']) print(cp['r']) # set model stepsize if 'r' in cp: model.decoder.set_r(cp['r']) # load PWGAN if use_gl == False: vocoder_model = ParallelWaveGANGenerator( **PWGAN_CONFIG["generator_params"]) vocoder_model.load_state_dict( torch.load(PWGAN_MODEL, map_location="cpu")["model"]["generator"]) vocoder_model.remove_weight_norm() ap_vocoder = AudioProcessorVocoder(**PWGAN_CONFIG['audio']) if use_cuda: vocoder_model.cuda() vocoder_model.eval() data = '' with open('configuration/text/result/final.txt', 'r') as myfile: data = myfile.read() data = data.replace(';', '') sentence = data.split('.') sentencesClean = [] for x in sentence[:-1]:
def __init__(self, domain: Domain = "", identifier: str = None, use_cuda=False,
             sub_topic_domains: Dict[str, str] = None):
    """
    Text To Speech Module that reads out the system utterance.

    Args:
        domain (Domain): Needed for Service, no meaning here
        identifier (string): Needed for Service
        use_cuda (boolean): Whether or not to perform computations on GPU. Highly recommended if available
        sub_topic_domains: see `services.service.Service` constructor for more details.
            ``None`` (the default) is replaced by a fresh empty dict.
    """
    # A literal `{}` default would be one dict object shared by every
    # instance created without this argument; build a new one per call.
    if sub_topic_domains is None:
        sub_topic_domains = {}
    Service.__init__(self, domain=domain, identifier=identifier,
                     sub_topic_domains=sub_topic_domains)
    self.models_directory = os.path.join(get_root_dir(), "resources", "models", "speech")

    # The following lines can be changed to incorporate different models.
    # This is the only thing that needs to be changed for that, everything else should be dynamic.
    self.transcription_type = "phn"
    tts_model_name = "phn_train_no_dev_pytorch_train_fastspeech.v4"
    vocoder_name = "ljspeech.parallel_wavegan.v1"
    self.dict_path = os.path.join(self.models_directory, tts_model_name,
                                  "data", "lang_1phn", "train_no_dev_units.txt")
    self.model_path = os.path.join(self.models_directory, tts_model_name,
                                   "exp", tts_model_name,
                                   "results", "model.last1.avg.best")
    self.vocoder_path = os.path.join(self.models_directory, vocoder_name,
                                     "checkpoint-400000steps.pkl")
    self.vocoder_conf = os.path.join(self.models_directory, vocoder_name, "config.yml")

    # define device to run the synthesis on
    if use_cuda:
        self.device = torch.device("cuda")
    else:
        self.device = torch.device("cpu")

    # define end to end TTS model
    self.input_dimensions, self.output_dimensions, self.train_args = get_model_conf(self.model_path)
    model_class = dynamic_import.dynamic_import(self.train_args.model_module)
    model = model_class(self.input_dimensions, self.output_dimensions, self.train_args)
    torch_load(self.model_path, model)
    self.model = model.eval().to(self.device)
    self.inference_args = Namespace(**{"threshold": 0.5, "minlenratio": 0.0, "maxlenratio": 10.0})

    # define neural vocoder
    # NOTE(security): yaml.Loader can construct arbitrary Python objects, so
    # the vocoder config must come from a trusted source.
    with open(self.vocoder_conf) as vocoder_config_file:
        self.config = yaml.load(vocoder_config_file, Loader=yaml.Loader)
    vocoder = ParallelWaveGANGenerator(**self.config["generator_params"])
    # Deserialize on CPU so GPU-saved checkpoints load on CPU-only machines.
    vocoder.load_state_dict(torch.load(self.vocoder_path, map_location="cpu")["model"]["generator"])
    vocoder.remove_weight_norm()
    self.vocoder = vocoder.eval().to(self.device)

    # symbol dictionary: one "<symbol> <id>" pair per line; blank lines are
    # skipped instead of breaking the two-field unpacking
    with open(self.dict_path) as dictionary_file:
        entries = [line.replace("\n", "").split(" ")
                   for line in dictionary_file if line.strip()]
    self.char_to_id = {c: int(i) for c, i in entries}
    self.g2p = G2p()

    # download the pretrained Punkt tokenizer from NLTK. This is done only
    # the first time the code is executed on a machine, if it has been done
    # before, this line will be skipped and output a warning. We will probably
    # redirect warnings into a file rather than std_err in the future, since
    # there's also a lot of pytorch warnings going on etc.
    nltk.download('punkt', quiet=True)