def __init__(self, model_name: str = 'multiband_mel_gan_vctk', device='cpu'):
    """Set up the mel-spectrogram encoder and a pretrained vocoder.

    Args:
        model_name: Key of ``PARAMS['models']`` naming the pretrained
            vocoder to download and load.
        device: Device the vocoder and the statistic tensors are moved to.

    Raises:
        AssertionError: If ``model_name`` is not a key of
            ``PARAMS['models']``.
    """
    super().__init__()
    assert model_name in PARAMS['models'], \
        'Model name {} is not valid! choose in {}'.format(
            model_name, str(PARAMS['models'].keys()))
    pretrained_tag = PARAMS['models'][model_name]
    self.device = device
    self.encoder = MelSpectrogram(**PARAMS['audio'][model_name])

    # Fetch the checkpoint, load it and put the network in inference mode.
    checkpoint_path = download_pretrained_model(pretrained_tag)
    vocoder = load_model(checkpoint_path)
    self.vocoder = vocoder.to(device).eval()
    self.vocoder.remove_weight_norm()

    def _as_stat_tensor(values):
        # Adds one leading and one trailing singleton dimension.
        tensor = torch.FloatTensor(values)
        return tensor.unsqueeze(0).unsqueeze(-1).to(device)

    # Feature statistics are selected by whether the name mentions "vctk".
    if 'vctk' in model_name:
        stats_key = 'vctk'
    else:
        stats_key = 'lj'
    feature_stats = MULTI_BAND_MEL_GAN_PARAMS[stats_key]
    self.mean = _as_stat_tensor(feature_stats['mean'])
    self.scale = _as_stat_tensor(feature_stats['scale'])

    # Report the parameter count of the loaded vocoder.
    total_params = self.num_params(self.vocoder)
    print('Total Model {} params.'.format(total_params))
def __init__(self, worker_id=1, audio_dest='audio/', audio_format='.wav'):
    """Create a CPU-only Tacotron 2 + Parallel WaveGAN synthesis worker.

    Args:
        worker_id: Identifier stored on the worker as ``self.id``.
        audio_dest: Value stored as ``self.audio_d``.
        audio_format: Value stored as ``self.audio_f``.
    """
    self.id = worker_id

    # Model selection
    self.fs = 22050
    self.lang = "English"
    self.tag = "kan-bayashi/ljspeech_tacotron2"
    self.vocoder_tag = "ljspeech_parallel_wavegan.v1"

    # Model setup
    self.d = ModelDownloader()
    unpacked_model = self.d.download_and_unpack(self.tag)
    decoding_options = dict(
        device="cpu",
        # Only for Tacotron 2
        threshold=0.5,
        minlenratio=0.0,
        maxlenratio=10.0,
        use_att_constraint=False,
        backward_window=1,
        forward_window=3,
    )
    self.text2speech = Text2Speech(**unpacked_model, **decoding_options)

    vocoder_checkpoint = download_pretrained_model(self.vocoder_tag)
    self.vocoder = load_model(vocoder_checkpoint).to("cpu").eval()
    # Waveforms come from the neural vocoder, so the built-in converter
    # of the text2speech object is switched off.
    self.text2speech.spc2wav = None
    self.vocoder.remove_weight_norm()

    self.audio_d = audio_dest
    self.audio_f = audio_format
def __init__(
    self,
    model_file: Union[Path, str],
    config_file: Optional[Union[Path, str]] = None,
):
    """Initialize ParallelWaveGANPretrainedVocoder module.

    Args:
        model_file: Path of the vocoder checkpoint.
        config_file: Path of the YAML config. When omitted, ``config.yml``
            in the directory of ``model_file`` is used.

    Raises:
        ImportError: If ``parallel_wavegan`` cannot be imported.
    """
    super().__init__()
    try:
        from parallel_wavegan.utils import load_model
    except ImportError:
        logging.error(
            "`parallel_wavegan` is not installed. "
            "Please install via `pip install -U parallel_wavegan`.")
        raise

    # Fall back to the config file that sits next to the checkpoint.
    if config_file is None:
        checkpoint_dir = os.path.dirname(str(model_file))
        config_file = os.path.join(checkpoint_dir, "config.yml")
    # NOTE: yaml.Loader can construct arbitrary Python objects; only load
    # config files from trusted sources.
    with open(config_file) as stream:
        config = yaml.load(stream, Loader=yaml.Loader)

    self.fs = config["sampling_rate"]
    self.vocoder = load_model(model_file, config)
    if hasattr(self.vocoder, "remove_weight_norm"):
        self.vocoder.remove_weight_norm()
    # Normalization is enabled exactly when the vocoder exposes `mean`.
    self.normalize_before = hasattr(self.vocoder, "mean")
def __init__(self, torch_device=None):
    """Load the Tacotron 2 model and the vocoder onto one device.

    Args:
        torch_device: Device for both models. When ``None``, ``'cuda'`` is
            used if available, otherwise ``'cpu'``.
    """
    if torch_device is None:
        torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

    self.tacotron_file_config = path.join(TTS_WORK_DIR, TTS_CONFIG_FILE)
    self.tacotron_file_checkpoints = path.join(TTS_WORK_DIR, TTS_MODEL_WEIGHTS)
    # NOTE(review): the vocoder config path is built from the TTS
    # directory and file name, and is not read in this method - confirm
    # whether vocoder-specific constants were intended.
    self.vocoder_file_config = path.join(TTS_WORK_DIR, TTS_CONFIG_FILE)
    self.vocoder_file_checkpoints = path.join(VOCODER_WORK_DIR, VOCODER_MODEL_WEIGHTS)

    # Tacotron2 Loading
    decoding_options = dict(
        device=torch_device,
        threshold=0.5,
        minlenratio=0.0,
        maxlenratio=10.0,
        use_att_constraint=False,
        backward_window=1,
        forward_window=3,
    )
    self.tacotron_instance = Text2Speech(
        self.tacotron_file_config,
        self.tacotron_file_checkpoints,
        **decoding_options,
    )
    self.tacotron_instance.spc2wav = None

    # Vocoder Loading
    vocoder = load_model(self.vocoder_file_checkpoints)
    self.vocoder = vocoder.to(torch_device).eval()
    self.vocoder.remove_weight_norm()
def setup_model(self):
    """Load nltk data, the ESPnet text2speech model and the vocoder.

    Progress is reported through ``self.output_status``. Any exception is
    reported through ``self.output_err`` and replaced by
    ``HandledException``.

    Raises:
        HandledException: If any step of the setup fails.
    """
    try:
        self.model_reload_needed = False

        # --- nltk tokenizer data ---
        self.output_status("Loading nltk...")
        import nltk
        nltk_dir = MODEL_DIR + '/nltk_models'
        nltk.data.path.append(nltk_dir)
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            # Download the tokenizer data only when it cannot be found.
            nltk.download('punkt', download_dir=nltk_dir)

        # --- heavy imports are deferred until setup time ---
        self.output_status("Loading torch...", end=" ")
        import torch
        from espnet_model_zoo.downloader import ModelDownloader
        from espnet2.bin.tts_inference import Text2Speech
        from parallel_wavegan.utils import download_pretrained_model
        from parallel_wavegan.utils import load_model

        if torch.cuda.is_available():
            self.mlDevice = "cuda"
        else:
            self.mlDevice = "cpu"
        self.output_status(f"Running on {self.mlDevice}")

        # --- text-to-feature model ---
        self.output_status("Loading espnet...")
        downloader = ModelDownloader(MODEL_DIR + "/espnet_models")
        unpacked_model = downloader.download_and_unpack(self.tag)
        decoding_options = dict(
            device=self.mlDevice,
            # Only for Tacotron 2
            threshold=0.5,
            minlenratio=0.0,
            maxlenratio=10.0,
            use_att_constraint=False,
            backward_window=1,
            forward_window=3,
            # Only for FastSpeech & FastSpeech2
            speed_control_alpha=1.0,
        )
        self.text2speech = Text2Speech(**unpacked_model, **decoding_options)
        self.text2speech.spc2wav = None  # Disable griffin-lim

        # --- neural vocoder ---
        # NOTE: The download sometimes fails with "Permission denied",
        # a Google Drive limitation. Retry after several hours.
        self.output_status("Loading vocoder models...")
        vocoder_checkpoint = download_pretrained_model(
            self.vocoder_tag, download_dir=MODEL_DIR + "/vocoder_models")
        vocoder = load_model(vocoder_checkpoint)
        self.vocoder = vocoder.to(self.mlDevice).eval()
        self.vocoder.remove_weight_norm()

        self.output_status("Model setup completed.")
    except Exception as e:
        self.output_err("Model error", e)
        raise HandledException()
def load_new_model(self, checkpoint):
    """Load a vocoder checkpoint and return it ready for CPU inference.

    Args:
        checkpoint: Checkpoint passed to ``load_model`` together with
            ``self.config``.

    Returns:
        The loaded model with weight normalization removed, switched to
        eval mode and moved to the CPU.

    Side effects:
        Sets ``self.device`` to the CPU device.
    """
    # The model is always placed on the CPU. An earlier CUDA/CPU selection
    # was overwritten unconditionally and therefore had no effect; it has
    # been dropped.
    device = torch.device('cpu')
    self.device = device

    model = load_model(checkpoint, self.config)
    model.remove_weight_norm()
    model = model.eval().to(device)
    return model
def perform_tts(input_text):
    """Synthesize ``input_text`` and write the result to ``static/test.wav``.

    The acoustic model, the vocoder and the symbol dictionary are loaded
    from the module-level ``model_path``, ``vocoder_path`` and
    ``dict_path`` on every call. The real-time factor and the generated
    tensor are printed.

    Args:
        input_text: Text handed to ``frontend`` for conversion.
    """
    # define acoustic model
    idim, odim, train_args = get_model_conf(model_path)
    acoustic_cls = dynamic_import(train_args.model_module)
    model = acoustic_cls(idim, odim, train_args)
    torch_load(model_path, model)
    model = model.eval().to(device)
    inference_args = Namespace(
        threshold=0.5,
        minlenratio=0.0,
        maxlenratio=10.0,
        # Only for Tacotron 2
        use_attention_constraint=True,
        backward_window=1,
        forward_window=3,
        # Only for fastspeech (lower than 1.0 is faster speech,
        # higher than 1.0 is slower speech)
        fastspeech_alpha=1.0,
    )

    # define neural vocoder
    fs = 22050
    vocoder = load_model(vocoder_path)
    vocoder.remove_weight_norm()
    vocoder = vocoder.eval().to(device)

    # define text frontend: each dictionary line is "<symbol> <id>"
    with open(dict_path) as dict_file:
        raw_lines = dict_file.readlines()
    char_to_id = {}
    for raw_line in raw_lines:
        symbol, index = raw_line.replace("\n", "").split(" ")
        char_to_id[symbol] = int(index)
    g2p = G2p()

    print('input : ', input_text)
    with torch.no_grad():
        started_at = time.time()
        token_ids = frontend(input_text, g2p, char_to_id, idim)
        features, _, _ = model.inference(token_ids, inference_args)
        y = vocoder.inference(features)
        elapsed = time.time() - started_at
    # Real-time factor: processing time divided by (len(y) / fs).
    rtf = elapsed / (len(y) / fs)
    print(f"RTF = {rtf:5f}")
    print(y)
    write("static/test.wav", fs, y.view(-1).cpu().numpy())
def __init__(self, checkpoint, config):
    """Load the YAML config and the model stored in ``checkpoint``.

    Args:
        checkpoint: Path of the model checkpoint file.
        config: Path of the YAML config file, or ``None`` to use
            ``config.yml`` located next to ``checkpoint``.
    """
    # load config
    config_path = config
    if config_path is None:
        config_path = os.path.join(os.path.dirname(checkpoint), "config.yml")
    with open(config_path) as stream:
        settings = yaml.load(stream, Loader=yaml.Loader)
    self.config = settings

    # setup model: CUDA when available, otherwise CPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.device = device

    model = load_model(checkpoint, settings)
    logging.info(f"Loaded model parameters from {checkpoint}.")
    model.remove_weight_norm()
    self.model = model.eval().to(device)
def test_parallel_wavegan_compatibility():
    """Check HiFiGANGenerator against the parallel_wavegan reference model.

    The same pretrained generator weights are loaded into both
    implementations, and both must produce identical output for the same
    random input.
    """
    from parallel_wavegan.utils import download_pretrained_model, load_model

    checkpoint = download_pretrained_model("ljspeech_hifigan.v1")
    generator_weights = torch.load(
        checkpoint, map_location="cpu")["model"]["generator"]

    reference = load_model(checkpoint)
    candidate = HiFiGANGenerator()
    candidate.load_state_dict(generator_weights)
    reference.eval()
    candidate.eval()

    with torch.no_grad():
        features = torch.randn(5, 80)
        expected = reference.inference(features)
        actual = candidate.inference(features)

    np.testing.assert_array_equal(
        expected.cpu().numpy(),
        actual.cpu().numpy(),
    )
def __init__(self, checkpoint, config=None):
    """Load the model stored in ``checkpoint`` onto the best device.

    Parameters
    ----------
    checkpoint: str, the path of model checkpoint file.
    config: str, the path of model configuration file. When omitted,
        ``config.yml`` in the directory of ``checkpoint`` is used.
    """
    # load config
    if config is None:
        config = os.path.join(os.path.dirname(checkpoint), "config.yml")
    with open(config) as config_file:
        self._config = yaml.load(config_file, Loader=yaml.Loader)

    # setup model
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    self._device = torch.device(device_name)

    loaded = load_model(checkpoint, self._config)
    self._model = loaded
    self._model.remove_weight_norm()
    self._model = self._model.eval().to(self._device)
# scaler.mean_ = np.load(args.stats)[0] # scaler.scale_ = np.load(args.stats)[1] else: raise ValueError("support only hdf5 (and normally npy - but not now) format.") # from version 0.23.0, this information is needed scaler.n_features_in_ = scaler.mean_.shape[0] mel = scaler.transform(mel) # plt.imshow(mel) # plt.show() #==============================================Put it through network================================================== # converter.output_to_wav([[mel]]) print(f"Now loading in pretrained melGAN model") download_pretrained_model("vctk_multi_band_melgan.v2", "melgan") model = load_model("melgan/vctk_multi_band_melgan.v2/checkpoint-1000000steps.pkl") model.remove_weight_norm() model = model.eval().to(device) result = model.inference(torch.tensor(mel, dtype=torch.float).to(device)).view(-1) # from playsound import playsound # import pyaudio # p = pyaudio.PyAudio() # stream = p.open(format=p.get_format_from_width(wf.getsampwidth()), # channels=wf.getnchannels(), # rate=wf.getframerate(), # output=True) x, sr = sf.read(utility.get_full_path(".\\input\\p225\\p225_001.wav"))
from espnet_model_zoo.downloader import ModelDownloader import soundfile as sf import librosa import numpy as np import os import kaldiio d = ModelDownloader() # tag = 'kan-bayashi/libritts_gst+xvector_transformer' text2speech = Text2Speech( "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/config.yaml", "/nolan/test/espnet/egs2/vctk/tts1/exp/tts_train_gst_fastspeech2_raw_phn_tacotron_g2p_en_no_space/train.loss.best.pth", device="cuda") # text2speech.spc2wav = None vocoder = load_model( download_pretrained_model("libritts_parallel_wavegan.v1.long")).to( "cuda").eval() vocoder.remove_weight_norm() spembs = None if text2speech.use_speech: speech, fs = sf.read("/nolan/VCTK-Corpus/wav48/p226/p226_001.wav") # speech, _ = librosa.load("/nolan/VCTK-Corpus/wav48/p225/p225_001.wav", text2speech.fs) speech = torch.from_numpy(speech).float().cuda() # speech = torch.randn(50000,) texts = [ "Mostly I would recommend giving a quick look to the figures beyond the introduction.", ] for i, text in enumerate(texts): with torch.no_grad():
def main():
    """Run decoding process.

    Parses the command line, loads the YAML config (by default
    ``config.yml`` next to the checkpoint), builds the feature dataset
    from either ``--dumpdir`` or ``--feats-scp``, and writes one
    ``<utt_id>_gen.wav`` (PCM 16 bit) per utterance into ``--outdir``.

    Raises:
        ValueError: If both or neither of ``--dumpdir`` / ``--feats-scp``
            are given, or if the configured feature format is neither
            ``hdf5`` nor ``npy``.
    """
    parser = argparse.ArgumentParser(
        description=
        "Decode dumped features with trained Parallel WaveGAN Generator "
        "(See detail in parallel_wavegan/bin/decode.py).")
    parser.add_argument("--feats-scp",
                        "--scp",
                        default=None,
                        type=str,
                        help="kaldi-style feats.scp file. "
                        "you need to specify either feats-scp or dumpdir.")
    parser.add_argument("--dumpdir",
                        default=None,
                        type=str,
                        help="directory including feature files. "
                        "you need to specify either feats-scp or dumpdir.")
    parser.add_argument("--outdir",
                        type=str,
                        required=True,
                        help="directory to save generated speech.")
    parser.add_argument("--checkpoint",
                        type=str,
                        required=True,
                        help="checkpoint file to be loaded.")
    parser.add_argument(
        "--config",
        default=None,
        type=str,
        help="yaml format configuration file. if not explicitly provided, "
        "it will be searched in the checkpoint directory. (default=None)")
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    args = parser.parse_args()

    # set logger
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 1:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    elif args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # Create the output directory; exist_ok avoids the race between a
    # separate existence check and the creation.
    os.makedirs(args.outdir, exist_ok=True)

    # load config
    if args.config is None:
        dirname = os.path.dirname(args.checkpoint)
        args.config = os.path.join(dirname, "config.yml")
    # NOTE: yaml.Loader can construct arbitrary Python objects; only use
    # config files from trusted sources.
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    # Command-line values override / extend the values read from the file.
    config.update(vars(args))

    # check arguments: exactly one feature source must be given
    if (args.feats_scp is not None and args.dumpdir is not None) or \
            (args.feats_scp is None and args.dumpdir is None):
        raise ValueError("Please specify either --dumpdir or --feats-scp.")

    # get dataset
    if args.dumpdir is not None:
        if config["format"] == "hdf5":
            mel_query = "*.h5"
            mel_load_fn = lambda x: read_hdf5(x, "feats")  # NOQA
        elif config["format"] == "npy":
            mel_query = "*-feats.npy"
            mel_load_fn = np.load
        else:
            raise ValueError("Support only hdf5 or npy format.")
        dataset = MelDataset(
            args.dumpdir,
            mel_query=mel_query,
            mel_load_fn=mel_load_fn,
            return_utt_id=True,
        )
    else:
        dataset = MelSCPDataset(
            feats_scp=args.feats_scp,
            return_utt_id=True,
        )
    logging.info(f"The number of features to be decoded = {len(dataset)}.")

    # setup model
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = load_model(args.checkpoint, config)
    logging.info(f"Loaded model parameters from {args.checkpoint}.")
    model.remove_weight_norm()
    model = model.eval().to(device)

    # start generation
    total_rtf = 0.0
    # Stays 0 when the dataset is empty, so the summary below never reads
    # an unbound loop variable.
    idx = 0
    with torch.no_grad(), tqdm(dataset, desc="[decode]") as pbar:
        for idx, (utt_id, c) in enumerate(pbar, 1):
            # generate
            c = torch.tensor(c, dtype=torch.float).to(device)
            start = time.time()
            y = model.inference(c).view(-1)
            # Real-time factor: processing time / duration of the output.
            rtf = (time.time() - start) / (len(y) / config["sampling_rate"])
            pbar.set_postfix({"RTF": rtf})
            total_rtf += rtf

            # save as PCM 16 bit wav file
            sf.write(os.path.join(config["outdir"], f"{utt_id}_gen.wav"),
                     y.cpu().numpy(), config["sampling_rate"], "PCM_16")

    if idx == 0:
        # Nothing was decoded; there is no average RTF to report.
        logging.warning("No features were found; nothing was decoded.")
        return

    # report average RTF
    logging.info(
        f"Finished generation of {idx} utterances (RTF = {total_rtf / idx:.03f})."
    )
import os
import torch
import soundfile as sf
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech
from parallel_wavegan.utils import download_pretrained_model
from parallel_wavegan.utils import load_model

# Load the text2speech model and the neural vocoder once, on the GPU.
d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(tag),
    device="cuda",
    speed_control_alpha=1.0,
)
text2speech.spc2wav = None  # Disable griffin-lim
vocoder = load_model(download_pretrained_model(vocoder_tag)).to("cuda").eval()
vocoder.remove_weight_norm()

# Serve forever: each accepted connection delivers one text (at most 1024
# bytes, UTF-8 encoded) which is then synthesized.
while True:
    conn, addr = s.accept()
    try:
        data = conn.recv(1024)
    finally:
        # Close the connection even if receiving fails, so the socket is
        # never leaked.
        conn.close()
    encoding = 'utf-8'
    data = str(data, encoding)

    # synthesis
    with torch.no_grad():
        start = time.time()
        wav, c, *_ = text2speech(data)
        wav = vocoder.inference(c)
    # Real-time factor: processing time / duration of the generated audio.
    rtf = (time.time() - start) / (len(wav) / fs)
    print(f"RTF = {rtf:5f}")
def get_vocoder(self):
    """Return the vocoder loaded from ``self.vocoder_model_path``.

    The model is moved to ``self.device``, switched to eval mode and has
    its weight normalization removed.
    """
    model = load_model(self.vocoder_model_path)
    model = model.to(self.device).eval()
    model.remove_weight_norm()
    return model
def __init__(self, model_dir, device="cpu"):
    """Load every model and feature scaler stored in ``model_dir``.

    The time-lag, duration and acoustic models are always loaded. The
    post-filter and the vocoder are loaded only when their
    ``*_model.yaml`` file exists; otherwise the corresponding attribute is
    ``None``. The vocoder is also ``None`` when ``parallel_wavegan`` is
    not available.

    Args:
        model_dir: Directory (``str`` or path-like) containing
            ``config.yaml``, ``qst.hed`` and the model / scaler files.
        device: Device the models are moved to and onto which the
            checkpoints are mapped.

    Raises:
        FileNotFoundError: If ``model_dir`` does not contain
            ``config.yaml``.
    """
    self.device = device
    model_dir = Path(model_dir)

    # search for config.yaml
    # A bare ``assert path`` is always true for a Path object, so the
    # existence of the file is checked explicitly.
    config_path = model_dir / "config.yaml"
    if not config_path.exists():
        raise FileNotFoundError(f"config.yaml is not found in {model_dir}")
    self.config = OmegaConf.load(config_path)

    # qst
    self.binary_dict, self.numeric_dict = hts.load_question_set(
        model_dir / "qst.hed")

    # Pitch feature positions are placed right after the binary features.
    self.pitch_idx = len(self.binary_dict) + 1
    self.pitch_indices = np.arange(len(self.binary_dict),
                                   len(self.binary_dict) + 3)

    # Time-lag model
    self.timelag_config = OmegaConf.load(model_dir / "timelag_model.yaml")
    self.timelag_model = instantiate(self.timelag_config.netG).to(device)
    checkpoint = torch.load(
        model_dir / "timelag_model.pth",
        map_location=device,
    )
    self.timelag_model.load_state_dict(checkpoint["state_dict"])
    self.timelag_in_scaler = MinMaxScaler(
        np.load(model_dir / "in_timelag_scaler_min.npy"),
        np.load(model_dir / "in_timelag_scaler_scale.npy"),
    )
    self.timelag_out_scaler = StandardScaler(
        np.load(model_dir / "out_timelag_scaler_mean.npy"),
        np.load(model_dir / "out_timelag_scaler_var.npy"),
        np.load(model_dir / "out_timelag_scaler_scale.npy"),
    )
    self.timelag_model.eval()

    # Duration model
    self.duration_config = OmegaConf.load(model_dir / "duration_model.yaml")
    self.duration_model = instantiate(self.duration_config.netG).to(device)
    checkpoint = torch.load(
        model_dir / "duration_model.pth",
        map_location=device,
    )
    self.duration_model.load_state_dict(checkpoint["state_dict"])
    self.duration_in_scaler = MinMaxScaler(
        np.load(model_dir / "in_duration_scaler_min.npy"),
        np.load(model_dir / "in_duration_scaler_scale.npy"),
    )
    self.duration_out_scaler = StandardScaler(
        np.load(model_dir / "out_duration_scaler_mean.npy"),
        np.load(model_dir / "out_duration_scaler_var.npy"),
        np.load(model_dir / "out_duration_scaler_scale.npy"),
    )
    self.duration_model.eval()

    # Acoustic model
    self.acoustic_config = OmegaConf.load(model_dir / "acoustic_model.yaml")
    self.acoustic_model = instantiate(self.acoustic_config.netG).to(device)
    checkpoint = torch.load(
        model_dir / "acoustic_model.pth",
        map_location=device,
    )
    self.acoustic_model.load_state_dict(checkpoint["state_dict"])
    self.acoustic_in_scaler = MinMaxScaler(
        np.load(model_dir / "in_acoustic_scaler_min.npy"),
        np.load(model_dir / "in_acoustic_scaler_scale.npy"),
    )
    self.acoustic_out_scaler = StandardScaler(
        np.load(model_dir / "out_acoustic_scaler_mean.npy"),
        np.load(model_dir / "out_acoustic_scaler_var.npy"),
        np.load(model_dir / "out_acoustic_scaler_scale.npy"),
    )
    self.acoustic_model.eval()

    # Post-filter (optional)
    if (model_dir / "postfilter_model.yaml").exists():
        self.postfilter_config = OmegaConf.load(model_dir /
                                                "postfilter_model.yaml")
        self.postfilter_model = instantiate(
            self.postfilter_config.netG).to(device)
        checkpoint = torch.load(
            model_dir / "postfilter_model.pth",
            map_location=device,
        )
        self.postfilter_model.load_state_dict(checkpoint["state_dict"])
        self.postfilter_model.eval()
        self.postfilter_out_scaler = StandardScaler(
            np.load(model_dir / "out_postfilter_scaler_mean.npy"),
            np.load(model_dir / "out_postfilter_scaler_var.npy"),
            np.load(model_dir / "out_postfilter_scaler_scale.npy"),
        )
    else:
        self.postfilter_model = None

    # Vocoder model (optional, requires parallel_wavegan)
    if (model_dir / "vocoder_model.yaml").exists():
        if not _pwg_available:
            warn(
                "parallel_wavegan is not installed. Vocoder model is disabled."
            )
            self.vocoder = None
        else:
            self.vocoder_config = OmegaConf.load(model_dir /
                                                 "vocoder_model.yaml")
            self.vocoder = load_model(
                model_dir / "vocoder_model.pth",
                config=self.vocoder_config).to(device)
            self.vocoder.eval()
            self.vocoder.remove_weight_norm()
            self.vocoder_in_scaler = StandardScaler(
                np.load(model_dir / "in_vocoder_scaler_mean.npy"),
                np.load(model_dir / "in_vocoder_scaler_var.npy"),
                np.load(model_dir / "in_vocoder_scaler_scale.npy"),
            )
    else:
        self.vocoder = None
fs = 22050 def get_args(): parser = argparse.ArgumentParser( description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument("--text", help="your text", required=True) print(' '.join(sys.argv)) args = parser.parse_args() return args ## specify the path to vocoder's checkpoint vocoder_checkpoint = "exp/vocoder/checkpoint-400000steps.pkl" vocoder = load_model(vocoder_checkpoint).to("cuda").eval() vocoder.remove_weight_norm() ## specify path to the main model(transformer/tacotron2/fastspeech) and its config file config_file = "exp/tts_train_raw_char/config.yaml" model_path = "exp/tts_train_raw_char/train.loss.ave_5best.pth" text2speech = Text2Speech( config_file, model_path, device="cuda", # Only for Tacotron 2 threshold=0.5, minlenratio=0.0, maxlenratio=10.0, use_att_constraint=True,
def validate_one_epoch(
    cls,
    model: torch.nn.Module,
    iterator: Iterable[Dict[str, torch.Tensor]],
    reporter: SubReporter,
    options: TrainerOptions,
    distributed_option: DistributedOption,
) -> None:
    """Run one validation pass over ``iterator`` and report the stats.

    When ``options.vocoder_checkpoint`` is non-empty, a vocoder is loaded
    first and handed to ``cls.log_figure`` together with the denormalized
    predicted / ground-truth spectrograms of each batch whose model output
    is not a dict.

    Args:
        model: Model to validate; it is switched to eval mode and called
            with ``flag_IsValid=True``.
        iterator: Yields ``(index, batch)`` pairs where ``batch`` is a dict.
        reporter: Receives ``stats`` and ``weight`` of every batch.
        options: Trainer options. ``vocoder_config`` is overwritten in
            place when it is empty and a vocoder checkpoint is given.
        distributed_option: Only its ``distributed`` flag is read here.
    """
    assert check_argument_types()
    ngpu = options.ngpu
    no_forward_run = options.no_forward_run
    distributed = distributed_option.distributed

    model.eval()

    #############################
    ###  setup vocoder model  ###
    #############################
    print(f"options: {options}")
    if options.vocoder_checkpoint != "":
        # load config; default to config.yml next to the checkpoint
        if options.vocoder_config == "":
            dirname = os.path.dirname(options.vocoder_checkpoint)
            print(f"dirname: {dirname}")
            options.vocoder_config = os.path.join(dirname, "config.yml")
        logging.info(f"options.vocoder_config: {options.vocoder_config}")
        with open(options.vocoder_config) as f:
            config = yaml.load(f, Loader=yaml.Loader)
        # Trainer options override / extend the values read from the file.
        config.update(vars(options))
        model_vocoder = load_model(options.vocoder_checkpoint, config)
        logging.info(
            f"Loaded model parameters from {options.vocoder_checkpoint}.")
        # if options.normalize_before:
        # if True:
        #     assert hasattr(model_vocoder, "mean"), "Feature stats are not registered."
        #     assert hasattr(model_vocoder, "scale"), "Feature stats are not registered."
        model_vocoder.remove_weight_norm()
        model_vocoder = model_vocoder.eval().to(
            "cuda" if ngpu > 0 else "cpu")
    else:
        # No checkpoint given: log_figure receives None as the vocoder.
        model_vocoder = None

    # [For distributed] Because iteration counts are not always equals between
    # processes, send stop-flag to the other processes if iterator is finished
    iterator_stop = torch.tensor(0).to("cuda" if ngpu > 0 else "cpu")
    for (index, batch) in iterator:
        assert isinstance(batch, dict), type(batch)
        if distributed:
            # Stop as soon as any process has raised its stop flag.
            torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
            if iterator_stop > 0:
                break

        batch = to_device(batch, "cuda" if ngpu > 0 else "cpu")
        if no_forward_run:
            continue

        # Drop the augmentation entries before calling the model.
        del_keys = [
            "pitch_aug", "pitch_aug_lengths", "time_aug", "time_aug_lengths"
        ]
        for key in del_keys:
            if key in batch.keys():
                del batch[key]

        retval = model(**batch, flag_IsValid=True)
        if isinstance(retval, dict):
            stats = retval["stats"]
            weight = retval["weight"]
        else:
            # _, stats, weight = retval
            _, stats, weight, spec_predicted, spec_gt, length = retval

            # monitor spec during validation stage
            # [batch size, max length, feat dim]
            # Clones are passed to inverse(), so the returned tensors
            # themselves are not handed to the normalizer.
            spec_predicted_denorm, _ = model.normalize.inverse(
                spec_predicted.clone())
            spec_gt_denorm, _ = model.normalize.inverse(spec_gt.clone())

            cls.log_figure(
                model,
                model_vocoder,
                index[0],
                spec_predicted_denorm,
                spec_gt_denorm,
                length,
                Path(options.output_dir) / "valid",
            )
        if ngpu > 1 or distributed:
            # Apply weighted averaging for stats.
            # if distributed, this method can also apply all_reduce()
            stats, weight = recursive_average(stats, weight, distributed)

        reporter.register(stats, weight)
        reporter.next()

    else:
        # Reached only when the loop ended without `break`: this process
        # exhausted its iterator, so it raises the stop flag for the others.
        if distributed:
            iterator_stop.fill_(1)
            torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)
device=mlDevice, # Only for Tacotron 2 threshold=0.5, minlenratio=0.0, maxlenratio=10.0, use_att_constraint=False, backward_window=1, forward_window=3, # Only for FastSpeech & FastSpeech2 speed_control_alpha=1.0, ) text2speech.spc2wav = None # Disable griffin-lim # NOTE: Sometimes download is failed due to "Permission denied". That is # the limitation of google drive. Please retry after serveral hours. vocoder = load_model( download_pretrained_model( vocoder_tag, download_dir='./vocoder_models')).to(mlDevice).eval() vocoder.remove_weight_norm() import scipy.io.wavfile as wv import os if os.path.isfile(out_name + ".wav"): os.remove(out_name + ".wav") from concurrent.futures import ThreadPoolExecutor executor = ThreadPoolExecutor(max_workers=5) def save_wav(wav, count=-1): # print("Outputing wav file...") out_arr = wav.view(-1).cpu().numpy()