def multi_parse(output_dir, encoder_path, img_list):
    saved_img_f_list = [
        output_dir / img.stem for img in img_list
        if not os.path.exists(str(output_dir / img.stem) + '.npy')
    ]
    if len(saved_img_f_list) == 0:
        return

    # https://stackoverflow.com/questions/50412477/python-multiprocessing-grab-free-gpu
    if torch.cuda.is_available():
        cpu_name = multiprocessing.current_process().name
        cpu_id = int(cpu_name[cpu_name.find('-') + 1:])
        gpu_id = cpu_id % torch.cuda.device_count()
        device = torch.device("cuda:{}".format(gpu_id))
    else:
        device = torch.device("cpu")

    encoder.load_model(encoder_path, multi_gpu=False, device=device)
    input_imgs = np.array([
        process_img(cv2.imread(str(img)), 224) / 255.
        for img in img_list
        if not os.path.exists(str(output_dir / img.stem) + '.npy')
    ])
    print(input_imgs.shape)
    embeddings = encoder.embed_imgs(input_imgs)
    torch.cuda.empty_cache()
    for i in range(len(embeddings)):
        np.save(saved_img_f_list[i], embeddings[i])
def get_model():
    model_save_path = Path(
        '/scratch2/chowdh51/Code/DeepTalk/Deployment/encoder/saved_models/model_GST.pt'
    )
    module_name = 'model_GST'
    encoder.load_model(model_save_path, module_name=module_name)
    return encoder
def load_model(in_fpath, parser):
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_dir", type=Path,
                        default="synthesizer/saved_models/logs-pretrained/",
                        help="Directory containing the synthesizer model")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--low_mem", action="store_true", help=\
        "If True, the memory used by the synthesizer will be freed after each use. Adds large "
        "overhead but allows to save some GPU memory for lower-end GPUs.")
    parser.add_argument("--no_sound", action="store_true", help=\
        "If True, audio won't be played.")
    args = parser.parse_args()

    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    return synthesizer, sampling_rate, embed
def DeepTalk_encoder(file_path, model_save_path, module_name,
                     preprocess=True, normalize=True,
                     sampling_rate=8000, duration=None):
    encoder.load_model(model_save_path, module_name=module_name)

    if preprocess:
        wav = Synthesizer.load_preprocess_wav(file_path)
        ref_audio = encoder.preprocess_wav(wav)
    else:
        ref_audio, sr = librosa.load(file_path, sr=sampling_rate)

    if duration is not None:
        ref_audio = ref_audio[0:int(duration * sampling_rate)]

    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio,
                                                       using_partials=True,
                                                       return_partials=True)
    if normalize:
        embed = embed / np.linalg.norm(embed)
    return embed
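# A minimal usage sketch for DeepTalk_encoder above. The reference wav path, checkpoint path,
# and module name below are illustrative assumptions, not values taken from this code.
from pathlib import Path

ref_embed = DeepTalk_encoder(file_path="ref.wav",
                             model_save_path=Path("encoder/saved_models/model_GST.pt"),
                             module_name="model_GST",
                             preprocess=True,
                             normalize=True,
                             duration=8)
# ref_embed is a unit-norm speaker embedding, suitable for cosine-similarity comparisons.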
def load_models(self):
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.", file=sys.stderr)
        quit(-1)

    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(), device_id, gpu_properties.name,
           gpu_properties.major, gpu_properties.minor,
           gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(self.enc_model_fpath)
    print("Loaded Encoder")
    self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"),
                                   low_mem=self.low_mem)
    print("Loaded Synth")
    vocoder.load_model(self.voc_model_fpath)
    print("Loaded Vocoder")
def __init__(self):
    if Text2SpeechModel == "dc_tts":
        self.g = Graph(mode="synthesize")
        print("Text2Speech Tensorflow Graph loaded")
    elif Text2SpeechModel == "RTVC":
        enc_model_fpath = os.path.join(root_file_path, "RTVC",
                                       "encoder/saved_models/pretrained.pt")
        syn_model_dir = os.path.join(root_file_path, "RTVC",
                                     "synthesizer/saved_models/logs-pretrained")
        voc_model_fpath = os.path.join(root_file_path, "RTVC",
                                       "vocoder/saved_models/pretrained/pretrained.pt")
        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(os.path.join(syn_model_dir, "taco_pretrained"),
                                       low_mem=False)
        vocoder.load_model(voc_model_fpath)
        in_fpath = os.path.join("/", *root_file_path.split("/")[:-1],
                                "REF/refaudioRTVC/ref.wav")
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        embed = encoder.embed_utterance(preprocessed_wav)
        self.embeds = [embed]
    elif Text2SpeechModel == "AudioSynth":
        taco_pretrained_config_path = os.path.join(
            root_file_path,
            'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml')
        tacotron2_config = AutoConfig.from_pretrained(taco_pretrained_config_path)
        taco_path = os.path.join(root_file_path, "AudioSynth/tacotron2-120k.h5")
        self.tacotron2 = TFAutoModel.from_pretrained(config=tacotron2_config,
                                                     pretrained_path=taco_path,
                                                     training=False,
                                                     name="tacotron2")
        melgan_stft_pretrained_config_path = os.path.join(
            root_file_path,
            'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml')
        melgan_stft_config = AutoConfig.from_pretrained(melgan_stft_pretrained_config_path)
        melgan_stft_path = os.path.join(root_file_path, "AudioSynth/melgan.stft-2M.h5")
        self.melgan_stft = TFAutoModel.from_pretrained(config=melgan_stft_config,
                                                       pretrained_path=melgan_stft_path,
                                                       name="melgan_stft")
        self.processor = AutoProcessor.from_pretrained(
            pretrained_path=os.path.join(root_file_path, "AudioSynth/ljspeech_mapper.json"))
        mels, alignment_history, audios = do_synthesis(
            "Hello, how can I help you today?", self.tacotron2, self.melgan_stft,
            "TACOTRON", "MELGAN-STFT", self.processor)
def transform_embed(wav, encoder_model_fpath=Path()):
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed
def setup():
    global synthesizer
    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
    syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
    encoder.load_model(encoder_weights)
    synthesizer = Synthesizer(syn_dir)
    vocoder.load_model(vocoder_weights)
def init_encoder(self):
    model_fpath = self.ui.current_encoder_fpath
    self.ui.log("Loading the encoder %s... " % model_fpath)
    self.ui.set_loading(1)
    start = timer()
    encoder.load_model(model_fpath)
    self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
    self.ui.set_loading(0)
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = fpaths
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    return encoder.embed_utterance(wav)
def extract_utterance_feats_spkr(self, data_utterance_path, is_full_ppg=False):
    """Get PPG and Mel (+ optional F0) for an utterance.

    Args:
        data_utterance_path: The path to the data utterance protocol buffer.
        is_full_ppg: If True, will use the full PPGs.

    Returns:
        A [ppg, mel, dvec (speaker embedding)] triple for the utterance.
    """
    utt = Utterance()
    fs, wav = wavfile.read(data_utterance_path)
    utt.fs = fs
    utt.wav = wav
    utt.ppg = get_ppg(data_utterance_path, self.ppg_deps)

    audio = torch.FloatTensor(utt.wav.astype(np.float32))
    fs = utt.fs
    if fs != self.stft.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            fs, self.stft.sampling_rate))
    audio_norm = audio / self.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    # (1, n_mel_channels, T)
    acoustic_feats = self.stft.mel_spectrogram(audio_norm)
    # (n_mel_channels, T)
    acoustic_feats = torch.squeeze(acoustic_feats, 0)
    # (T, n_mel_channels)
    acoustic_feats = acoustic_feats.transpose(0, 1)

    #print("encoder model path", self.encoder_model_fpath)
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(self.encoder_model_fpath)
    #wav = np.load(data_utterance_path)
    wav = encoder.preprocess_wav(data_utterance_path)
    embed = encoder.embed_utterance(wav)
    #print("spkr embedding", embed)
    #print("shape of ppg, acoustic feats and spkr embedding", (utt.ppg).shape, acoustic_feats.shape, embed.shape)

    if is_full_ppg:
        if self.is_append_f0:
            ppg_f0 = append_ppg(utt.ppg, utt.f0)
            return [ppg_f0, acoustic_feats, embed]
        else:
            return [utt.ppg, acoustic_feats, embed]
    else:
        if self.is_append_f0:
            ppg_f0 = append_ppg(utt.monophone_ppg, utt.f0)
            return [ppg_f0, acoustic_feats, embed]
        else:
            return [utt.monophone_ppg, acoustic_feats, embed]
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
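# The encoder.is_loaded() guard in embed_utterance above lets the function be mapped over a
# process pool: each worker lazily loads the encoder weights exactly once, on its first call.
# A minimal sketch of that usage, assuming a list of (wav_fpath, embed_fpath) pairs and a
# pretrained checkpoint path (both are assumptions, not taken from this code):
from functools import partial
from multiprocessing import Pool
from pathlib import Path

def embed_all(fpath_pairs, encoder_model_fpath=Path("encoder/saved_models/pretrained.pt"),
              n_processes=4):
    func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath)
    with Pool(n_processes) as pool:
        pool.map(func, fpath_pairs)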
def get_spk_embed(load_path, enc_model_fpath):
    file_name = load_path.split('/')[-1]
    wav = load_wav(load_path)
    encoder.load_model(enc_model_fpath)
    preprocessed_wav = encoder.preprocess_wav(load_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    spk_embd = torch.tensor(embed).unsqueeze(0)
    return spk_embd, file_name
def get_embed(self, wav):
    # from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(self.encoder_model_fpath, device='cpu')
        # Use the CPU here to avoid the following error:
        # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with
        # multiprocessing, you must use the 'spawn' start method"
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed
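# get_embed above pins the encoder to the CPU because CUDA cannot be re-initialized inside a
# forked worker process. A sketch of the alternative hinted at by the error message (an
# assumption, not part of this code): keep CUDA and create workers with the 'spawn' start
# method, so each worker begins with a fresh interpreter.
import multiprocessing as mp

def make_spawn_pool(n_processes=2):
    ctx = mp.get_context("spawn")  # 'spawn' workers may initialize CUDA safely
    return ctx.Pool(n_processes)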
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = embed_fpath = fpaths
    embed_fpath = embed_fpath.replace(".wav", ".npy")
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src
    if skip_existing and embed_fpath.is_file():
        return

    wav = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
def __init__(self):
    # Info & args
    enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
    syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
    voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
    low_mem = False

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    self.synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"),
                                   low_mem=low_mem)
    vocoder.load_model(voc_model_fpath)
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath, _ = fpaths
    # try:
    #     wav = np.load(wav_fpath)
    # except ValueError as e:
    #     print(e)
    #     wav = np.load(wav_fpath, allow_pickle=True)
    wav = encoder.preprocess_wav(wav_fpath)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
def signup(wav_fpath: Path, username, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    embed_fpath = signup_dir.joinpath(username + ".npy")
    wav = encoder.preprocess_wav(str(wav_fpath))
    embed = encoder.embed_utterance(wav)
    if os.path.exists(embed_fpath):
        old_embed = np.load(embed_fpath)
        embed = old_embed + embed
        embed /= np.linalg.norm(embed, 2)
        os.remove(embed_fpath)
    np.save(embed_fpath, embed, allow_pickle=False)
    print(username + " signed up.")
def transform_embed(wav, encoder_model_fpath=Path()):
    # from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_ = encoder.preprocess_wav(wav)
    # Take a random 2-second segment to build the speaker embedding
    segment_length = 2 * encoder.sampling_rate
    if len(wav_) > segment_length:
        max_audio_start = len(wav_) - segment_length
        audio_start = random.randint(0, max_audio_start)
        wav_ = wav_[audio_start:audio_start + segment_length]

    embed = encoder.embed_utterance(wav_)
    return embed
def embed_utterance(fpaths, encoder_model_fpath, hparams):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    if embed_fpath.exists():
        return
    # wav = np.load(wav_fpath)
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
def initialize(self):
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.")
        quit(-1)
    print("PyTorch is available and working...")
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(), device_id, gpu_properties.name,
           gpu_properties.major, gpu_properties.minor,
           gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(self.enc_model_fpath)
    vocoder.load_model(self.voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    embed = np.random.rand(speaker_embedding_size)
    embed /= np.linalg.norm(embed)
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    mels = self.synthesizer.synthesize_spectrograms(texts, embeds)
    mel = np.concatenate(mels, axis=1)

    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
    print("All tests passed! You can now synthesize speech.\n\n")
def get_embed(self, wav):
    # from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(self.encoder_model_fpath, device='cpu')
        # Use the CPU here to avoid the following error:
        # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with
        # multiprocessing, you must use the 'spawn' start method"
    wav_ = encoder.preprocess_wav(wav)
    # Take a random 2-second segment to build the speaker embedding
    segment_length = 2 * encoder.sampling_rate
    if len(wav_) > segment_length:
        max_audio_start = len(wav_) - segment_length
        audio_start = random.randint(0, max_audio_start)
        wav_ = wav_[audio_start:audio_start + segment_length]

    embed = encoder.embed_utterance(wav_)
    return embed
def embed_utterance(fpaths, encoder_model_fpath, module_name, reject_list_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath, module_name=module_name)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)

    if embed is None:
        with open(reject_list_fpath, 'a') as reject_file:
            reject_file.write(str(os.path.basename(embed_fpath)) + '\n')
    else:
        if embed.shape[0] == 128:
            embed = np.concatenate((embed, embed), axis=0)
        np.save(embed_fpath, embed, allow_pickle=False)
def setup():
    # Parse configs. Globals nicer in this case
    with open("flowtron/infer.json") as f:
        data = f.read()
    global config
    config = json.loads(data)
    global data_config
    data_config = config["data_config"]
    global model_config
    model_config = config["model_config"]

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    global flowtron
    global waveglow
    global trainset

    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    encoder.load_model(encoder_weights)

    torch.manual_seed(1234)
    torch.cuda.manual_seed(1234)

    # Load waveglow
    waveglow = torch.load(
        "flowtron/tacotron2/waveglow/saved_models/waveglow_256channels_universal_v5.pt"
    )['model'].cuda().eval()
    waveglow.cuda().half()
    for k in waveglow.convinv:
        k.float()
    waveglow.eval()

    # Load flowtron
    flowtron = Flowtron(**model_config).cuda()
    state_dict = torch.load("flowtron/saved_models/pretrained.pt",
                            map_location='cpu')['model'].state_dict()
    flowtron.load_state_dict(state_dict)
    flowtron.eval()

    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(data_config['training_files'],
                    **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))
def embed_voice(self):
    encoder.load_model("encoder/saved_models/pretrained.pt")
    in_fpath = Path(self.voice_file)

    ## Computing the embedding
    # First, we load the wav using the function that the speaker encoder provides. This is
    # important: there is preprocessing that must be applied.

    # The following two methods are equivalent:
    # - Directly load from the filepath:
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    # - If the wav is already loaded:
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

    # Then we derive the embedding. There are many functions and parameters that the
    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
    # only use this function (with its default parameters):
    embed = encoder.embed_utterance(preprocessed_wav)
    return embed
def gen_from_file(model, load_path, enc_model_fpath, save_path, batched, target, overlap):
    k = model.get_step() // 1000
    file_name = load_path.split('/')[-1]

    wav = load_wav(load_path)
    #save_wav(wav, f'{save_path}__{file_name}__{k}k_steps_target.wav')

    encoder.load_model(enc_model_fpath)
    preprocessed_wav = encoder.preprocess_wav(load_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    spk_embd = torch.tensor(embed).unsqueeze(0)

    mel = melspectrogram(wav)
    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    #save_str = f'{save_path}__{file_name}__{k}k_steps_{batch_str}_spk_embed.wav'
    save_str = f'{file_name}'

    _ = model.generate(mel, spk_embd, save_str, batched, target, overlap, hp.mu_law)
def signin(wav_or_wavpath, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav_or_wavpath)
    embed = encoder.embed_utterance(wav)
    embed = np.reshape(embed, [np.shape(embed)[0], 1])  # [emb_dim, 1]

    signed_spk_embs = list(signup_dir.glob("*.npy"))
    signed_spk_name = [_dir.stem for _dir in signed_spk_embs]
    signed_spk_embs = [np.load(str(_dir)) for _dir in signed_spk_embs]
    signed_spk_embs = np.array(signed_spk_embs)  # [n, emb_dim]
    print(signed_spk_name)
    print(np.shape(signed_spk_embs), np.shape(embed))

    similar_score = np.matmul(signed_spk_embs, embed)
    similar_score = np.reshape(similar_score, [-1])
    sim_id = np.argmax(similar_score)
    sim_name = signed_spk_name[sim_id]
    for name, score in zip(signed_spk_name, similar_score):
        print(name, score)
    print("\nMatching name: ", sim_name)
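# A hypothetical sketch tying the signup and signin helpers above together: enroll a speaker
# from one recording, then match a second recording against everyone enrolled. The wav file
# names, username, and checkpoint path are placeholders, not values from this code.
from pathlib import Path

def enroll_and_verify(enroll_wav="alice_enroll.wav", probe_wav="alice_probe.wav",
                      encoder_model_fpath=Path("encoder/saved_models/pretrained.pt")):
    signup(Path(enroll_wav), "alice", encoder_model_fpath)  # writes alice.npy into signup_dir
    signin(probe_wav, encoder_model_fpath)                  # prints the best-matching enrolled name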
def clone(audio=None, audio_url=None, sentence=""):
    try:
        if not 10 <= len(sentence.split(" ")) <= 30:
            return {"error": "Sentence is invalid! (length must be 10 to 30 words)"}

        audio_data = audio
        if audio_url:
            # Link
            if "http://" in audio_url or "https://" in audio_url:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:9.0) Gecko/20100101 Firefox/10.0'}
                # Check that the audio file is no larger than 10 Mb
                r = requests.head(audio_url, headers=header, allow_redirects=True)
                size = r.headers.get('content-length', 0)
                size = int(size) / float(1 << 20)
                log.info("File size: {:.2f} Mb".format(size))
                if size > 10:
                    return {"error": "Input audio file is too large! (max 10Mb)"}
                r = requests.get(audio_url, headers=header, allow_redirects=True)
                audio_data = r.content
            # Base64
            elif len(audio_url) > 500:
                audio_data = base64.b64decode(audio_url)

        audio_path = generate_uid() + ".audio"
        with open(audio_path, "wb") as f:
            f.write(audio_data)

        # Load the models one by one.
        log.info("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(Path("rtvc/encoder/saved_models/pretrained.pt"))
        synthesizer = Synthesizer(Path("rtvc/synthesizer/saved_models/logs-pretrained/taco_pretrained"))
        vocoder.load_model(Path("rtvc/vocoder/saved_models/pretrained/pretrained.pt"))

        # Computing the embedding
        original_wav, sampling_rate = librosa.load(audio_path)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        log.info("Loaded file successfully")

        if os.path.exists(audio_path):
            os.remove(audio_path)

        embed = encoder.embed_utterance(preprocessed_wav)
        log.info("Created the embedding")

        specs = synthesizer.synthesize_spectrograms([sentence], [embed])
        spec = np.concatenate(specs, axis=1)
        # spec = specs[0]
        log.info("Created the mel spectrogram")

        # Generating the waveform
        log.info("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=lambda *args: None)

        # Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

        # Save it on the disk
        fp = tempfile.TemporaryFile()
        librosa.output.write_wav(fp, generated_wav.astype(np.float32), synthesizer.sample_rate)
        fp.seek(0)  # rewind before reading, otherwise read() returns empty bytes
        return {"audio": fp.read()}
    except Exception as e:
        log.error(e)
        traceback.print_exc()
        return {"error": "Fail"}
"CUDA version matches your PyTorch installation. CPU-only inference is currently " "not supported.", file=sys.stderr) quit(-1) device_id = torch.cuda.current_device() gpu_properties = torch.cuda.get_device_properties(device_id) print( "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " "%.1fGb total memory.\n" % (torch.cuda.device_count(), device_id, gpu_properties.name, gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9)) ## Load the models one by one. print("Preparing the encoder, the synthesizer and the vocoder...") encoder.load_model(args.enc_model_fpath) synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem) #vocoder.load_model(args.voc_model_fpath) ## Run a test print("Testing your configuration with small inputs.") # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's # sampling rate, which may differ. # If you're unfamiliar with digital audio, know that it is encoded as an array of floats # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1. # The sampling rate is the number of values (samples) recorded per second, it is set to # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond # to an audio of 1 second. print("\tTesting the encoder...") encoder.embed_utterance(np.zeros(encoder.sampling_rate))