def change_mode(character: str = "Human_Man", tone: str = "neutral"):
    training_dir = voices_dict[character]['ID']
    tone_file = voices_dict[character]['tone'][tone] + '.flac'
    tone_dir = tone_file.split("-")[1]
    local_infpath = Path(f'{data_path}/{training_dir}/{tone_dir}/{tone_file}')
    global in_fpath, filenum, preprocessed_wav, embed, torch, vocoder
    if local_infpath != in_fpath and character is not None:
        if tone is None:
            tone = "neutral"
        print(f'Reference sound has changed; now loading {character}:{tone}...')
        with nostdout():
            in_fpath = local_infpath
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            torch.manual_seed(seed)
            vocoder.load_model(vocoder_path)
            text_to_speech('Tea.', play_sound=False)
    else:
        print('Mode is already correct. No need to change.')
def load_models(self):
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.", file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(), device_id, gpu_properties.name,
           gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(self.enc_model_fpath)
    print("Loaded Encoder")
    self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
    print("Loaded Synth")
    vocoder.load_model(self.voc_model_fpath)
    print("Loaded Vocoder")
def load_model(in_fpath, parser):
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_dir", type=Path,
                        default="synthesizer/saved_models/logs-pretrained/",
                        help="Directory containing the synthesizer model")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--low_mem", action="store_true", help=
                        "If True, the memory used by the synthesizer will be freed after each use. Adds large "
                        "overhead but allows to save some GPU memory for lower-end GPUs.")
    parser.add_argument("--no_sound", action="store_true", help=
                        "If True, audio won't be played.")
    args = parser.parse_args()

    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    return synthesizer, sampling_rate, embed
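# --- Illustrative usage sketch (not from the original sources) ---
# A minimal, hypothetical caller for load_model() above; the reference file "ref.wav"
# and output name "cloned.wav" are placeholders, and the encoder/synthesizer/vocoder
# modules are assumed to be imported as in the surrounding snippets.
import argparse
import soundfile as sf

synthesizer, sampling_rate, embed = load_model("ref.wav", argparse.ArgumentParser())
specs = synthesizer.synthesize_spectrograms(["Hello world."], [embed])
generated = vocoder.infer_waveform(specs[0])  # the vocoder was already loaded inside load_model()
sf.write("cloned.wav", generated.astype(np.float32), synthesizer.sample_rate)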
def __init__(self):
    if (Text2SpeechModel == "dc_tts"):
        self.g = Graph(mode="synthesize")
        print("Text2Speech Tensorflow Graph loaded")
    elif (Text2SpeechModel == "RTVC"):
        enc_model_fpath = os.path.join(root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
        syn_model_dir = os.path.join(root_file_path, "RTVC", "synthesizer/saved_models/logs-pretrained")
        voc_model_fpath = os.path.join(root_file_path, "RTVC", "vocoder/saved_models/pretrained/pretrained.pt")
        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(os.path.join(syn_model_dir, "taco_pretrained"), low_mem=False)
        vocoder.load_model(voc_model_fpath)
        in_fpath = os.path.join("/", *root_file_path.split("/")[:-1], "REF/refaudioRTVC/ref.wav")
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        embed = encoder.embed_utterance(preprocessed_wav)
        self.embeds = [embed]
    elif (Text2SpeechModel == "AudioSynth"):
        taco_pretrained_config_path = os.path.join(
            root_file_path, 'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml')
        tacotron2_config = AutoConfig.from_pretrained(taco_pretrained_config_path)
        taco_path = os.path.join(root_file_path, "AudioSynth/tacotron2-120k.h5")
        self.tacotron2 = TFAutoModel.from_pretrained(
            config=tacotron2_config, pretrained_path=taco_path, training=False, name="tacotron2")
        melgan_stft_pretrained_config_path = os.path.join(
            root_file_path, 'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml')
        melgan_stft_config = AutoConfig.from_pretrained(melgan_stft_pretrained_config_path)
        melgan_stft_path = os.path.join(root_file_path, "AudioSynth/melgan.stft-2M.h5")
        self.melgan_stft = TFAutoModel.from_pretrained(
            config=melgan_stft_config, pretrained_path=melgan_stft_path, name="melgan_stft")
        self.processor = AutoProcessor.from_pretrained(
            pretrained_path=os.path.join(root_file_path, "AudioSynth/ljspeech_mapper.json"))
        mels, alignment_history, audios = do_synthesis(
            "Hello, how can I help you today?", self.tacotron2, self.melgan_stft,
            "TACOTRON", "MELGAN-STFT", self.processor)
def setup():
    global synthesizer
    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
    syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
    encoder.load_model(encoder_weights)
    synthesizer = Synthesizer(syn_dir)
    vocoder.load_model(vocoder_weights)
def load_models():
    # encoder_weights = Path(encoder_path)
    vocoder_weights = Path(vocoder_path)
    syn_dir = Path(synthesizer_path)
    # encoder.load_model(encoder_weights)
    synthesizer = Synthesizer(syn_dir)
    vocoder.load_model(vocoder_weights)
    return encoder, synthesizer, vocoder
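# --- Illustrative usage sketch (not from the original sources) ---
# A hypothetical caller for load_models() above. The encoder weights are not loaded
# inside load_models() (that call is commented out), so this sketch assumes
# encoder.load_model(...) was run elsewhere; "ref.wav" is a placeholder path.
encoder, synthesizer, vocoder = load_models()
ref_wav = encoder.preprocess_wav("ref.wav")
embed = encoder.embed_utterance(ref_wav)
spec = synthesizer.synthesize_spectrograms(["A short test sentence."], [embed])[0]
wav = vocoder.infer_waveform(spec)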
async def generate_wav(text, filename):
    user_id = "russell"
    embed_path = "user_data/embeds/{}.npy".format(user_id)
    embed_path = Path(embed_path)
    if embed_path.is_file():
        embed = np.load(embed_path)
        print("load embedding in {}".format(embed_path))
    else:
        raise FileNotFoundError("user embedding not found")

    # ================== synthesizer ==================
    start_time = time.time()
    # The synthesizer works in batch, so you need to put your data in a list or numpy array
    texts = [text]
    embeds = [embed]
    # If you know what the attention layer alignments are, you can retrieve them here by
    # passing return_alignments=True
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")
    print("--- synthesizer: %s seconds ---" % (time.time() - start_time))

    # ================== vocoder ==================
    start_time = time.time()
    # If seed is specified, reset torch seed and reload vocoder
    if args.seed is not None:
        torch.manual_seed(args.seed)
        vocoder.load_model(args.voc_model_fpath)
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)
    print("")
    print("--- vocoder: %s seconds ---" % (time.time() - start_time))

    # ================== post generation ==================
    start_time = time.time()
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    generated_wav = encoder.preprocess_wav(generated_wav)
    print("--- post generation: %s seconds ---" % (time.time() - start_time))

    sf.write("./user_data/generated_voice/%s/" % (user_id) + "%s.wav" % filename,
             generated_wav.astype(np.float32), synthesizer.sample_rate)
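# --- Illustrative usage sketch (not from the original sources) ---
# generate_wav() above is a coroutine, so a hypothetical caller would run it on an event
# loop; the text and file name below are placeholders.
import asyncio

asyncio.run(generate_wav("Hello, this is a cloned voice.", "greeting"))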
def init_vocoder(self):
    model_fpath = self.ui.current_vocoder_fpath
    # Case of Griffin-lim
    if model_fpath is None:
        return

    self.ui.log("Loading the vocoder %s... " % model_fpath)
    self.ui.set_loading(1)
    start = timer()
    vocoder.load_model(model_fpath)
    self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
    self.ui.set_loading(0)
def __init__(self):
    # Info & args
    enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
    syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
    voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
    low_mem = False

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    self.synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem)
    vocoder.load_model(voc_model_fpath)
def DeepTalk_vocoder(synthesized_mel, breaks, model_save_path, normalize=True):
    vocoder.load_model(model_save_path)
    no_action = lambda *args: None
    wav1 = vocoder.infer_waveform(synthesized_mel, progress_callback=no_action, normalize=normalize)

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav1[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav1 = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
    wav1 = wav1 / np.abs(wav1).max() * 0.97
    return wav1
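# --- Illustrative usage sketch (not from the original sources) ---
# A hypothetical call to DeepTalk_vocoder() above, mirroring how run_DeepTalk_demo()
# later in this section builds "breaks" from the per-sentence spectrogram lengths.
# It assumes a loaded synthesizer and a speaker embedding already exist; the model path
# and output file name are placeholders.
import soundfile as sf

specs = synthesizer.synthesize_spectrograms(["First sentence.", "Second sentence."], [embed, embed])
breaks = [spec.shape[1] for spec in specs]
mel = np.concatenate(specs, axis=1)
wav = DeepTalk_vocoder(mel, breaks, "vocoder/saved_models/pretrained/pretrained.pt")
sf.write("deeptalk_out.wav", wav.astype(np.float32), Synthesizer.sample_rate)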
def initialize(self):
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.")
        quit(-1)
    print("PyTorch is available and working...")
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(), device_id, gpu_properties.name,
           gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(self.enc_model_fpath)
    vocoder.load_model(self.voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))
    embed = np.random.rand(speaker_embedding_size)
    embed /= np.linalg.norm(embed)
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    mels = self.synthesizer.synthesize_spectrograms(texts, embeds)
    mel = np.concatenate(mels, axis=1)
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
    print("All tests passed! You can now synthesize speech.\n\n")
def init_vocoder(self):
    model_fpath = self.ui.current_vocoder_fpath
    # Case of Griffin-lim
    if model_fpath is None:
        return
    else:
        self.ui.log("Loading the vocoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        if Path(model_fpath).parent.stem == "melgan":
            vocoder_melgan.load_vocoder_melgan(model_fpath)
        elif Path(model_fpath).parent.stem == "wavernn":
            vocoder.load_model(model_fpath)
        else:
            return
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
def clone_voice(self, embed):
    synthesizer = Synthesizer("synthesizer/saved_models/logs-pretrained/taco_pretrained")
    vocoder.load_model("vocoder/saved_models/pretrained/pretrained.pt")
    with open(self.json_text) as text_json:
        data = json.load(text_json)
        for x in data:
            text = x['translation']
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]

            ## Generating the waveform
            print("\nSynthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

            # Save it on the disk
            output_dir = '../temp'
            try:
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
            except:
                pass
            fpath = "%s/%d.wav" % (output_dir, x['index'])
            generated_wav *= 32767 / max(0.01, np.max(np.abs(generated_wav)))
            wavfile.write(fpath, synthesizer.sample_rate, generated_wav.astype(np.int16))
          (torch.cuda.device_count(), device_id, gpu_properties.name,
           gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9))
else:
    print("Using CPU for inference.\n")

## Remind the user to download pretrained models if needed
## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
encoder.load_model(Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(Path("vocoder/saved_models/pretrained/pretrained.pt"))

try:
    # Get the reference audio filepath
    message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
              "wav, m4a, flac, ...):\n"
    in_fpath = Path("samples/elon_voice.wav")  # hardcoded for now

    ## Computing the embedding
    # First, we load the wav using the function that the speaker encoder provides. This is
    # important: there is preprocessing that must be applied.
    # The following two methods are equivalent:
    # - Directly load from the filepath:
    embeddings = json.load(f)

# celebrities = ['Kevin Hart','Morgan Freeman','Tom Cruise']
celebrities = embeddings.keys()

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

encoder_weights = Path("encoder/saved_models/pretrained.pt")
vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
encoder.load_model(encoder_weights)
synthesizer = Synthesizer(syn_dir)
vocoder.load_model(vocoder_weights)

outfile = "/content/drive/My Drive/Real-Time-Voice-Cloning/samples/morgan-freeman-to-me-it's-just-a-made-up-word-a-politician's-word-so-that-young-fellas-like-yourself-can-wear-a-suit-and-a-tie-and-have-a-job.wav"
in_fpath = Path(outfile)

print("preprocessing the training audio file")
# reprocessed_wav = encoder.preprocess_wav(in_fpath)
original_wav, sampling_rate = librosa.load(in_fpath)
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
embed = encoder.embed_utterance(preprocessed_wav)


def write(f, sr, x, normalized=False):
    """numpy array to MP3"""
    channels = 2 if (x.ndim == 2 and x.shape[1] == 2) else 1
    if normalized:
        # normalized array - each item should be a float in [-1, 1)
        y = np.int16(x * 2**15)
enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")

## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
# encoder.load_model(args.enc_model_fpath)
# synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
# vocoder.load_model(args.voc_model_fpath)
encoder.load_model(enc_model_fpath)
synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"))
vocoder.load_model(voc_model_fpath)

# Run a test
print("Testing your configuration with small inputs.")
# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
# sampling rate, which may differ.
# If you're unfamiliar with digital audio, know that it is encoded as an array of floats
# (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
# The sampling rate is the number of values (samples) recorded per second, it is set to
# 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
# to an audio of 1 second.
print("\tTesting the encoder...")
encoder.embed_utterance(np.zeros(encoder.sampling_rate))

# Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
def maux(output_text, num):
    print("debug -- django")
    ## Info & args
    # parser = argparse.ArgumentParser(
    #     formatter_class=argparse.ArgumentDefaultsHelpFormatter
    # )
    # parser.add_argument("-e", "--enc_model_fpath", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/encoder/saved_models/pretrained.pt",
    #                     help="Path to a saved encoder")
    # parser.add_argument("-s", "--syn_model_dir", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/synthesizer/saved_models/logs-pretrained/",
    #                     help="Directory containing the synthesizer model")
    # parser.add_argument("-v", "--voc_model_fpath", type=Path,
    #                     default="D:/RemindMe/django-remindme/mysite/trained model/vocoder/saved_models/pretrained/pretrained.pt",
    #                     help="Path to a saved vocoder")
    # parser.add_argument("--low_mem", action="store_true", help=
    #                     "If True, the memory used by the synthesizer will be freed after each use. Adds large "
    #                     "overhead but allows to save some GPU memory for lower-end GPUs.")
    # parser.add_argument("--no_sound", action="store_true", help=
    #                     "If True, audio won't be played.")
    # args = parser.parse_args()
    # print_args(args, parser)
    # if not args.no_sound:
    #     import sounddevice as sd

    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.", file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(), device_id, gpu_properties.name,
           gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    # encoder.load_model(args.enc_model_fpath)
    # synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
    # vocoder.load_model(args.voc_model_fpath)
    encoder.load_model(
        "D:/RemindMe/django-remindme/mysite/trained model/encoder/saved_models/pretrained.pt")
    synthesizer = Synthesizer(
        "D:/RemindMe/django-remindme/mysite/trained model/synthesizer/saved_models/logs-pretrained/taco_pretrained",
        low_mem=False)
    vocoder.load_model(
        "D:/RemindMe/django-remindme/mysite/trained model/vocoder/saved_models/pretrained/pretrained.pt")

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second, it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to an audio of 1 second.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)

    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
    print("All tests passed! You can now synthesize speech.\n\n")

    ## Interactive speech generation
    print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
          "show how you can interface this project easily with your own. See the source code for "
          "an explanation of what is happening.\n")
    print("Interactive generation loop")
    in_fpath = Path("D:/RemindMe/django-remindme/mysite/trained model/sam_narration2.wav")
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    print("Loaded file successfully")
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Created the embedding")

    embeds = [embed]
    text = output_text
    texts = [text]
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    spec = specs[0]
    print("Created the mel spectrogram")

    ## Generating the waveform
    print("Synthesizing the waveform:")
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav = vocoder.infer_waveform(spec)

    ## Post-generation
    # There's a bug with sounddevice that makes the audio cut one second earlier, so we
    # pad it.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

    # Play the audio (non-blocking)
    # Save it on the disk
    filexpath = "D:/RemindMe/django_remindme_model/mysite/media/demo_output_%02d.wav" % num
    fx = "demo_output_%02d" % num
    print(generated_wav.dtype)
    librosa.output.write_wav(filexpath, generated_wav.astype(np.float32), synthesizer.sample_rate)
    print("\nSaved output as %s\n\n" % filexpath)
    return fx
def clone(audio=None, audio_url=None, sentence=""):
    try:
        if not 10 <= len(sentence.split(" ")) <= 30:
            return {"error": "Sentence is invalid! (length must be 10 to 30 words)"}

        audio_data = audio
        if audio_url:
            # Link
            if "http://" in audio_url or "https://" in audio_url:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:9.0) Gecko/20100101 Firefox/10.0'}
                # Check that the audio file is less than 10Mb
                r = requests.head(audio_url, headers=header, allow_redirects=True)
                size = r.headers.get('content-length', 0)
                size = int(size) / float(1 << 20)
                log.info("File size: {:.2f} Mb".format(size))
                if size > 10:
                    return {"error": "Input audio file is too large! (max 10Mb)"}
                r = requests.get(audio_url, headers=header, allow_redirects=True)
                audio_data = r.content
            # Base64
            elif len(audio_url) > 500:
                audio_data = base64.b64decode(audio_url)

        audio_path = generate_uid() + ".audio"
        with open(audio_path, "wb") as f:
            f.write(audio_data)

        # Load the models one by one.
        log.info("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(Path("rtvc/encoder/saved_models/pretrained.pt"))
        synthesizer = Synthesizer(Path("rtvc/synthesizer/saved_models/logs-pretrained/taco_pretrained"))
        vocoder.load_model(Path("rtvc/vocoder/saved_models/pretrained/pretrained.pt"))

        # Computing the embedding
        original_wav, sampling_rate = librosa.load(audio_path)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        log.info("Loaded file successfully")
        if os.path.exists(audio_path):
            os.remove(audio_path)
        embed = encoder.embed_utterance(preprocessed_wav)
        log.info("Created the embedding")

        specs = synthesizer.synthesize_spectrograms([sentence], [embed])
        spec = np.concatenate(specs, axis=1)
        # spec = specs[0]
        log.info("Created the mel spectrogram")

        # Generating the waveform
        log.info("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=lambda *args: None)

        # Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

        # Save it on the disk
        fp = tempfile.TemporaryFile()
        librosa.output.write_wav(fp, generated_wav.astype(np.float32), synthesizer.sample_rate)
        fp.seek(0)  # rewind before reading the rendered wav back out
        return {"audio": fp.read()}
    except Exception as e:
        log.error(e)
        traceback.print_exc()
        return {"error": "Fail"}
global in_fpath, filenum, preprocessed_wav, embed, torch, vocoder

filenum = 0
data_path = '/Users/glw001/Documents/Development/voice_clone/LibriSpeech/train-clean-100'
in_fpath = Path(f'{data_path}/F1088-Christabel/134315/1088-134315-0002.flac')
seed = 694201312
word_substitutions = {'do': 'doo', 'Do': 'Doo', 'NPC': 'En Pee See'}

## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
encoder.load_model(encoder_path)
synthesizer = Synthesizer(synthesizer_path.joinpath("taco_pretrained"), seed=seed)
vocoder.load_model(vocoder_path)

preprocessed_wav = encoder.preprocess_wav(in_fpath)
original_wav, sampling_rate = librosa.load(str(in_fpath))
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
embed = encoder.embed_utterance(preprocessed_wav)
torch.manual_seed(seed)
vocoder.load_model(vocoder_path)


def word_replace(text: str):
    text = " " + text + " "
    for word in word_substitutions:
        regex = rf'\s({word})[\.|\s|\!|\?]'
        word_match = re.findall(regex, text)
def run_voice_cloning():
    ## Model locations
    enc_model_fpath = Path("encoder/saved_models/pretrained.pt")
    syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
    voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")

    ref_voice_path = request.json["voiceFile"]  # filename like ojo3.wav
    messages = request.json["messages"]  # array of strings
    low_mem = request.json["low_mem"] if "low_mem" in request.json else False  # whether to use LowMem mode

    # Base64 encode the parameters so that we can reference this job in later api calls
    dataToEncodeAsID = ','.join(messages) + ref_voice_path
    encodedBytes = base64.b64encode(dataToEncodeAsID.encode("utf-8"))
    req_id = str(encodedBytes, "utf-8")
    # Md5 hash it so that it is a consistent length
    req_id = hashlib.md5(req_id.encode('utf-8')).hexdigest()

    # Clear destination folder of generated sound files
    output_path = "/output/%s/" % req_id
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    os.makedirs(output_path)

    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.", file=sys.stderr)
        return abort(500)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(), device_id, gpu_properties.name,
           gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem)
    vocoder.load_model(voc_model_fpath)

    in_fpath = Path(ref_voice_path)
    print("Computing the embedding")
    ## Computing the embedding
    # First, we load the wav using the function that the speaker encoder provides. This is
    # important: there is preprocessing that must be applied.
    # The following two methods are equivalent:
    # - Directly load from the filepath:
    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    # - If the wav is already loaded:
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    print("Loaded file successfully")

    # Then we derive the embedding. There are many functions and parameters that the
    # speaker encoder interfaces. These are mostly for in-depth research. You will typically
    # only use this function (with its default parameters):
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Created the embedding")

    print("Generation loop")
    num_generated = 0
    fpath = None
    for text in messages:
        try:
            ## Generating the spectrogram
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward.
            # Remember that the longer the spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

            # Save it on the disk
            fpath = output_path + ("output_%03d.wav" % num_generated)
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % fpath)
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")

    return req_id
"not supported.", file=sys.stderr) quit(-1) device_id = torch.cuda.current_device() gpu_properties = torch.cuda.get_device_properties(device_id) print( "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " "%.1fGb total memory.\n" % (torch.cuda.device_count(), device_id, gpu_properties.name, gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9)) encoder.load_model(Path('encoder/saved_models/pretrained.pt')) synthesizer = Synthesizer( Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")) vocoder.load_model(Path('vocoder/saved_models/pretrained/pretrained.pt')) voice = 'voices/peabody/voice.wav' try: preprocessed_wav = encoder.preprocess_wav(voice) embed = encoder.embed_utterance(preprocessed_wav) text = "Hello Carina. Hello Carina Hello Carina Hello Carina Hello Carina Hello Carina Hello Carina This is Kevin Smith, happy new translation around the Sun." texts = [text] embeds = [embed] specs = synthesizer.synthesize_spectrograms(texts, embeds) spec = specs[0]
def hello_world():
    legoutput = upload_file()
    lig = "This is a demo utterance. This will work when you do not add any utterance."
    if request.method == 'POST':
        lig = request.form["textarea"]
        print(str(lig))
        # return mainpage()
    if str(legoutput) == "None":
        return render_template("index.html", output="")
    else:
        from encoder.params_model import model_embedding_size as speaker_embedding_size
        from utils.argutils import print_args
        from synthesizer.inference import Synthesizer
        from encoder import inference as encoder
        from vocoder import inference as vocoder
        from pathlib import Path
        import numpy as np
        import librosa
        import argparse
        import torch
        try:
            parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
            parser.add_argument("-e", "--enc_model_fpath", type=Path,
                                default="encoder/saved_models/pretrained.pt")
            parser.add_argument("-s", "--syn_model_dir", type=Path,
                                default="synthesizer/saved_models/logs-pretrained/")
            parser.add_argument("-v", "--voc_model_fpath", type=Path,
                                default="vocoder/saved_models/pretrained/pretrained.pt")
            parser.add_argument("--low_mem", action="store_true")
            # parser.add_argument("--no_sound", action="store_true")
            args = parser.parse_args()
            print_args(args, parser)
            # if not args.no_sound:
            #     import sounddevice as sd
            encoder.load_model(args.enc_model_fpath)
            synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
            vocoder.load_model(args.voc_model_fpath)

            num_generated = 0
            in_fpath = legoutput[1]
            print(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            text = str(lig)
            texts = [text]
            embeds = [embed]
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            print("Synthesizing the waveform:")
            generated_wav = vocoder.infer_waveform(spec)
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
            # if not args.no_sound:
            #     sd.stop()
            #     sd.play(generated_wav, synthesizer.sample_rate)

            fpath = "static/output.wav"
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate)
            print("\nSaved output as %s\n\n" % fpath)
            return render_template("index.html", output=htmloader(text, legoutput[1], fpath))
        except Exception as e:
            return render_template("index.html", output="Caught exception: %s" % repr(e))
import warnings

app = flask.Flask(__name__)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Ok to hard code these locations
encoder_model_path = '/opt/ml/model/encoder/saved_models/pretrained.pt'
synthesizer_path = '/opt/ml/model/synthesizer/saved_models/logs-pretrained/taco_pretrained/'
vocoder_model_path = '/opt/ml/model/vocoder/saved_models/pretrained/pretrained.pt'

# Load the models
encoder.load_model(Path(encoder_model_path))
synthesizer = Synthesizer(Path(synthesizer_path))
vocoder.load_model(Path(vocoder_model_path))


def clone_voice(sentence, results_file):
    """Adapted from 'demo_cli.py'"""
    u_path = Path('utterance.wav')
    results_path = Path(results_file)
    preprocessed_wav = encoder.preprocess_wav(u_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    specs = synthesizer.synthesize_spectrograms([sentence], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    librosa.output.write_wav(results_path, generated_wav.astype(np.float32), synthesizer.sample_rate)
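# --- Illustrative usage sketch (not from the original sources) ---
# A hypothetical Flask route wired to the clone_voice() helper above; the route name,
# request field and output file name are placeholders, not part of the original service.
@app.route("/clone", methods=["POST"])
def clone_endpoint():
    sentence = flask.request.json.get("sentence", "")
    clone_voice(sentence, "cloned_output.wav")
    return flask.send_file("cloned_output.wav", mimetype="audio/wav")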
def test_config(self):
    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    try:
        if not torch.cuda.is_available():
            print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                  "for deep learning, ensure that the drivers are properly installed, and that your "
                  "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                  "not supported.", file=sys.stderr)
            quit(-1)
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(), device_id, gpu_properties.name,
               gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9))

        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)
        print("Loaded Encoder")
        self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
        print("Loaded Synth")
        vocoder.load_model(self.voc_model_fpath)
        print("Loaded Vocoder")

        ## Run a test
        print("Testing your configuration with small inputs.")
        # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
        # sampling rate, which may differ.
        # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
        # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
        # The sampling rate is the number of values (samples) recorded per second, it is set to
        # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
        # to an audio of 1 second.
        print("\tTesting the encoder...")
        encoder.embed_utterance(np.zeros(encoder.sampling_rate))

        # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
        # returns, but here we're going to make one ourselves just for the sake of showing that it's
        # possible.
        embed = np.random.rand(speaker_embedding_size)
        # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
        # embeddings it will be).
        embed /= np.linalg.norm(embed)

        # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
        # illustrate that
        embeds = [embed, np.zeros(speaker_embedding_size)]
        texts = ["test 1", "test 2"]
        print("\tTesting the synthesizer... (loading the model will output a lot of text)")
        mels = self.synthesizer.synthesize_spectrograms(texts, embeds)

        # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
        # can concatenate the mel spectrograms to a single one.
        mel = np.concatenate(mels, axis=1)
        # The vocoder can take a callback function to display the generation. More on that later. For
        # now we'll simply hide it like this:
        no_action = lambda *args: None
        print("\tTesting the vocoder...")
        # For the sake of making this test short, we'll pass a short target length. The target length
        # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
        # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
        # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
        # that has a detrimental effect on the quality of the audio. The default parameters are
        # recommended in general.
        vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
        print("\tAll tests passed!")
        return "All tests passed!"
    except Exception as e:
        return "Caught exception: %s" % repr(e)
gpu_properties = torch.cuda.get_device_properties(device_id)
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
      "%.1fGb total memory.\n" %
      (torch.cuda.device_count(), device_id, gpu_properties.name,
       gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9))

## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
encoder.load_model(args.enc_model_fpath)
synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
vocoder.load_model(args.voc_model_fpath)

## Run a test
print("Testing your configuration with small inputs.")
# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
# sampling rate, which may differ.
# If you're unfamiliar with digital audio, know that it is encoded as an array of floats
# (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
# The sampling rate is the number of values (samples) recorded per second, it is set to
# 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
# to an audio of 1 second.
print("\tTesting the encoder...")
encoder.embed_utterance(np.zeros(encoder.sampling_rate))

# Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
def run_DeepTalk_demo(ref_audio_path='samples/ref_VCTKp240.wav', output_text='Hello World',
                      enc_model_fpath=config.enc_model_fpath, enc_module_name=config.enc_module_name,
                      syn_model_dir=config.syn_model_dir, voc_model_fpath=config.voc_model_fpath,
                      key_embed=None):

    class hyperparameter:
        def __init__(self):
            self.enc_model_fpath = enc_model_fpath
            self.enc_module_name = enc_module_name
            self.syn_model_dir = syn_model_dir
            self.voc_model_fpath = voc_model_fpath
            self.enc_normalize = False
            self.voc_normalize = True
            # "If True, the memory used by the synthesizer will be freed after each use. Adds large "
            # "overhead but allows to save some GPU memory for lower-end GPUs."
            self.low_mem = False
            self.no_sound = False  # If True, audio won't be played.
            self.sampling_rate = 16000  ## 16000: For mel-spectrogram based methods; 8000: For fCNN based methods
            self.ref_audio_path = ref_audio_path
            self.output_text = output_text

    args = hyperparameter()

    ## Load trained models: Encoder, Synthesizer, and Vocoder
    # os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    encoder.load_model(args.enc_model_fpath, module_name=args.enc_module_name)
    synthesizer = Synthesizer(args.syn_model_dir, low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    ## Encoding stage
    print('---------------------------------------------------------------')
    print('Stage 1/3: Encoder')
    print('---------------------------------------------------------------')
    wav = Synthesizer.load_preprocess_wav(args.ref_audio_path)
    ref_audio = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio, using_partials=True,
                                                       return_partials=True, key_embed=key_embed)
    if (args.enc_normalize):
        embed = embed / np.linalg.norm(embed)
    if (embed.shape[0] == 128):
        embed = np.concatenate((embed, embed), axis=0)

    ## Synthesizing stage
    print('---------------------------------------------------------------')
    print('Stage 2/3: Synthesizer')
    print('---------------------------------------------------------------')
    texts = args.output_text
    # texts = re.split(',|.',texts)
    texts = re.split(r'[,.]\s*', texts)
    texts[:] = [x for x in texts if x]
    print(texts)
    # texts = texts.split("\n")
    # texts = texts.split(".")
    # texts = texts.split(",")
    embeds = np.stack([embed] * len(texts))
    specs = synthesizer.synthesize_spectrograms(texts, embeds)
    breaks = [spec.shape[1] for spec in specs]
    synthesized_mel = np.concatenate(specs, axis=1)

    ## Vocoding stage
    print('---------------------------------------------------------------')
    print('Stage 3/3: Vocoder')
    print('---------------------------------------------------------------')
    no_action = lambda *args: None
    wav1 = vocoder.infer_waveform(synthesized_mel, progress_callback=no_action, normalize=args.voc_normalize)

    # Add breaks
    b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav1[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
    wav1 = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])
    synthesized_wav = wav1 / np.abs(wav1).max() * 0.97

    return synthesized_wav, Synthesizer.sample_rate, embed
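# --- Illustrative usage sketch (not from the original sources) ---
# A hypothetical call to run_DeepTalk_demo() above; the output file name is a placeholder
# and the reference audio is the function's own default sample. The returned waveform is
# written out with soundfile.
import soundfile as sf

wav, sr, ref_embed = run_DeepTalk_demo(ref_audio_path="samples/ref_VCTKp240.wav",
                                        output_text="This is a cloned sentence.")
sf.write("deeptalk_demo.wav", wav.astype(np.float32), sr)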
def voice_cloning(audio_file, text, enc_model_fpath, syn_model_dir, voc_model_fpath, low_mem):
    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.", file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(), device_id, gpu_properties.name,
           gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem)
    vocoder.load_model(voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second, it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to an audio of 1 second.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))

    # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
    # returns, but here we're going to make one ourselves just for the sake of showing that it's
    # possible.
    embed = np.random.rand(speaker_embedding_size)
    # Embeddings are L2-normalized (this isn't important here, but if you want to make your own
    # embeddings it will be).
    embed /= np.linalg.norm(embed)

    # The synthesizer can handle multiple inputs with batching. Let's create another embedding to
    # illustrate that
    embeds = [embed, np.zeros(speaker_embedding_size)]
    texts = ["test 1", "test 2"]
    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
    mels = synthesizer.synthesize_spectrograms(texts, embeds)

    # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
    # can concatenate the mel spectrograms to a single one.
    mel = np.concatenate(mels, axis=1)
    # The vocoder can take a callback function to display the generation. More on that later. For
    # now we'll simply hide it like this:
    no_action = lambda *args: None
    print("\tTesting the vocoder...")
    # For the sake of making this test short, we'll pass a short target length. The target length
    # is the length of the wav segments that are processed in parallel. E.g. for audio sampled
    # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
    # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
    # that has a detrimental effect on the quality of the audio. The default parameters are
    # recommended in general.
    vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
    print("All tests passed! You can now synthesize speech.\n\n")
    ## Interactive speech generation
    print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
          "show how you can interface this project easily with your own. See the source code for "
          "an explanation of what is happening.\n")
    print("Interactive generation loop")
    num_generated = 0
    while True:
        try:
            # Get the reference audio filepath
            # message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
            #           "wav, m4a, flac, ...):\n"
            # in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
            in_fpath = Path(audio_file.replace("\"", "").replace("\'", ""))

            ## Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.
            # The following two methods are equivalent:
            # - Directly load from the filepath:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            print("Loaded file successfully")

            # Then we derive the embedding. There are many functions and parameters that the
            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            ## Generating the spectrogram
            # text = input("Write a sentence (+-20 words) to be synthesized:\n")
            print('\n\nThe text to convert to speech: ', text)
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

            # # Play the audio (non-blocking)
            # if not args.no_sound:
            #     sd.stop()
            #     sd.play(generated_wav, synthesizer.sample_rate)

            # Save it on the disk
            fpath = "demo_output_%02d.wav" % num_generated
            print(generated_wav.dtype)
            librosa.output.write_wav(fpath, generated_wav.astype(np.float32), synthesizer.sample_rate)
            num_generated += 1
            print("\nSaved output as %s\n\n" % fpath)
        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
import numpy as np
from pathlib import Path
from scipy.io import wavfile
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder

SAMPLE_RATE = 22050
embedding = None

# loading Models
encoder.load_model(BASE_PATH_VOICE_CLONE / Path("encoder/saved_models/pretrained.pt"))
synthesizer = Synthesizer(BASE_PATH_VOICE_CLONE / Path("synthesizer/saved_models/logs-pretrained/taco_pretrained"))
vocoder.load_model(BASE_PATH_VOICE_CLONE / Path("vocoder/saved_models/pretrained/pretrained.pt"))
print("All models loaded successfully")

import librosa


def _compute_embedding(audio):
    '''
    Description:
        Load the embedding from the audio file to clone.
    Input:
        audio: Audio File