Example #1
def DeepTalk_encoder(file_path,
                     model_save_path,
                     module_name,
                     preprocess=True,
                     normalize=True,
                     sampling_rate=8000,
                     duration=None):

    encoder.load_model(model_save_path, module_name=module_name)

    if (preprocess):
        wav = Synthesizer.load_preprocess_wav(file_path)
        ref_audio = encoder.preprocess_wav(wav)
    else:
        ref_audio, sr = librosa.load(file_path, sr=sampling_rate)

    if (duration is not None):
        ref_audio = ref_audio[0:int(duration * sampling_rate)]

    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio,
                                                       using_partials=True,
                                                       return_partials=True)

    if (normalize):
        embed = embed / np.linalg.norm(embed)

    return embed
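A minimal call sketch for the encoder wrapper above; the audio path, model path and module name below are hypothetical placeholders, not values from the source.

# Hypothetical inputs, for illustration only.
ref_embed = DeepTalk_encoder(file_path='ref_audio/speaker1.wav',
                             model_save_path='saved_models/encoder.pt',
                             module_name='DeepTalk_encoder')
print(ref_embed.shape)  # 1-D, L2-normalized speaker embedding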
Example #2
def synth(text, audio_file):
    """

    Parameters
    ----------
    text : string
        text to be said in synthesized voice
    audio_file : filepath
        filepath for audio file in wav format

    Returns
    -------
    generated_wav : numpy.ndarray
        Numpy padded array of synthesized audio signal

    """

    in_fpath = Path("audio.wav")
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)
    print("Synthesizing new audio...")
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                           mode="constant")

    return generated_wav
def change_mode(character: str = "Human_Man", tone: str = "neutral"):

    training_dir = voices_dict[character]['ID']
    tone_file = voices_dict[character]['tone'][tone] + '.flac'
    tone_dir = tone_file.split("-")[1]
    local_infpath = Path(f'{data_path}/{training_dir}/{tone_dir}/{tone_file}')

    global in_fpath, filenum, preprocessed_wav, embed, torch, vocoder
    if local_infpath != in_fpath and character is not None:
        if tone is None:
            tone = "neutral"

        print(
            f'Reference sound has changed; now loading {character}:{tone}...')
        with nostdout():
            in_fpath = local_infpath

            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(str(in_fpath))
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)

            embed = encoder.embed_utterance(preprocessed_wav)
            torch.manual_seed(seed)
            vocoder.load_model(vocoder_path)
            text_to_speech('Tea.', play_sound=False)
    else:
        print('Mode is already correct. No need to change.')
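change_mode above assumes a nested voices_dict mapping each character to a training-directory ID and per-tone file stems; a hypothetical sketch of that layout (the LibriSpeech-style "speaker-chapter-utterance" naming is an assumption):

# Hypothetical structure; change_mode appends ".flac" to the tone file stem and
# derives the subdirectory from the middle field of the stem.
voices_dict = {
    "Human_Man": {
        "ID": "84",                        # training directory under data_path
        "tone": {
            "neutral": "84-121123-0001",
            "happy": "84-121550-0012",
        },
    },
}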
Example #4
    def __init__(self):
        if (Text2SpeechModel == "dc_tts"):
            self.g = Graph(mode="synthesize")
            print("Text2Speech Tensorflow Graph loaded")
        elif (Text2SpeechModel == "RTVC"):
            enc_model_fpath = os.path.join(
                root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
            syn_model_dir = os.path.join(
                root_file_path, "RTVC",
                "synthesizer/saved_models/logs-pretrained")
            voc_model_fpath = os.path.join(
                root_file_path, "RTVC",
                "vocoder/saved_models/pretrained/pretrained.pt")
            encoder.load_model(enc_model_fpath)
            self.synthesizer = Synthesizer(os.path.join(
                syn_model_dir, "taco_pretrained"),
                                           low_mem=False)
            vocoder.load_model(voc_model_fpath)
            in_fpath = os.path.join("/",
                                    *root_file_path.split("/")[:-1],
                                    "REF/refaudioRTVC/ref.wav")
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            self.embeds = [embed]
        elif (Text2SpeechModel == "AudioSynth"):
            taco_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml'
            )
            tacotron2_config = AutoConfig.from_pretrained(
                taco_pretrained_config_path)
            taco_path = os.path.join(root_file_path,
                                     "AudioSynth/tacotron2-120k.h5")
            self.tacotron2 = TFAutoModel.from_pretrained(
                config=tacotron2_config,
                pretrained_path=taco_path,
                training=False,
                name="tacotron2")

            melgan_stft_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml'
            )
            melgan_stft_config = AutoConfig.from_pretrained(
                melgan_stft_pretrained_config_path)
            melgan_stft_path = os.path.join(root_file_path,
                                            "AudioSynth/melgan.stft-2M.h5")
            self.melgan_stft = TFAutoModel.from_pretrained(
                config=melgan_stft_config,
                pretrained_path=melgan_stft_path,
                name="melgan_stft")
            self.processor = AutoProcessor.from_pretrained(
                pretrained_path=os.path.join(
                    root_file_path, "AudioSynth/ljspeech_mapper.json"))
            mels, alignment_history, audios = do_synthesis(
                "Hello, how can I help you today?", self.tacotron2,
                self.melgan_stft, "TACOTRON", "MELGAN-STFT", self.processor)
    def preprocess_embeddings(self, path, ext_audio, ext_embed):
        for i in range(1, len(self._walker)):
            fileid = self._walker[i]

            speaker_id, chapter_id, utterance_id = fileid.split("-")

            fileid_audio = speaker_id + "-" + chapter_id + "-" + utterance_id
            file_audio = fileid_audio + ext_audio
            file_audio = os.path.join(path, speaker_id, chapter_id, file_audio)
            file_embed = fileid_audio + ext_embed
            file_embed = os.path.join(path, speaker_id, chapter_id, file_embed)

            # Load audio
            waveform, sample_rate = torchaudio.load(file_audio)

            print("Loaded file: ", fileid)

            # Calculate speaker embedding
            wav = waveform.transpose(0, 1).detach().numpy().squeeze()
            preprocessed_wav = styleEncoder.preprocess_wav(wav, sample_rate)
            embedding = styleEncoder.embed_utterance(preprocessed_wav)

            # Save embeddings to corresponding csv files
            data = asarray(embedding)
            savetxt(file_embed, data, delimiter=',')

            print("Saved embedding: ", file_embed)
Example #6
    def generate_voice(self, in_fpath, text, out_fpath):
        try:
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            print("Loaded file successfully")

            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")
            ## Generating the waveform
            print("Synthesizing the waveform:")
            generated_wav = vocoder.infer_waveform(spec)

            generated_wav = np.pad(generated_wav,
                                   (0, self.synthesizer.sample_rate),
                                   mode="constant")
            librosa.output.write_wav(out_fpath,
                                     generated_wav.astype(np.float32),
                                     self.synthesizer.sample_rate)
            print("\nSaved output as %s\n\n" % out_fpath)

        except Exception as e:
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
Example #7
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        fref = '-'.join([self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name])
        ftime = '{}'.format(int(time.time()))
        ftext = self.ui.text_prompt.toPlainText()
        fms = int(len(wav) * 1000 / Synthesizer.sample_rate)
        fname = filename_formatter('{}_{}_{}ms_{}.wav'.format(fref, ftime, fms, ftext))
        audio.save_wav(wav, _out_wav_dir.joinpath(fname), Synthesizer.sample_rate)  # save

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

        # Add the utterance
        name = speaker_name + "_gen_%05d" % int(time.time())
        utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)

        np.save(_out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save

        self.utterances.add(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
Example #8
    def predict(self, path):
        # fpath = '/home/ali/Desktop/a2lsv/deneme/'
        fpaths = glob(path+"/*.wav")
        embedings = []
        embedingsDict = {}
        for fpath in fpaths:
            wav = librosa.load(fpath, sr=16000)[0]
            encoder_wav = encoder.preprocess_wav(wav)
            embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)
            embed = np.array(embed).reshape(-1)
            embedings.append(embed)
            embedingsDict[fpath.split("/")[-1].split(".wav")[0]] = embed

        pickle.dump(embedingsDict, open(path+"/embedingsDict.pickle", 'wb'))

        # reducer = TSNE()
        reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embedings)))), metric="cosine")
        projections = reducer.fit_transform(embedings)
        
        thresh = 1
        clusters = hcluster.fclusterdata(projections, thresh, criterion="distance")

        speakerSlices = {}
        for fpath, speaker in zip(fpaths, clusters):
            speaker = str(speaker)
            audioId = fpath.split('/')[-1].split('.')[0]
            speakerSlices.setdefault(speaker, []).append(int(audioId))
        for v in speakerSlices.values():
            v.sort()
        return speakerSlices
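The UMAP step above uses a cosine metric, which matches how these speaker embeddings are usually compared; a hypothetical helper for scoring two embeddings directly:

import numpy as np

def embedding_similarity(embed_a, embed_b):
    # Cosine similarity; reduces to a plain dot product when both vectors are unit-norm.
    return float(np.dot(embed_a, embed_b) /
                 (np.linalg.norm(embed_a) * np.linalg.norm(embed_b)))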
Example #9
    def add_real_utterance(self, wav, name, speaker_name):
        # Compute the mel spectrogram
        spec = Synthesizer.make_spectrogram(wav)
        self.ui.draw_spec(spec, "current")

        # Compute the embedding
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        np.save(self._out_embed_dir.joinpath(name + '.npy'),
                embed,
                allow_pickle=False)  # save

        # Add the utterance
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, False)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "current")
        self.ui.draw_umap_projections(self.utterances)
Example #10
    def vocode(self):
        speaker_name, spec, breaks, _ = self.current_generated
        assert spec is not None

        # Synthesize the waveform
        if not vocoder.is_loaded():
            self.init_vocoder()

        def vocoder_progress(i, seq_len, b_size, gen_rate):
            real_time_factor = (gen_rate / Synthesizer.sample_rate) * 1000
            line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
                   % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
            self.ui.log(line, "overwrite")
            self.ui.set_loading(i, seq_len)

        if self.ui.current_vocoder_fpath is not None:
            self.ui.log("")
            wav = vocoder.infer_waveform(spec,
                                         progress_callback=vocoder_progress)
        else:
            self.ui.log("Waveform generation with Griffin-Lim... ")
            wav = Synthesizer.griffin_lim(spec)
        self.ui.set_loading(0)
        self.ui.log(" Done!", "append")

        # Add breaks
        b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size)
        b_starts = np.concatenate(([0], b_ends[:-1]))
        wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
        breaks = [np.zeros(int(0.15 * Synthesizer.sample_rate))] * len(breaks)
        wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

        # Play it
        wav = wav / np.abs(wav).max() * 0.97
        self.ui.play(wav, Synthesizer.sample_rate)

        self.ui.save_button.setDisabled(False)

        # Compute the embedding
        # TODO: this is problematic with different sampling rates, gotta fix it
        if not encoder.is_loaded():
            self.init_encoder()
        encoder_wav = encoder.preprocess_wav(wav)
        embed, partial_embeds, _ = encoder.embed_utterance(
            encoder_wav, return_partials=True)

        # Add the utterance
        if speaker_name is not None:
            name = speaker_name
        else:
            name = "unknown"
        name = name + "_gen_%05d" % np.random.randint(100000)
        utterance = Utterance(name, speaker_name, wav, spec, embed,
                              partial_embeds, True)
        self.utterances.add(utterance)
        self.ui.register_utterance(utterance)

        # Plot it
        self.ui.draw_embed(embed, name, "generated")
        self.ui.draw_umap_projections(self.utterances)
Example #11
def load_model(in_fpath, parser):

    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_dir", type=Path,
                        default="synthesizer/saved_models/logs-pretrained/",
                        help="Directory containing the synthesizer model")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--low_mem", action="store_true",
                        help="If True, the memory used by the synthesizer will be freed after "
                             "each use. Adds a large overhead but allows saving some GPU memory "
                             "on lower-end GPUs.")
    parser.add_argument("--no_sound", action="store_true",
                        help="If True, audio won't be played.")
    args = parser.parse_args()

    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    preprocessed_wav = encoder.preprocess_wav(in_fpath)
    original_wav, sampling_rate = librosa.load(in_fpath)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    embed = encoder.embed_utterance(preprocessed_wav)

    return synthesizer, sampling_rate, embed
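A minimal call sketch for load_model above; it expects an argparse parser that it extends with the model-path arguments, and the reference audio path is hypothetical.

import argparse

parser = argparse.ArgumentParser(description="voice cloning demo")
synthesizer, sampling_rate, embed = load_model("ref_audio/speaker1.wav", parser)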
Example #12
    def initialize(self):
        print("Running a test of your configuration...\n")
        if not torch.cuda.is_available():
            print(
                "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                "for deep learning, ensure that the drivers are properly installed, and that your "
                "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                "not supported.")
            quit(-1)
        print("PyTorch is available and working...")
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print(
            "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" %
            (torch.cuda.device_count(), device_id, gpu_properties.name,
             gpu_properties.major, gpu_properties.minor,
             gpu_properties.total_memory / 1e9))
        ## Load the models one by one.

        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)

        vocoder.load_model(self.voc_model_fpath)

        ## Run a test
        print("Testing your configuration with small inputs.")
        print("\tTesting the encoder...")
        encoder.embed_utterance(np.zeros(encoder.sampling_rate))

        embed = np.random.rand(speaker_embedding_size)
        embed /= np.linalg.norm(embed)
        embeds = [embed, np.zeros(speaker_embedding_size)]
        texts = ["test 1", "test 2"]
        print(
            "\tTesting the synthesizer... (loading the model will output a lot of text)"
        )
        mels = self.synthesizer.synthesize_spectrograms(texts, embeds)

        mel = np.concatenate(mels, axis=1)
        no_action = lambda *args: None
        print("\tTesting the vocoder...")
        vocoder.infer_waveform(mel,
                               target=200,
                               overlap=50,
                               progress_callback=no_action)
        print("All test passed! You can now synthesize speech.\n\n")
Example #13
def transform_embed(wav, encoder_model_fpath=Path()):
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed
Example #14
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = fpaths
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    return encoder.embed_utterance(wav)
    def extract_utterance_feats_spkr(self, data_utterance_path, is_full_ppg=False):
        """Get PPG and Mel (+ optional F0) for an utterance.

        Args:
            data_utterance_path: The path to the data utterance protocol buffer.
            is_full_ppg: If True, will use the full PPGs.

        Returns:
            feat_pairs: A list, each is a [pps, mel, dvec(spkr embedding)] pair.
        """
        utt = Utterance()
        fs, wav = wavfile.read(data_utterance_path)
        utt.fs = fs
        utt.wav = wav
        utt.ppg = get_ppg(data_utterance_path, self.ppg_deps)

        audio = torch.FloatTensor(utt.wav.astype(np.float32))
        fs = utt.fs

        if fs != self.stft.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                fs, self.stft.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        # (1, n_mel_channels, T)
        acoustic_feats = self.stft.mel_spectrogram(audio_norm)
        # (n_mel_channels, T)
        acoustic_feats = torch.squeeze(acoustic_feats, 0)
        # (T, n_mel_channels)
        acoustic_feats = acoustic_feats.transpose(0, 1)
        
        #print("encoder model path", self.encoder_model_fpath)
        
        from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath)
        
        #wav = np.load(data_utterance_path)
        wav = encoder.preprocess_wav(data_utterance_path) # wav
        embed = encoder.embed_utterance(wav)
        #print("spkr embedding", embed)
        #print("shape of ppg, acoustic feats and spkr embedding", (utt.ppg).shape, acoustic_feats.shape, embed.shape)
        
        if is_full_ppg:
            if self.is_append_f0:
                ppg_f0 = append_ppg(utt.ppg, utt.f0)
                return [ppg_f0, acoustic_feats, embed]
            else:
                return [utt.ppg, acoustic_feats, embed]
        else:
            if self.is_append_f0:
                ppg_f0 = append_ppg(utt.monophone_ppg, utt.f0)
                return [ppg_f0, acoustic_feats, embed]
            else:
                return [utt.monophone_ppg, acoustic_feats, embed]
Example #16
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
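This per-utterance function is typically mapped over many (wav_fpath, embed_fpath) pairs; a minimal dispatch sketch with hypothetical paths. If the encoder runs on a GPU, the CUDA-in-forked-subprocess caveat addressed in the next example applies.

from functools import partial
from multiprocessing import Pool
from pathlib import Path

# Hypothetical layout: preprocessed waveforms stored as .npy, one target
# embedding path per waveform.
pairs = [(Path("wavs/wav-0001.npy"), Path("embeds/embed-0001.npy")),
         (Path("wavs/wav-0002.npy"), Path("embeds/embed-0002.npy"))]
func = partial(embed_utterance,
               encoder_model_fpath=Path("encoder/saved_models/pretrained.pt"))
with Pool(4) as pool:
    pool.map(func, pairs)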
Example #17
    def get_embed(self, wav):
        # from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath, device='cpu')
            # Use the CPU to avoid the following error:
            # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the ‘spawn’ start method"

        wav = encoder.preprocess_wav(wav)
        embed = encoder.embed_utterance(wav)
        return embed
Example #18
def get_spk_embed(load_path, enc_model_fpath):

    file_name = load_path.split('/')[-1]
    wav = load_wav(load_path)
    encoder.load_model(enc_model_fpath)
    preprocessed_wav = encoder.preprocess_wav(load_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    spk_embd = torch.tensor(embed).unsqueeze(0)

    return spk_embd, file_name
Example #19
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = embed_fpath = fpaths
    embed_fpath = embed_fpath.replace(".wav", ".npy")
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #20
def run_voiceCloning(filename):
    in_fpath = dataPath + "/" + filename

    #transforming mp3 into wav
    subprocess.call(['ffmpeg', '-i', in_fpath + '.mp3', in_fpath + '.wav'])
    time.sleep(5)
    #running the encoder on the audio input
    original_wav, sampling_rate = librosa.load(Path(in_fpath + '.wav'))
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    #getting the embeds from the encoder
    embed = encoder.embed_utterance(preprocessed_wav)

    return audioFromEmbeds(filename, embed)
Example #21
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src

    if skip_existing and embed_fpath.is_file():
        return

    wav = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
def clone_voice(sentence, results_file):
    """Adapted from 'demo_cli.py'"""
    u_path = Path('utterance.wav')
    results_path = Path(results_file)
    
    preprocessed_wav = encoder.preprocess_wav(u_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    specs = synthesizer.synthesize_spectrograms([sentence], [embed])
    generated_wav = vocoder.infer_waveform(specs[0])
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    
    librosa.output.write_wav(results_path, generated_wav.astype(np.float32), 
                             synthesizer.sample_rate)
Example #23
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath, _ = fpaths
    # try:
    #     wav = np.load(wav_fpath)
    # except ValueError as e:
    #     print(e)
    #     wav = np.load(wav_fpath, allow_pickle=True)
    wav = encoder.preprocess_wav(wav_fpath)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #24
    def gen_audio(self, ref_audio, text):
        try:
            in_fpath = Path(ref_audio.replace("\"", "").replace("\'", ""))

            ## Computing the embedding
            # First, we load the wav using the function that the speaker encoder provides. This is
            # important: there is preprocessing that must be applied.

            # The following two methods are equivalent:
            # - Directly load from the filepath:
            # preprocessed_wav = encoder.preprocess_wav(in_fpath)
            # - If the wav is already loaded:
            original_wav, sampling_rate = librosa.load(in_fpath, sr=None)
            preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
            print("Loaded file succesfully, rate=%s" % sampling_rate)

            # Then we derive the embedding. There are many functions and parameters that the
            # speaker encoder interfaces. These are mostly for in-depth research. You will typically
            # only use this function (with its default parameters):
            embed = encoder.embed_utterance(preprocessed_wav)
            print("Created the embedding")

            ## Generating the spectrogram
            # The synthesizer works in batch, so you need to put your data in a list or numpy array
            texts = [text]
            embeds = [embed]
            # If you know what the attention layer alignments are, you can retrieve them here by
            # passing return_alignments=True
            specs = self.synthesizer.synthesize_spectrograms(texts, embeds)
            spec = specs[0]
            print("Created the mel spectrogram")

            ## Generating the waveform
            print("Synthesizing the waveform:")
            # Synthesizing the waveform is fairly straightforward. Remember that the longer the
            # spectrogram, the more time-efficient the vocoder.
            generated_wav = vocoder.infer_waveform(spec)

            ## Post-generation
            # There's a bug with sounddevice that makes the audio cut one second earlier, so we
            # pad it.
            # generated_wav = np.pad(generated_wav, (0, self.synthesizer.sample_rate), mode="constant")
            print("\n samples = %s @ %s" % (len(generated_wav), self.synthesizer.sample_rate))

            return generated_wav

        except Exception as e:
            traceback.print_exc()
            print("Caught exception: %s" % repr(e))
            print("Restarting\n")
def create_embedding(path):
    if EMBEDDING_TYPE == "ge2e":
        in_fpath = Path(path.replace("\"", "").replace("\'", ""))
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        return encoder.embed_utterance(preprocessed_wav)
    else:
        # in_fpath = Path(path.replace("\"", "").replace("\'", ""))
        # preprocessed_wav = encoder.preprocess_wav(in_fpath)
        samplerate, data = read(path.replace("\"", "").replace("\'", ""))
        if samplerate != 16000:
            sys.exit(f"{path} does not have sample rate of 16000")
        emb = module(samples=np.array(data, dtype=float),
                     sample_rate=16000)['embedding']
        emb.shape.assert_is_compatible_with([None, 2048])
        return emb.numpy()
def signup(wav_fpath: Path, username, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    embed_fpath = signup_dir.joinpath(username + ".npy")
    wav = encoder.preprocess_wav(str(wav_fpath))
    embed = encoder.embed_utterance(wav)
    if os.path.exists(embed_fpath):
        old_embed = np.load(embed_fpath)
        embed = old_embed + embed
        embed /= np.linalg.norm(embed, 2)
        os.remove(embed_fpath)
    np.save(embed_fpath, embed, allow_pickle=False)
    print(username + " signed up.")
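A natural counterpart to signup is a verification step that scores a new utterance against the stored embedding; the sketch below is hypothetical, and the 0.75 threshold is illustrative rather than taken from the source.

import os
import numpy as np

def verify(wav_fpath, username, threshold=0.75):
    embed_fpath = signup_dir.joinpath(username + ".npy")
    if not os.path.exists(embed_fpath):
        return False
    enrolled = np.load(embed_fpath)
    wav = encoder.preprocess_wav(str(wav_fpath))
    embed = encoder.embed_utterance(wav)
    # Cosine similarity between the enrolled and the new embedding.
    score = np.dot(enrolled, embed) / (np.linalg.norm(enrolled) * np.linalg.norm(embed))
    return score >= threshold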
Example #27
def transform_embed(wav, encoder_model_fpath=Path()):
    # from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_ = encoder.preprocess_wav(wav)
    # Take segment
    segment_length = 2 * encoder.sampling_rate  # randomly pick a 2-second segment to build the speaker embedding
    if len(wav_) > segment_length:
        max_audio_start = len(wav_) - segment_length
        audio_start = random.randint(0, max_audio_start)
        wav_ = wav_[audio_start:audio_start + segment_length]

    embed = encoder.embed_utterance(wav_)
    return embed
def _compute_embedding(audio):
    '''
    Compute the speaker embedding of the audio file to clone.

    Input:
        audio: audio file (path or waveform accepted by encoder.preprocess_wav)

    Output:
        None; the result is stored in the module-level `embedding` global.
    '''
    global embedding
    embedding = None
    embedding = encoder.embed_utterance(encoder.preprocess_wav(audio, SAMPLE_RATE))
def tts(input_dict):
    '''
    Flow:
    0) Check if audio has embeddings (Not yet)
    1) Encode the audio
    2) Synthesizer the text with embeddings
    3) Vocoder out the fake wav
    '''
    # init
    output_dict = {"data": {}}

    # loop the input
    for audio_name, raw_audio in input_dict["data"].items():
        wav_name_no_ext = Path(audio_name).stem
        saved_path_obj = Path.cwd() / "data/output"

        # step 1
        print("Step 1")
        raw_audio_np, sample_rate = librosa.load(io.BytesIO(raw_audio))
        preprocessed_wav = encoder.preprocess_wav(raw_audio_np, sample_rate)
        embeddings = encoder.embed_utterance(preprocessed_wav)

        # step 2
        print("Step 2")
        splitted_text = input_dict["text"].split(".")
        clean_text_list = [text for text in splitted_text if len(text) > 0]
        if len(clean_text_list) == 0:
            raise Exception("Empty text field")
        sentence_count = len(clean_text_list)
        embeddings_list = [embeddings] * sentence_count
        specs = synthesizer.synthesize_spectrograms(clean_text_list,
                                                    embeddings_list)

        # step 3
        print("Step 3")
        for index, spec in enumerate(specs):
            generated_wav = vocoder.infer_waveform(spec)
            # pad with one second of silence so playback is not cut short
            generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate),
                                   mode="constant")

            file_name = "{}_tts_{}.wav".format(wav_name_no_ext, index)
            file_path = saved_path_obj / file_name
            sf.write(str(file_path), generated_wav.astype(np.float32),
                     synthesizer.sample_rate, 'PCM_16')
            output_dict["data"][index] = file_path

    return output_dict
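A minimal call sketch for tts above: the "data" values are raw audio bytes and "text" holds the sentences to synthesize (file names hypothetical).

with open("data/input/reference.wav", "rb") as f:
    request = {"data": {"reference.wav": f.read()},
               "text": "Hello there. This is a cloned voice."}
result = tts(request)
print(result["data"])  # {0: .../reference_tts_0.wav, 1: .../reference_tts_1.wav}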
Example #30
def embed_utterance(fpaths, encoder_model_fpath, hparams):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    if embed_fpath.exists():
        return
    # wav = np.load(wav_fpath)
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)