Example #1
def multi_parse(output_dir, encoder_path, img_list):
    saved_img_f_list = [
        output_dir / img.stem for img in img_list
        if not os.path.exists(str(output_dir / img.stem) + '.npy')
    ]
    if len(saved_img_f_list) == 0:
        return

    # https://stackoverflow.com/questions/50412477/python-multiprocessing-grab-free-gpu
    if torch.cuda.is_available():
        # Derive a worker index from the multiprocessing process name
        # (e.g. "ForkPoolWorker-3") and map it onto the available GPUs.
        proc_name = multiprocessing.current_process().name
        proc_id = int(proc_name[proc_name.find('-') + 1:])
        gpu_id = proc_id % torch.cuda.device_count()
        device = torch.device("cuda:{}".format(gpu_id))
    else:
        device = torch.device("cpu")

    encoder.load_model(encoder_path, multi_gpu=False, device=device)
    input_imgs = np.array([
        process_img(cv2.imread(str(img)), 224) / 255. for img in img_list
        if not os.path.exists(str(output_dir / img.stem) + '.npy')
    ])
    print(input_imgs.shape)
    embeddings = encoder.embed_imgs(input_imgs)

    torch.cuda.empty_cache()
    # Save each embedding next to its image stem; np.save appends the .npy suffix.
    for i in range(len(embeddings)):
        np.save(saved_img_f_list[i], embeddings[i])
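The modulo trick above only spreads work across GPUs when multi_parse runs inside named worker processes (e.g. "ForkPoolWorker-3"). A minimal driver sketch, with hypothetical paths and a simple chunking scheme, might look like this:

# Hypothetical driver for multi_parse above: each pool worker is named
# "ForkPoolWorker-<n>", so the process-name index maps workers onto GPUs.
import multiprocessing
from functools import partial
from pathlib import Path

if __name__ == "__main__":
    output_dir = Path("embeddings")                             # assumed output directory
    encoder_path = Path("encoder/saved_models/pretrained.pt")   # assumed encoder weights
    img_list = sorted(Path("images").glob("*.jpg"))             # assumed input images

    n_workers = 4
    chunks = [img_list[i::n_workers] for i in range(n_workers)]  # one chunk per worker

    with multiprocessing.Pool(n_workers) as pool:
        pool.map(partial(multi_parse, output_dir, encoder_path), chunks)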
Example #2
def get_model():
    model_save_path = Path(
        '/scratch2/chowdh51/Code/DeepTalk/Deployment/encoder/saved_models/model_GST.pt'
    )
    module_name = 'model_GST'
    encoder.load_model(model_save_path, module_name=module_name)
    return encoder
Example #3
def load_model(in_fpath, parser):

	parser.add_argument("-e", "--enc_model_fpath", type=Path, 
		        default="encoder/saved_models/pretrained.pt",
		        help="Path to a saved encoder")
	parser.add_argument("-s", "--syn_model_dir", type=Path, 
		        default="synthesizer/saved_models/logs-pretrained/",
		        help="Directory containing the synthesizer model")
	parser.add_argument("-v", "--voc_model_fpath", type=Path, 
		        default="vocoder/saved_models/pretrained/pretrained.pt",
		        help="Path to a saved vocoder")
	parser.add_argument("--low_mem", action="store_true", help=\
	"If True, the memory used by the synthesizer will be freed after each use. Adds large "
	"overhead but allows to save some GPU memory for lower-end GPUs.")
	parser.add_argument("--no_sound", action="store_true", help=\
	"If True, audio won't be played.")
	args = parser.parse_args()
	encoder.load_model(args.enc_model_fpath)
	synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
	vocoder.load_model(args.voc_model_fpath)

	# Preprocess directly from the file path, or from audio already loaded with librosa;
	# the two calls below are equivalent.
	preprocessed_wav = encoder.preprocess_wav(in_fpath)
	original_wav, sampling_rate = librosa.load(in_fpath)
	preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
	embed = encoder.embed_utterance(preprocessed_wav)
	
	return synthesizer, sampling_rate, embed
Example #4
def DeepTalk_encoder(file_path,
                     model_save_path,
                     module_name,
                     preprocess=True,
                     normalize=True,
                     sampling_rate=8000,
                     duration=None):

    encoder.load_model(model_save_path, module_name=module_name)

    if (preprocess):
        wav = Synthesizer.load_preprocess_wav(file_path)
        ref_audio = encoder.preprocess_wav(wav)
    else:
        ref_audio, sr = librosa.load(file_path, sr=sampling_rate)

    if (duration is not None):
        ref_audio = ref_audio[0:int(duration * sampling_rate)]

    embed, partial_embeds, _ = encoder.embed_utterance(ref_audio,
                                                       using_partials=True,
                                                       return_partials=True)

    if (normalize):
        embed = embed / np.linalg.norm(embed)

    return embed
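Since the embedding is L2-normalized when normalize=True, the dot product of two such embeddings is their cosine similarity. A small usage sketch, assuming DeepTalk_encoder above is importable; the wav file names and model path are placeholders:

import numpy as np
from pathlib import Path

# Hypothetical comparison of two speakers with DeepTalk_encoder (paths are placeholders).
model_save_path = Path("encoder/saved_models/model_GST.pt")
emb_a = DeepTalk_encoder("speaker_a.wav", model_save_path, "model_GST")
emb_b = DeepTalk_encoder("speaker_b.wav", model_save_path, "model_GST")
print("cosine similarity:", float(np.dot(emb_a, emb_b)))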
Example #5
    def load_models(self):
        if not torch.cuda.is_available():
            print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                  "for deep learning, ensure that the drivers are properly installed, and that your "
                  "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                  "not supported.", file=sys.stderr)
            quit(-1)
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))

        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)
        print("Loaded Encoder")
        self.synthesizer = Synthesizer(self.syn_model_dir.joinpath("taco_pretrained"), low_mem=self.low_mem)
        print("Loaded Synth")
        vocoder.load_model(self.voc_model_fpath)
        print("Loaded Vocoder")
Example #6
    def __init__(self):
        if (Text2SpeechModel == "dc_tts"):
            self.g = Graph(mode="synthesize")
            print("Text2Speech Tensorflow Graph loaded")
        elif (Text2SpeechModel == "RTVC"):
            enc_model_fpath = os.path.join(
                root_file_path, "RTVC", "encoder/saved_models/pretrained.pt")
            syn_model_dir = os.path.join(
                root_file_path, "RTVC",
                "synthesizer/saved_models/logs-pretrained")
            voc_model_fpath = os.path.join(
                root_file_path, "RTVC",
                "vocoder/saved_models/pretrained/pretrained.pt")
            encoder.load_model(enc_model_fpath)
            self.synthesizer = Synthesizer(os.path.join(
                syn_model_dir, "taco_pretrained"),
                                           low_mem=False)
            vocoder.load_model(voc_model_fpath)
            in_fpath = os.path.join("/",
                                    *root_file_path.split("/")[:-1],
                                    "REF/refaudioRTVC/ref.wav")
            preprocessed_wav = encoder.preprocess_wav(in_fpath)
            original_wav, sampling_rate = librosa.load(in_fpath)
            preprocessed_wav = encoder.preprocess_wav(original_wav,
                                                      sampling_rate)
            embed = encoder.embed_utterance(preprocessed_wav)
            self.embeds = [embed]
        elif (Text2SpeechModel == "AudioSynth"):
            taco_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/tacotron2/conf/tacotron2.v1.yaml'
            )
            tacotron2_config = AutoConfig.from_pretrained(
                taco_pretrained_config_path)
            taco_path = os.path.join(root_file_path,
                                     "AudioSynth/tacotron2-120k.h5")
            self.tacotron2 = TFAutoModel.from_pretrained(
                config=tacotron2_config,
                pretrained_path=taco_path,
                training=False,
                name="tacotron2")

            melgan_stft_pretrained_config_path = os.path.join(
                root_file_path,
                'AudioSynth/TensorFlowTTS/examples/melgan.stft/conf/melgan.stft.v1.yaml'
            )
            melgan_stft_config = AutoConfig.from_pretrained(
                melgan_stft_pretrained_config_path)
            melgan_stft_path = os.path.join(root_file_path,
                                            "AudioSynth/melgan.stft-2M.h5")
            self.melgan_stft = TFAutoModel.from_pretrained(
                config=melgan_stft_config,
                pretrained_path=melgan_stft_path,
                name="melgan_stft")
            self.processor = AutoProcessor.from_pretrained(
                pretrained_path=os.path.join(
                    root_file_path, "AudioSynth/ljspeech_mapper.json"))
            mels, alignment_history, audios = do_synthesis(
                "Hello, how can I help you today?", self.tacotron2,
                self.melgan_stft, "TACOTRON", "MELGAN-STFT", self.processor)
Example #7
def transform_embed(wav, encoder_model_fpath=Path()):
    from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    return embed
Example #8
def setup():
    global synthesizer
    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    vocoder_weights = Path("vocoder/saved_models/pretrained/pretrained.pt")
    syn_dir = Path("synthesizer/saved_models/logs-pretrained/taco_pretrained")
    encoder.load_model(encoder_weights)
    synthesizer = Synthesizer(syn_dir)
    vocoder.load_model(vocoder_weights)
Example #9
    def init_encoder(self):
        model_fpath = self.ui.current_encoder_fpath

        self.ui.log("Loading the encoder %s... " % model_fpath)
        self.ui.set_loading(1)
        start = timer()
        encoder.load_model(model_fpath)
        self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append")
        self.ui.set_loading(0)
Example #10
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = fpaths
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    return encoder.embed_utterance(wav)
Example #11
    def extract_utterance_feats_spkr(self, data_utterance_path, is_full_ppg=False):
        """Get PPG and Mel (+ optional F0) for an utterance.

        Args:
            data_utterance_path: The path to the data utterance protocol buffer.
            is_full_ppg: If True, will use the full PPGs.

        Returns:
            feat_pairs: A list; each element is a [ppg, mel, dvec (speaker embedding)] triple.
        """
        utt = Utterance()
        fs, wav = wavfile.read(data_utterance_path)
        utt.fs = fs
        utt.wav = wav
        utt.ppg = get_ppg(data_utterance_path, self.ppg_deps)

        audio = torch.FloatTensor(utt.wav.astype(np.float32))
        fs = utt.fs

        if fs != self.stft.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                fs, self.stft.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        # (1, n_mel_channels, T)
        acoustic_feats = self.stft.mel_spectrogram(audio_norm)
        # (n_mel_channels, T)
        acoustic_feats = torch.squeeze(acoustic_feats, 0)
        # (T, n_mel_channels)
        acoustic_feats = acoustic_feats.transpose(0, 1)
        
        #print("encoder model path", self.encoder_model_fpath)
        
        from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath)
        
        #wav = np.load(data_utterance_path)
        wav = encoder.preprocess_wav(data_utterance_path) # wav
        embed = encoder.embed_utterance(wav)
        #print("spkr embedding", embed)
        #print("shape of ppg, acoustic feats and spkr embedding", (utt.ppg).shape, acoustic_feats.shape, embed.shape)
        
        if is_full_ppg:
            if self.is_append_f0:
                ppg_f0 = append_ppg(utt.ppg, utt.f0)
                return [ppg_f0, acoustic_feats, embed]
            else:
                return [utt.ppg, acoustic_feats, embed]
        else:
            if self.is_append_f0:
                ppg_f0 = append_ppg(utt.monophone_ppg, utt.f0)
                return [ppg_f0, acoustic_feats, embed]
            else:
                return [utt.monophone_ppg, acoustic_feats, embed]
Example #12
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #13
def get_spk_embed(load_path, enc_model_fpath):

    file_name = load_path.split('/')[-1]
    wav = load_wav(load_path)
    encoder.load_model(enc_model_fpath)
    preprocessed_wav = encoder.preprocess_wav(load_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    spk_embd = torch.tensor(embed).unsqueeze(0)

    return spk_embd, file_name
Example #14
    def get_embed(self, wav):
        # from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath, device='cpu')
            # Load the encoder on the CPU to avoid this error in forked worker processes:
            # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the ‘spawn’ start method"

        wav = encoder.preprocess_wav(wav)
        embed = encoder.embed_utterance(wav)
        return embed
Example #15
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath = embed_fpath = fpaths
    embed_fpath = embed_fpath.replace(".wav", ".npy")
    wav, rate = librosa.load(wav_fpath)
    wav = encoder.preprocess_wav(wav, rate)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #16
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src

    if skip_existing and embed_fpath.is_file():
        return

    wav = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #17
    def __init__(self):
        # Info & args
        enc_model_fpath = Path("encoder/saved_models/pretrained.pt")

        syn_model_dir = Path("synthesizer/saved_models/logs-pretrained/")
        voc_model_fpath = Path("vocoder/saved_models/pretrained/pretrained.pt")
        low_mem = False

        ## Load the models one by one.
        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(enc_model_fpath)
        self.synthesizer = Synthesizer(syn_model_dir.joinpath("taco_pretrained"), low_mem=low_mem)
        vocoder.load_model(voc_model_fpath)
Example #18
def embed_utterance(fpaths, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath, _ = fpaths
    # try:
    #     wav = np.load(wav_fpath)
    # except ValueError as e:
    #     print(e)
    #     wav = np.load(wav_fpath, allow_pickle=True)
    wav = encoder.preprocess_wav(wav_fpath)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #19
def signup(wav_fpath: Path, username, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    embed_fpath = signup_dir.joinpath(username + ".npy")
    wav = encoder.preprocess_wav(str(wav_fpath))
    embed = encoder.embed_utterance(wav)
    if os.path.exists(embed_fpath):
        old_embed = np.load(embed_fpath)
        embed = old_embed + embed
        embed /= np.linalg.norm(embed, 2)
        os.remove(embed_fpath)
    np.save(embed_fpath, embed, allow_pickle=False)
    print(username + " signed up.")
Example #20
File: data_utils.py  Project: X-CCS/zhrtvc
def transform_embed(wav, encoder_model_fpath=Path()):
    # from encoder import inference as encoder
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_ = encoder.preprocess_wav(wav)
    # Take segment
    segment_length = 2 * encoder.sampling_rate  # randomly take a 2-second segment to build the speaker embedding
    if len(wav_) > segment_length:
        max_audio_start = len(wav_) - segment_length
        audio_start = random.randint(0, max_audio_start)
        wav_ = wav_[audio_start:audio_start + segment_length]

    embed = encoder.embed_utterance(wav_)
    return embed
Example #21
def embed_utterance(fpaths, encoder_model_fpath, hparams):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    if embed_fpath.exists():
        return
    # wav = np.load(wav_fpath)
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)
    np.save(embed_fpath, embed, allow_pickle=False)
Example #22
    def initialize(self):
        print("Running a test of your configuration...\n")
        if not torch.cuda.is_available():
            print(
                "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
                "for deep learning, ensure that the drivers are properly installed, and that your "
                "CUDA version matches your PyTorch installation. CPU-only inference is currently "
                "not supported.")
            quit(-1)
        print("PyTorch is available and working...")
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print(
            "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
            "%.1fGb total memory.\n" %
            (torch.cuda.device_count(), device_id, gpu_properties.name,
             gpu_properties.major, gpu_properties.minor,
             gpu_properties.total_memory / 1e9))
        ## Load the models one by one.

        print("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(self.enc_model_fpath)

        vocoder.load_model(self.voc_model_fpath)

        ## Run a test
        print("Testing your configuration with small inputs.")
        print("\tTesting the encoder...")
        encoder.embed_utterance(np.zeros(encoder.sampling_rate))

        embed = np.random.rand(speaker_embedding_size)
        embed /= np.linalg.norm(embed)
        embeds = [embed, np.zeros(speaker_embedding_size)]
        texts = ["test 1", "test 2"]
        print(
            "\tTesting the synthesizer... (loading the model will output a lot of text)"
        )
        mels = self.synthesizer.synthesize_spectrograms(texts, embeds)

        mel = np.concatenate(mels, axis=1)
        no_action = lambda *args: None
        print("\tTesting the vocoder...")
        vocoder.infer_waveform(mel,
                               target=200,
                               overlap=50,
                               progress_callback=no_action)
        print("All test passed! You can now synthesize speech.\n\n")
Example #23
    def get_embed(self, wav):
        # from encoder import inference as encoder
        if not encoder.is_loaded():
            encoder.load_model(self.encoder_model_fpath, device='cpu')
            # Load the encoder on the CPU to avoid this error in forked worker processes:
            # "RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the ‘spawn’ start method"

        wav_ = encoder.preprocess_wav(wav)
        # Take segment
        segment_length = 2 * encoder.sampling_rate  # randomly take a 2-second segment to build the speaker embedding
        if len(wav_) > segment_length:
            max_audio_start = len(wav_) - segment_length
            audio_start = random.randint(0, max_audio_start)
            wav_ = wav_[audio_start:audio_start + segment_length]

        embed = encoder.embed_utterance(wav_)
        return embed
Example #24
def embed_utterance(fpaths, encoder_model_fpath, module_name, reject_list_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath, module_name=module_name)

    # Compute the speaker embedding of the utterance
    wav_fpath, embed_fpath = fpaths
    wav = np.load(wav_fpath)
    wav = encoder.preprocess_wav(wav)
    embed = encoder.embed_utterance(wav)

    if embed is None:
        # Record rejected utterances; the "with" block closes the file automatically.
        with open(reject_list_fpath, 'a') as reject_file:
            reject_file.write(str(os.path.basename(embed_fpath)) + '\n')
    else:
        if embed.shape[0] == 128:
            embed = np.concatenate((embed, embed), axis=0)
        np.save(embed_fpath, embed, allow_pickle=False)
Example #25
def setup():
    # Parse configs.  Globals nicer in this case
    with open("flowtron/infer.json") as f:
        data = f.read()

    global config
    config = json.loads(data)

    global data_config
    data_config = config["data_config"]
    global model_config
    model_config = config["model_config"]

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    global flowtron
    global waveglow
    global trainset
    
    encoder_weights = Path("encoder/saved_models/pretrained.pt")
    encoder.load_model(encoder_weights)

    torch.manual_seed(1234)
    torch.cuda.manual_seed(1234)

    #Load waveglow
    waveglow = torch.load("flowtron/tacotron2/waveglow/saved_models/waveglow_256channels_universal_v5.pt")['model'].cuda().eval()
    waveglow.cuda().half()
    for k in waveglow.convinv:
        k.float()
    waveglow.eval()
    
    #Load flowtron
    flowtron = Flowtron(**model_config).cuda()
    state_dict = torch.load("flowtron/saved_models/pretrained.pt", map_location='cpu')['model'].state_dict()
    flowtron.load_state_dict(state_dict)
    flowtron.eval()

    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(
        data_config['training_files'],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))
Example #26
    def embed_voice(self):
        encoder.load_model("encoder/saved_models/pretrained.pt")
        in_fpath = Path(self.voice_file)

        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is
        # important: there is preprocessing that must be applied.
        # The following two methods are equivalent:
        # - Directly load from the filepath:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded:
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interface exposes; these are mostly for in-depth research. You will
        # typically only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        return embed
Example #27
def gen_from_file(model, load_path, enc_model_fpath, save_path, batched,
                  target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.split('/')[-1]

    wav = load_wav(load_path)
    #save_wav(wav, f'{save_path}__{file_name}__{k}k_steps_target.wav')

    encoder.load_model(enc_model_fpath)
    preprocessed_wav = encoder.preprocess_wav(load_path)
    embed = encoder.embed_utterance(preprocessed_wav)
    spk_embd = torch.tensor(embed).unsqueeze(0)

    mel = melspectrogram(wav)
    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    #save_str = f'{save_path}__{file_name}__{k}k_steps_{batch_str}_spk_embed.wav'
    save_str = f'{file_name}'
    _ = model.generate(mel, spk_embd, save_str, batched, target, overlap,
                       hp.mu_law)
Example #28
def signin(wav_or_wavpath, encoder_model_fpath):
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav = encoder.preprocess_wav(wav_or_wavpath)
    embed = encoder.embed_utterance(wav)
    embed = np.reshape(embed, [np.shape(embed)[0], 1])  # [emb_dim, 1]

    signed_spk_embs = list(signup_dir.glob("*.npy"))
    signed_spk_name = [_dir.stem for _dir in signed_spk_embs]
    signed_spk_embs = [np.load(str(_dir)) for _dir in signed_spk_embs]
    signed_spk_embs = np.array(signed_spk_embs)  # [n, emb_dim]

    print(signed_spk_name)
    print(np.shape(signed_spk_embs), np.shape(embed))
    similar_score = np.matmul(signed_spk_embs, embed)
    similar_score = np.reshape(similar_score, [-1])
    sim_id = np.argmax(similar_score)
    sim_name = signed_spk_name[sim_id]
    for name, score in zip(signed_spk_name, similar_score):
        print(name, score)
    print("\nMatching name: ", sim_name)
Example #29
def clone(audio=None, audio_url=None, sentence=""):
    try:
        if not 10 <= len(sentence.split(" ")) <= 30:
            return {"error": "Sentence is invalid! (length must be 10 to 30 words)"}
        audio_data = audio
        if audio_url:
            # Link
            if "http://" in audio_url or "https://" in audio_url:
                header = {'User-Agent': 'Mozilla/5.0 (Windows NT x.y; Win64; x64; rv:9.0) Gecko/20100101 Firefox/10.0'}
                # Reject remote audio files larger than 10 MB
                r = requests.head(audio_url, headers=header, allow_redirects=True)
                size = r.headers.get('content-length', 0)
                size = int(size) / float(1 << 20)
                log.info("File size: {:.2f} Mb".format(size))
                if size > 10:
                    return {"error": "Input audio file is too large! (max 10Mb)"}
                r = requests.get(audio_url, headers=header, allow_redirects=True)
                audio_data = r.content
            # Base64
            elif len(audio_url) > 500:
                audio_data = base64.b64decode(audio_url)

        audio_path = generate_uid() + ".audio"
        with open(audio_path, "wb") as f:
            f.write(audio_data)

        # Load the models one by one.
        log.info("Preparing the encoder, the synthesizer and the vocoder...")
        encoder.load_model(Path("rtvc/encoder/saved_models/pretrained.pt"))
        synthesizer = Synthesizer(Path("rtvc/synthesizer/saved_models/logs-pretrained/taco_pretrained"))
        vocoder.load_model(Path("rtvc/vocoder/saved_models/pretrained/pretrained.pt"))

        # Computing the embedding
        original_wav, sampling_rate = librosa.load(audio_path)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        log.info("Loaded file successfully")

        if os.path.exists(audio_path):
            os.remove(audio_path)

        embed = encoder.embed_utterance(preprocessed_wav)
        log.info("Created the embedding")

        specs = synthesizer.synthesize_spectrograms([sentence], [embed])
        spec = np.concatenate(specs, axis=1)
        # spec = specs[0]
        log.info("Created the mel spectrogram")

        # Generating the waveform
        log.info("Synthesizing the waveform:")
        generated_wav = vocoder.infer_waveform(spec, progress_callback=lambda *args: None)

        # Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it.
        generated_wav = np.pad(generated_wav,
                               (0, synthesizer.sample_rate),
                               mode="constant")

        # Save it on the disk
        fp = tempfile.TemporaryFile()
        librosa.output.write_wav(fp, generated_wav.astype(np.float32), synthesizer.sample_rate)
        fp.seek(0)  # rewind the temporary file before reading the audio bytes back
        return {"audio": fp.read()}

    except Exception as e:
        log.error(e)
        traceback.print_exc()
        return {"error": "Fail"}
Example #30
            "CUDA version matches your PyTorch installation. CPU-only inference is currently "
            "not supported.",
            file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print(
        "Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
        "%.1fGb total memory.\n" %
        (torch.cuda.device_count(), device_id, gpu_properties.name,
         gpu_properties.major, gpu_properties.minor,
         gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"),
                              low_mem=args.low_mem)
    #vocoder.load_model(args.voc_model_fpath)

    ## Run a test
    print("Testing your configuration with small inputs.")
    # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
    # sampling rate, which may differ.
    # If you're unfamiliar with digital audio, know that it is encoded as an array of floats
    # (or sometimes integers, but mostly floats in this project) ranging from -1 to 1.
    # The sampling rate is the number of values (samples) recorded per second, it is set to
    # 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
    # to an audio of 1 second.
    print("\tTesting the encoder...")
    encoder.embed_utterance(np.zeros(encoder.sampling_rate))