def generateAudioGroup(original_audio, ref_audios, autovc_checkpoint = 'checkpoints_fully/autovc_700000.pt', vocoder_checkpoint = "../checkpoint_step001000000_ema.pth"):
    """Convert ``original_audio`` using the mean embedding of a directory of references.

    The mel-spectrogram of ``original_audio`` is run through the AutoVC
    generator stored in ``autovc_checkpoint`` together with the embedding of
    the original audio and the element-wise mean of the embeddings of every
    entry in ``ref_audios``. The converted spectrogram is then rendered to a
    waveform by the vocoder stored in ``vocoder_checkpoint``.

    Args:
        original_audio: Handed to ``makeSpect`` and
            ``get_verification_pytorch_1000`` (presumably a path to an audio
            file -- confirm against those helpers).
        ref_audios: Directory whose entries are each embedded as a reference.
            A trailing path separator is no longer required.
        autovc_checkpoint: File holding the whole pickled generator module.
        vocoder_checkpoint: File holding a dict with a ``"state_dict"`` entry
            for the model returned by ``build_model``.

    Returns:
        The waveform produced by ``wavegen``, or ``None`` when the original
        audio has no embedding or no reference yields one (same convention as
        ``generateAudio``).
    """

    def pad_seq(x, base=32):
        """Zero-pad ``x`` along axis 0 up to the next multiple of ``base``."""
        len_out = int(base * ceil(float(x.shape[0]) / base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

    device = 'cuda:0'

    mel_org = makeSpect(original_audio, None)
    emb_org = get_verification_pytorch_1000(original_audio)

    # List the directory once; the listing is also the progress denominator.
    ref_files = os.listdir(ref_audios)
    emb_refs = []
    for index, file_name in enumerate(ref_files, start=1):
        print("{}/{}".format(index, len(ref_files)))
        # os.path.join works whether or not ref_audios ends with a separator.
        emb_ref = get_verification_pytorch_1000(os.path.join(ref_audios, file_name), 1)
        if emb_ref is not None:
            emb_refs.append(emb_ref)

    # Bail out before touching the GPU: averaging an empty list or wrapping
    # None in a tensor would otherwise fail with an unrelated-looking error.
    if emb_org is None or not emb_refs:
        return None

    # The checkpoint stores the complete module, so no Generator has to be
    # constructed first; only inference mode needs to be selected.
    G = torch.load(autovc_checkpoint, map_location=device).eval()

    x_org, len_pad = pad_seq(mel_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)
    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).to(device)
    emb_refs = torch.FloatTensor(np.mean(emb_refs, axis=0)).unsqueeze(0).to(device)

    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_refs)

    # Strip the frames that pad_seq appended.
    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    # Vocoder placement is kept as in the original ("cuda", i.e. the current
    # CUDA device) because wavegen's own device handling is not visible here.
    vocoder_device = torch.device("cuda")
    model = build_model().to(vocoder_device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=vocoder_device)
    model.load_state_dict(checkpoint["state_dict"])

    waveform = wavegen(model, c=uttr_trg)
    return waveform
def generateAudio(original_audio, ref_audio, autovc_checkpoint, vocoder_checkpoint ,english=False):
    """Convert ``original_audio`` using the embedding of a single reference.

    The mel-spectrogram of ``original_audio`` is run through the AutoVC
    generator stored in ``autovc_checkpoint`` together with the embeddings of
    the original and the reference audio; the converted spectrogram is then
    rendered to a waveform by the vocoder stored in ``vocoder_checkpoint``.

    Args:
        original_audio: Handed to ``makeSpect`` and
            ``get_verification_pytorch_1000`` (presumably a path to an audio
            file -- confirm against those helpers).
        ref_audio: Reference input handed to the embedding helper.
        autovc_checkpoint: File holding the whole pickled generator module.
        vocoder_checkpoint: File holding a dict with a ``"state_dict"`` entry
            for the model returned by ``build_model``.
        english: When true the reference embedding comes from
            ``get_verification_eng`` and is used as returned (only cast and
            moved to the GPU); otherwise it comes from
            ``get_verification_pytorch_1000`` and gets a leading batch axis.

    Returns:
        The waveform produced by ``wavegen``, or ``None`` if either embedding
        is unavailable.
    """

    def pad_seq(x, base=32):
        """Zero-pad ``x`` along axis 0 up to the next multiple of ``base``."""
        len_out = int(base * ceil(float(x.shape[0]) / base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

    device = 'cuda:0'

    mel_org = makeSpect(original_audio, None)

    emb_org = get_verification_pytorch_1000(original_audio)
    if english:
        emb_ref = get_verification_eng(ref_audio)
    else:
        emb_ref = get_verification_pytorch_1000(ref_audio)

    # Give up before loading any checkpoint when an embedding is missing.
    if emb_org is None or emb_ref is None:
        return None

    # The checkpoint stores the complete module, so no Generator has to be
    # constructed first; only inference mode needs to be selected.
    G = torch.load(autovc_checkpoint, map_location=device).eval()

    x_org, len_pad = pad_seq(mel_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).to(device)
    if english:
        # The English helper's result is already a tensor (the original code
        # called .type() on it); cast to float32 on the same GPU.
        emb_ref = emb_ref.to(device=device, dtype=torch.float32)
    else:
        emb_ref = torch.FloatTensor(emb_ref).unsqueeze(0).to(device)

    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_ref)

    # Strip the frames that pad_seq appended.
    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    # Vocoder placement is kept as in the original ("cuda", i.e. the current
    # CUDA device) because wavegen's own device handling is not visible here.
    vocoder_device = torch.device("cuda")
    model = build_model().to(vocoder_device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=vocoder_device)
    model.load_state_dict(checkpoint["state_dict"])

    waveform = wavegen(model, c=uttr_trg)
    return waveform
Exemplo n.º 3
0
    def __decode__(self):
        """Render the spectrograms stored in ``results.pkl`` to a wav file.

        Each entry of the pickled sequence is indexed as ``(name, spectrogram)``.
        Every entry is synthesised with the vocoder and written to the same
        output file, so only the last entry's audio remains on disk.

        Returns:
            The path of the written wav file.

        Raises:
            ValueError: If ``results.pkl`` contains no entries (the original
                code failed with ``UnboundLocalError`` in that case).
        """
        # NOTE: pickle can execute arbitrary code while loading; only feed
        # this method a results.pkl produced by a trusted pipeline.
        with open('results.pkl', 'rb') as results_file:
            spect_vc = pickle.load(results_file)

        # The vocoder is deliberately kept on the CPU.
        model = build_model()
        checkpoint = torch.load("checkpoint_step001000000_ema.pth", map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint["state_dict"])

        save_path = "audio/download/audio.wav"
        wrote_audio = False

        for spect in spect_vc:
            name = spect[0]
            c = spect[1]
            print(name)
            waveform = wavegen(model, c=c)

            # NOTE(review): librosa.output was removed in librosa 0.8; this
            # call needs an older librosa -- confirm the pinned version.
            librosa.output.write_wav(save_path, waveform, sr=16000)
            wrote_audio = True

        if not wrote_audio:
            raise ValueError("results.pkl contains no spectrograms to decode")

        return save_path
Exemplo n.º 4
0
    # Read the command-line arguments; `parser` is built outside this excerpt.
    args = parser.parse_args()
    
    # Input/output locations supplied on the command line.
    output_path = args.output
    src_wav_path = args.src_wav
    src_emb_path = args.src_emb
    tgt_emb_path = args.tgt_emb
    vocoder_checkpoint_path = args.vocoder
    autovc_checkpoint_path = args.autovc

    # Presumably the AutoVC generator hyper-parameters -- TODO confirm; none of
    # these four values is used in the visible part of this function.
    dim_neck = 32
    dim_emb = 256
    dim_pre = 512
    freq = 32

    # Build the vocoder on the CPU and restore its weights from the checkpoint.
    device = torch.device('cpu')
    wavnet = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint_path, map_location=device)
    wavnet.load_state_dict(checkpoint["state_dict"])

    # Load the source waveform plus the source and target embeddings (.npy).
    wav = load_wav(src_wav_path)
    emb = np.load(src_emb_path)
    emb_tgt = np.load(tgt_emb_path)

    mel = melspectrogram(wav)

    # Zero-pad axis 1 of the spectrogram up to the next multiple of 32.
    # NOTE(review): the literal 32 matches `freq` above -- confirm whether it
    # should be derived from it.
    pad_len = math.ceil(mel.shape[1] / 32) * 32 - mel.shape[1]
    mel = np.pad(mel, ((0,0), (0, pad_len)), mode='constant')

    # Wrap the numpy arrays as float32 tensors.
    mel = torch.FloatTensor(mel)
    emb = torch.FloatTensor(emb)
    emb_tgt = torch.FloatTensor(emb_tgt)
Exemplo n.º 5
0

# Voice-conversion inference: run the source spectrogram through the pickled
# AutoVC generator, vocode the result and write it to a wav file.
# `autovc_checkpoint`, `mel_org`, `pad_seq`, `emb_ref`, `original_name` and
# `ref_name` must already be defined by the surrounding script.
device = 'cuda:0'

# The checkpoint stores the whole generator module. Switch it to inference
# mode explicitly: a pickled module keeps whatever train/eval flag it had
# when it was saved.
g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
G = g_checkpoint.eval()

# pad_seq returns the padded spectrogram and the number of frames it added.
x_org = mel_org
x_org, len_pad = pad_seq(x_org)
uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

with torch.no_grad():
    _, x_identic_psnt, _ = G(uttr_org, emb_ref)

# Strip the frames that pad_seq appended.
if len_pad == 0:
    uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
else:
    uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

# Build the vocoder and restore its weights.
device = torch.device("cuda")
model = build_model().to(device)
checkpoint = torch.load(
    "../drive/MyDrive/MultiSpeaker_Tacotron2/checkpoint_step001000000_ema.pth",
    map_location=torch.device('cuda'))
model.load_state_dict(checkpoint["state_dict"])

# Synthesise the waveform and store it as 24-bit PCM at 16 kHz.
waveform = wavegen(model, c=uttr_trg)
sf.write('{}-{}.wav'.format(original_name, ref_name),
         waveform,
         16000,
         subtype='PCM_24')
Exemplo n.º 6
0
# Create the output directory for generated wavs if it is missing.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
os.makedirs(subdir_for_wavs, exist_ok=True)

sys.path.insert(
    1, '/homes/bdoc3/my_data/autovc_data'
)  # usually the cwd is priority, so index 1 is good enough for our purposes here
from hparams import hparams

import torch
import librosa
import soundfile as sf
import pickle
from synthesis import build_model
from synthesis import wavegen

# Build the vocoder on the configured device and restore its weights.
model = build_model().to(config.device)

# map_location lets the checkpoint load even when the device it was saved
# from is not available here (e.g. a GPU checkpoint on a CPU-only host).
# torch.device() normalises whatever form config.device takes.
checkpoint = torch.load(
    "/homes/bdoc3/my_data/autovc_data/checkpoint_step001000000_ema.pth",
    map_location=torch.device(config.device))
model.load_state_dict(checkpoint["state_dict"])
counter = 0

# Only the files directly inside spmel_dir are considered (first os.walk step).
_, _, fileList = next(os.walk(config.spmel_dir))

# Collect (name without its last four characters, cropped tensor) pairs.
numpy_list = []
for numpy_name in fileList:
    spmel = np.load(os.path.join(config.spmel_dir,
                                 numpy_name))[:config.len_crop]
    numpy_list.append((numpy_name[:-4], torch.tensor(spmel).to(config.device)))
Exemplo n.º 7
0
def load_model():
    """Build the vocoder on ``device`` and restore its weights from ``model_path``.

    Both ``device`` and ``model_path`` are read from the enclosing scope.

    Returns:
        The model returned by ``build_model`` with the checkpoint's
        ``"state_dict"`` loaded.
    """
    model = build_model().to(device)
    # map_location lets the checkpoint load even when the device it was saved
    # from is not available here (e.g. a GPU checkpoint on a CPU-only host).
    checkpoint = torch.load(model_path, map_location=device)
    model.load_state_dict(checkpoint["state_dict"])
    return model
 def __init__(self,
              device="cpu",
              model_path="checkpoint_step001000000_ema.pth"):
     """Build the vocoder on ``device`` and load its weights.

     Args:
         device: Device the model is moved to; also stored as ``self.device``.
         model_path: Checkpoint file holding a dict with a ``"state_dict"``
             entry.
     """
     self.device = device
     self.model = build_model().to(device)
     # Honour the requested device while deserialising, so a checkpoint saved
     # on a GPU still loads with the default device="cpu".
     checkpoint = torch.load(model_path, map_location=device)
     self.model.load_state_dict(checkpoint["state_dict"])