def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16):
    """Synthesize audio for every mel file and print per-file timings.

    Args:
        mel_files: Argument handed to ``files_to_list`` to get the mel paths.
        waveglow_path: Path of a checkpoint whose ``'model'`` entry is the
            vocoder.
        sigma: Forwarded unchanged to ``waveglow.infer``.
        output_dir: Directory that receives ``<name>_synthesis.wav``.
        sampling_rate: Rate passed to ``write``; also used to convert the
            sample count into seconds for the report.
        is_fp16: When true, the model (except its ``convinv`` layers) and the
            input run in half precision.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        waveglow.half()
        # The convinv layers are switched back to float after the global cast.
        for k in waveglow.convinv:
            k.float()

    for file_path in mel_files:
        stime = time.time()
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel

        # CUDA kernels are launched asynchronously; synchronize on both sides
        # so the timer brackets the GPU work instead of just the launch.
        torch.cuda.synchronize()
        stime2 = time.time()
        with torch.no_grad():
            audio = MAX_WAV_VALUE * waveglow.infer(mel, sigma=sigma)[0]
        torch.cuda.synchronize()
        inf_time = time.time() - stime2

        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)

        # Duration follows the requested sampling rate rather than a fixed
        # 22050 Hz, so the report stays correct for other rates.
        len_audio = len(audio) / float(sampling_rate)
        print(
            "{}: (audio length {:.2f} sec), (total computing time {:.2f} sec), (inference time: {:.2f} sec) "
            .format(audio_path, len_audio, time.time() - stime, inf_time))
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16):
    """Run the vocoder over each listed mel file and save the result as wav.

    The checkpoint's ``'model'`` entry is moved to the GPU in eval mode.
    Each mel tensor is loaded, given a leading batch dimension, passed to
    ``infer`` and scaled by ``MAX_WAV_VALUE``. The result is cast to int16
    and written to ``<output_dir>/<name>_synthesis.wav``; the output path is
    printed.
    """
    mel_paths = files_to_list(mel_files)

    checkpoint = torch.load(waveglow_path)
    model = checkpoint['model']
    model = model.remove_weightnorm(model)
    model.cuda().eval()
    if is_fp16:
        model.half()
        # convinv layers go back to float after the global half() cast
        for layer in model.convinv:
            layer.float()

    for mel_path in mel_paths:
        base_name, _ = os.path.splitext(os.path.basename(mel_path))

        spectrogram = torch.autograd.Variable(torch.load(mel_path).cuda())
        spectrogram = spectrogram.unsqueeze(0)
        if is_fp16:
            spectrogram = spectrogram.half()

        with torch.no_grad():
            waveform = MAX_WAV_VALUE * model.infer(spectrogram, sigma=sigma)[0]

        samples = waveform.cpu().numpy().astype('int16')
        wav_path = os.path.join(output_dir,
                                "{}_synthesis.wav".format(base_name))
        write(wav_path, sampling_rate, samples)
        print(wav_path)
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode every listed mel file, optionally denoise, and write wav files.

    Args:
        mel_files: Argument handed to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is used.
        sigma: Forwarded to ``infer``.
        output_dir: Destination directory for ``<name>_synthesis.wav``.
        sampling_rate: Rate passed to ``write``.
        is_fp16: Wrap the model with apex amp (``O3``) and feed half inputs.
        denoiser_strength: A ``Denoiser`` is built and applied only when this
            is greater than zero.
    """
    mel_paths = files_to_list(mel_files)

    model = torch.load(waveglow_path)['model']
    model = model.remove_weightnorm(model)
    model.cuda().eval()

    if is_fp16:
        from apex import amp
        model, _ = amp.initialize(model, [], opt_level="O3")

    use_denoiser = denoiser_strength > 0
    if use_denoiser:
        denoiser = Denoiser(model).cuda()

    for mel_path in mel_paths:
        stem = os.path.splitext(os.path.basename(mel_path))[0]

        features = torch.load(mel_path)
        features = torch.autograd.Variable(features.cuda()).unsqueeze(0)
        if is_fp16:
            features = features.half()

        with torch.no_grad():
            waveform = model.infer(features, sigma=sigma)
            if use_denoiser:
                waveform = denoiser(waveform, denoiser_strength)
            waveform = waveform * MAX_WAV_VALUE

        pcm = waveform.squeeze().cpu().numpy().astype('int16')
        wav_path = os.path.join(output_dir, "{}_synthesis.wav".format(stem))
        write(wav_path, sampling_rate, pcm)
        print(wav_path)
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode listed mel files with a patched checkpoint and log value ranges.

    Before weight norm is removed, every submodule whose type name contains
    ``'Conv'`` gets ``padding_mode = 'zeros'``. For each mel file the min and
    max of the input are printed and the peak absolute amplitude of the
    generated audio is appended to ``k``. The int16 audio is written to
    ``<output_dir>/<name>_synthesis_sig0.7_d_0.1.wav``.
    """
    mel_paths = files_to_list(mel_files)

    model = torch.load(waveglow_path)['model']
    # Set padding_mode on conv modules of the loaded checkpoint
    # (presumably a compatibility patch for older checkpoints; confirm).
    for module in model.modules():
        if 'Conv' in str(type(module)):
            module.padding_mode = 'zeros'
    model = model.remove_weightnorm(model)
    model.cuda().eval()

    if is_fp16:
        from apex import amp
        model, _ = amp.initialize(model, [], opt_level="O3")

    apply_denoiser = denoiser_strength > 0
    if apply_denoiser:
        denoiser = Denoiser(model).cuda()

    for mel_path in mel_paths:
        stem = os.path.splitext(os.path.basename(mel_path))[0]

        features = torch.autograd.Variable(torch.load(mel_path).cuda())
        features = features.unsqueeze(0)
        if is_fp16:
            features = features.half()
        print(features.min(), features.max())

        with torch.no_grad():
            waveform = model.infer(features, sigma=sigma)
            if apply_denoiser:
                waveform = denoiser(waveform, denoiser_strength)
            # NOTE(review): `k` is not defined in this function; presumably
            # a module-level list collecting peak amplitudes. Verify it exists.
            k.append(waveform.abs().max().item())
            waveform = waveform * MAX_WAV_VALUE

        pcm = waveform.squeeze().cpu().numpy().astype('int16')
        wav_path = os.path.join(
            output_dir, "{}_synthesis_sig0.7_d_0.1.wav".format(stem))
        write(wav_path, sampling_rate, pcm)
        print(wav_path)
def __init__(self, training_files, num_frame, filter_length, hop_length,
             win_length, sampling_rate, mel_fmin, mel_fmax):
    """Collect the audio file list, shuffle it, and build the STFT helper.

    Args:
        training_files: Argument handed to ``ms.files_to_list``.
        num_frame: Number of frames per segment; used only to compute
            ``self.segment``.
        filter_length, hop_length, win_length, sampling_rate, mel_fmin,
            mel_fmax: Forwarded to ``ms.TacotronSTFT``.
    """
    file_list = ms.files_to_list(training_files)
    self.audio_files = file_list
    self.all_length = torch.tensor(0, dtype=torch.long)
    self.hop_length, self.win_length = hop_length, win_length

    # Reseeds the global `random` generator so the shuffle order is the same
    # on every construction. `file_list` is the same list object stored in
    # `self.audio_files`, so it is shuffled in place.
    random.seed(4321)
    random.shuffle(file_list)

    stft_options = dict(
        filter_length=filter_length,
        hop_length=self.hop_length,
        win_length=self.win_length,
        sampling_rate=sampling_rate,
        mel_fmin=mel_fmin,
        mel_fmax=mel_fmax,
    )
    self.stft = ms.TacotronSTFT(**stft_options)

    # Segment length: num_frame hops plus one window.
    self.segment = self.win_length + num_frame * self.hop_length
    self.sampling_rate = sampling_rate
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate,
         is_fp16, denoiser_strength):
    """Vocode listed mel files and report per-file synthesis speed.

    Args:
        mel_files: Argument handed to ``files_to_list``.
        squeezewave_path: Checkpoint path; its ``'model'`` entry is used.
        sigma: Forwarded to ``infer`` and embedded in the output file name.
        output_dir: Destination directory for ``<name>_s<sigma>.wav``.
        sampling_rate: Rate passed to ``write`` and used to convert samples
            to seconds.
        is_fp16: Wrap the model with apex amp (``O3``) and feed half inputs.
        denoiser_strength: A ``Denoiser`` is built and applied only when this
            is greater than zero.
    """
    tic_prepare = time.time()
    mel_files = files_to_list(mel_files)
    squeezewave = torch.load(squeezewave_path)['model']
    squeezewave = squeezewave.remove_weightnorm(squeezewave)
    squeezewave.cuda().eval()
    if is_fp16:
        from apex import amp
        squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")
    if denoiser_strength > 0:
        denoiser = Denoiser(squeezewave).cuda()
    toc_prepare = time.time()
    dur_prepare = toc_prepare - tic_prepare
    print("prepare model {:3.2}sec".format(dur_prepare))

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel

        # CUDA work is queued asynchronously. Without synchronizing, the
        # timer would stop before the kernels finish and the reported
        # real-time factor would be far too optimistic.
        torch.cuda.synchronize()
        tic = time.time()
        with torch.no_grad():
            audio = squeezewave.infer(mel, sigma=sigma).float()
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        torch.cuda.synchronize()
        toc = time.time()
        dur = toc - tic

        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        len_wav = len(audio)
        sec_wav = len_wav / sampling_rate
        samples_sec = len_wav / dur
        audio = audio.astype('int16')
        audio_path = os.path.join(
            output_dir, "{}_s{}.wav".format(file_name, sigma))
        write(audio_path, sampling_rate, audio)
        print("{} it took {:4.3f}sec for {:4.3f}sec {:4.2f}K sample 22Khz Audio files : RTF {:4.3f} {:4.3f}X {:4.2f}Ksamples/sec "
              .format(audio_path, dur, sec_wav, len_wav / 1000,
                      dur / sec_wav, sec_wav / dur, samples_sec / 1000))
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode every listed mel file, optionally denoise, and write wav files.

    The model comes from the checkpoint's ``'model'`` entry, has weight norm
    removed and runs on the GPU in eval mode. Output goes to
    ``<output_dir>/<name>_synthesis.wav`` as int16 samples.
    """
    # List of mel-spectrogram files to synthesize.
    mel_paths = files_to_list(mel_files)

    # Load the model, strip weight normalization, move it to the GPU and
    # switch to evaluation mode.
    model = torch.load(waveglow_path)['model']
    model = model.remove_weightnorm(model)
    model.cuda().eval()

    # Optional apex mixed-precision wrapping.
    if is_fp16:
        from apex import amp
        model, _ = amp.initialize(model, [], opt_level="O3")

    # The denoiser is only created when a positive strength is requested.
    denoise = denoiser_strength > 0
    if denoise:
        denoiser = Denoiser(model).cuda()

    for mel_path in mel_paths:
        # Base name of the input; reused for the output wav name.
        stem = os.path.splitext(os.path.basename(mel_path))[0]

        # Load the saved feature tensor and move it to the GPU.
        features = torch.autograd.Variable(torch.load(mel_path).cuda())
        # Add a leading batch dimension (the original author's note gives
        # 80x375 -> 1x80x375 as an example; actual sizes depend on the input).
        features = features.unsqueeze(0)
        # Match the half-precision model when fp16 is enabled.
        if is_fp16:
            features = features.half()

        # No gradients are tracked during inference.
        with torch.no_grad():
            waveform = model.infer(features, sigma=sigma)
            if denoise:
                waveform = denoiser(waveform, denoiser_strength)
            # Scale by MAX_WAV_VALUE ahead of the int16 cast below.
            waveform = waveform * MAX_WAV_VALUE

        # Drop singleton dimensions, copy to host memory, cast to int16.
        pcm = waveform.squeeze().cpu().numpy().astype('int16')

        # Destination path, then write the audio.
        wav_path = os.path.join(output_dir, "{}_synthesis.wav".format(stem))
        write(wav_path, sampling_rate, pcm)
        print(wav_path)
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode listed ``.pt`` / ``.npy`` mel files and write wav files.

    Args:
        mel_files: Argument handed to ``files_to_list``.
        waveglow_path: Checkpoint path; its ``'model'`` entry is used.
        sigma: Forwarded to ``infer``.
        output_dir: Destination directory; created (with parents) if missing.
        sampling_rate: Rate passed to ``write``.
        is_fp16: Wrap the model with apex amp (``O3``) and feed half inputs.
        denoiser_strength: A ``Denoiser`` is built and applied only when this
            is greater than zero.

    Raises:
        ValueError: If a listed file has neither a ``.pt``/``.pth`` nor a
            ``.npy`` extension.
    """
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    # Create the destination once, up front; makedirs also handles nested
    # paths and an already-existing directory.
    os.makedirs(output_dir, exist_ok=True)

    for file_path in mel_files:
        file_name, extension = os.path.splitext(os.path.basename(file_path))
        print('Loading file: ', file_path)
        # Dispatch on the real extension. A substring search over the whole
        # path can match a directory name and pick the wrong loader.
        if extension in ('.pt', '.pth'):
            print('load by torch')
            mel = torch.load(file_path)
        elif extension == '.npy':
            print('load by numpy')
            mel = np.load(file_path)
            mel = torch.from_numpy(mel)
        else:
            # Fail loudly instead of hitting an unbound `mel`, or silently
            # re-synthesizing the previous iteration's tensor.
            raise ValueError(
                "Unsupported mel file type (expected .pt or .npy): {}"
                .format(file_path))

        print(f"original mel shape: {mel.shape}")
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        print(f"mel shape right before using waveglow: {mel.shape}")

        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')

        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    """Vocode listed raw-binary mel files and write ``<name>.wav`` files.

    Each input file is read as two int32 values giving the array shape,
    followed (from byte offset 8) by float32 data of that shape. The array
    is transposed before being handed to the model.
    """
    mel_paths = files_to_list(mel_files)

    model = torch.load(waveglow_path)['model']
    model = model.remove_weightnorm(model)
    model.cuda().eval()

    if is_fp16:
        from apex import amp
        model, _ = amp.initialize(model, [], opt_level="O3")

    apply_denoiser = denoiser_strength > 0
    if apply_denoiser:
        denoiser = Denoiser(model).cuda()

    # Hard-wired switch: raw binary mel files are always used; the
    # torch.load path below is kept as the alternative.
    use_raw_format = True

    for mel_path in mel_paths:
        stem = os.path.splitext(os.path.basename(mel_path))[0]

        if use_raw_format:
            # Header: two int32 values describing the stored array's shape.
            header = np.fromfile(mel_path, count=2, dtype=np.int32)
            raw = np.memmap(mel_path, offset=8, dtype=np.float32,
                            shape=tuple(header))
            features = torch.from_numpy(raw.transpose())
        else:
            features = torch.load(mel_path)

        features = torch.autograd.Variable(features.cuda()).unsqueeze(0)
        if is_fp16:
            features = features.half()

        with torch.no_grad():
            waveform = model.infer(features, sigma=sigma)
            if apply_denoiser:
                waveform = denoiser(waveform, denoiser_strength)
            waveform = waveform * MAX_WAV_VALUE

        pcm = waveform.squeeze().cpu().numpy().astype('int16')
        wav_path = os.path.join(output_dir, "{}.wav".format(stem))
        write(wav_path, sampling_rate, pcm)
        print(wav_path)
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate,
         is_fp16, denoiser_strength):
    """Vocode listed mel files on the CPU and print the total loop time.

    The checkpoint and every mel tensor are loaded with
    ``map_location=torch.device('cpu')``. Output goes to
    ``<output_dir>/<name>_synthesis.wav`` as int16 samples; after the loop
    the elapsed wall-clock time for all files is printed.
    """
    mel_paths = files_to_list(mel_files)

    # CPU is forced here (no CUDA auto-detection).
    device = torch.device('cpu')
    model = torch.load(squeezewave_path, map_location=device)['model']
    model = model.remove_weightnorm(model)
    model.eval()

    if is_fp16:
        from apex import amp
        model, _ = amp.initialize(model, [], opt_level="O3")

    apply_denoiser = denoiser_strength > 0
    if apply_denoiser:
        denoiser = Denoiser(model)

    loop_started = time.time()
    for mel_path in mel_paths:
        stem = os.path.splitext(os.path.basename(mel_path))[0]

        features = torch.autograd.Variable(
            torch.load(mel_path, map_location=device))
        # NOTE(review): the input is cast to half regardless of `is_fp16`,
        # and no batch dimension is added here. Confirm both are intended.
        features = features.half()

        with torch.no_grad():
            waveform = model.infer(features, sigma=sigma).float()
            if apply_denoiser:
                waveform = denoiser(waveform, denoiser_strength)
            waveform = waveform * MAX_WAV_VALUE

        pcm = waveform.squeeze().cpu().numpy().astype('int16')
        wav_path = os.path.join(output_dir, "{}_synthesis.wav".format(stem))
        write(wav_path, sampling_rate, pcm)
        print(wav_path)
    loop_finished = time.time()

    print("Squeezewave vocoder time")
    print(loop_finished - loop_started)
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate,
         is_fp16, denoiser_strength, device):
    """Vocode listed mel files on the given device and write wav files.

    Args:
        mel_files: Argument handed to ``files_to_list``.
        squeezewave_path: Checkpoint path; its ``'model'`` entry is used.
        sigma: Forwarded to ``infer``.
        output_dir: Destination directory for ``<name>_synthesis.wav``.
        sampling_rate: Rate passed to ``write``.
        is_fp16: Wrap the model with apex amp (``O3``) and feed half inputs.
        denoiser_strength: A ``Denoiser`` is built and applied only when this
            is greater than zero.
        device: Target device for the checkpoint, the mel tensors, the model
            and the denoiser.
    """
    mel_files = files_to_list(mel_files)
    squeezewave = torch.load(squeezewave_path, map_location=device)['model']
    # hack for loading model trained on gpu to cpu
    squeezewave.device = device
    squeezewave = squeezewave.remove_weightnorm(squeezewave)
    squeezewave.to(device=device).eval()
    if is_fp16:
        from apex import amp
        squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")
    if denoiser_strength > 0:
        denoiser = Denoiser(squeezewave).to(device=device)

    for file_path in mel_files:
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        # Remap storages the same way as the checkpoint above, so mel
        # tensors saved from a GPU still load on a CPU-only host.
        mel = torch.load(file_path, map_location=device)
        if len(mel.shape) > 2:
            mel = mel.squeeze()
            print(f"squeezed to {mel.shape}")
        assert len(mel.shape) == 2, (
            "expected a 2-D mel tensor, got shape {}".format(
                tuple(mel.shape)))
        mel = torch.autograd.Variable(mel.to(device=device))
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel

        with torch.no_grad():
            audio = squeezewave.infer(mel, sigma=sigma).float()
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
import argparse
import os
import subprocess

from mel2samp import files_to_list

# Batch-convert every wav listed in a file list by invoking `sox` with
# `remix 1,2 rate 22050`, writing each result under the output directory
# with the same base name.
if __name__ == "__main__":
    # Get defaults so it can work with no Sacred
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', "--filelist_path", required=True)
    parser.add_argument('-o', '--output_dir', type=str,
                        help='Output directory')
    args = parser.parse_args()

    filepaths = files_to_list(args.filelist_path)
    for filepath in filepaths:
        source_wav = filepath
        filename = os.path.basename(filepath)
        dest_wav = f'{args.output_dir}/{filename}'
        # Pass the arguments as a list and skip the shell. Paths containing
        # spaces or shell metacharacters are then handed to sox verbatim
        # instead of being split or interpreted.
        command = ['sox', source_wav, dest_wav,
                   'remix', '1,2', 'rate', '22050']
        subprocess.run(command)