def create_models(dataset_dir):
    """Generate the inline models and train the classifier (available for localhost only)

    Parameters:
        dataset_dir (str): Path to the training set
    """
    logger.debug("Creating models...")
    if not local_exec:
        logger.error("Models can only be generated locally")
        exit(1)

    # Modify the configuration for local execution
    app_config['root'] = os.environ['ROOT']

    # Generate inline models and train the classifier
    denoiser = Denoiser(app_config)
    if not exists(dataset_dir) or not isdir(dataset_dir):
        logger.error(dataset_dir + " is not a valid directory")
        exit(2)
    dataset = [join(dataset_dir, f) for f in listdir(dataset_dir)]
    denoiser.generate_models(dataset)
    logger.info("Inline models generated")
    denoiser.train(dataset)
    logger.info("Classifier trained")
def __init__(self, ds_name, ds_path, lr, iterations, batch_size, print_freq,
             k, eps, is_normalized, adv_momentum, store_adv=None,
             load_adv_dir=None, load_adv_name=None, load_dir=None,
             load_name=None, save_dir=None):
    self.data_processor = Preprocessor(ds_name, ds_path, is_normalized)

    # Load data
    self.train_data, self.test_data, self.N_train, self.N_test = self.data_processor.datasets()
    self.train_loader = DataLoader(self.train_data, batch_size=batch_size, shuffle=True)
    self.test_loader = DataLoader(self.test_data, batch_size=batch_size)

    # Other variables
    self.save_dir = save_dir
    self.store_adv = store_adv

    # Set model hyperparameters
    self.learning_rate = lr
    self.iterations = iterations
    self.print_freq = print_freq
    self.cuda = torch.cuda.is_available()

    # Load model to conduct adversarial training
    adversarial_model = self.load_model(self.cuda, load_adv_dir, load_adv_name, TEST)
    self.adversarial_generator = Attacks(adversarial_model, eps, self.N_train, self.N_test,
                                         self.data_processor.get_const(), adv_momentum,
                                         is_normalized, store_adv)

    # Load target model
    self.target_model = self.load_model(self.cuda, load_dir, load_name, TEST)

    # Load denoiser and move it to the GPU
    self.denoiser = Denoiser(x_h=32, x_w=32)
    self.denoiser = self.denoiser.cuda()
def test_reducing_by_stats(audio_file, out_lib):
    y, sr = sf.read(audio_file)

    y_power = Denoiser.reduce_noise_power(y, sr)
    y_cent_s = Denoiser.reduce_noise_centroid_s(y, sr)
    y_cent_mb = Denoiser.reduce_noise_centroid_mb(y, sr)
    y_mfcc_d = Denoiser.reduce_noise_mfcc_down(y, sr)
    y_mfcc_u = Denoiser.reduce_noise_mfcc_up(y, sr)

    sf.write(out_lib + '/power.wav', y_power, sr)
    sf.write(out_lib + '/cent_s.wav', y_cent_s, sr)
    sf.write(out_lib + '/cent_mb.wav', y_cent_mb, sr)
    sf.write(out_lib + '/mfcc_d.wav', y_mfcc_d, sr)
    sf.write(out_lib + '/mfcc_u.wav', y_mfcc_u, sr)
def reduce_by_example_to_mp3(audio_file, noise_file, out_file):
    # data1, rate1 = sf.read(audio_file)
    # noise_data1, _ = sf.read(noise_file)
    audio = AudioSegment.from_wav(audio_file)
    noise_audio = AudioSegment.from_wav(noise_file)
    data = Denoiser.seg_to_numpy(audio)
    noise_data = Denoiser.seg_to_numpy(noise_audio)
    rate = audio.frame_rate
    denoised_data = Denoiser.reduce_by_example(data, noise_data, rate)
    denoised_audio = Denoiser.numpy_to_seg_like_seg(denoised_data, audio)
    denoised_audio.export(out_file, format='mp3')
def main(waveglow_path, sigma, output_dir, sampling_rate, is_fp16, denoiser_strength):
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, file_path in enumerate(glob.glob('*.npy')):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.from_numpy(np.load(file_path))
        mel = torch.unsqueeze(mel, 0).cuda()
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze().cpu().numpy()
        audio_path = os.path.join(output_dir, f'waveglow_{file_name}.wav')
        write(audio_path, sampling_rate, audio.astype('int16'))
def test_reducing_by_example(audio_file, noise_file, out_file):
    data, rate = sf.read(audio_file)
    noise_data, _ = sf.read(noise_file)
    denoised_audio = Denoiser.reduce_by_example(data, noise_data, rate)
    sf.write(out_file, denoised_audio, rate)
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir, "{}_synthesis.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = ModifiedGenerator(hp.audio.n_mel_channels, hp.model.n_residual_layers,
                              ratios=hp.model.generator_ratio, mult=hp.model.mult,
                              out_band=hp.model.out_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=True)

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()

        audio = model.inference(mel)
        audio = audio.squeeze(0)  # collapse all dimensions except the time axis
        if args.d:
            denoiser = Denoiser(model).cuda()
            audio = denoiser(audio, 0.01)
        audio = audio.squeeze()
        audio = audio[:-(hp.audio.hop_length * 10)]
        audio = MAX_WAV_VALUE * audio
        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.short()
        audio = audio.cpu().detach().numpy()

    out_path = args.input.replace('.npy', '_reconstructed_epoch%04d.wav' % checkpoint['epoch'])
    write(out_path, hp.audio.sampling_rate, audio)
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval()

    with torch.no_grad():
        mel = torch.from_numpy(np.load(args.input))
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()

        audio = model(mel)  # for multi-band inference
        print(audio.shape)
        audio = audio.squeeze(0)  # collapse all dimensions except the time axis
        if args.d:
            denoiser = Denoiser(model).cuda()
            audio = denoiser(audio, 0.1)
        audio = audio.squeeze()
        audio = audio[:-(hp.audio.hop_length * 10)]
        audio = MAX_WAV_VALUE * audio
        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
        audio = audio.short()
        audio = audio.cpu().detach().numpy()

    out_path = args.input.replace('.npy', '_hifi_GAN_epoch%04d.wav' % checkpoint['epoch'])
    write(out_path, hp.audio.sampling_rate, audio)
def load_waveglow(chk_pt_path):
    waveglow = torch.load(chk_pt_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)
    return waveglow, denoiser
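# Usage sketch (an assumption, not part of the original sources): drive
# load_waveglow() above with a checkpoint path and synthesize from a saved mel
# spectrogram. 'checkpoint.pt' and 'mel.npy' are hypothetical file names; the
# sigma and denoiser strength values are the ones used elsewhere in this corpus.
def demo_load_waveglow():
    waveglow, denoiser = load_waveglow('checkpoint.pt')
    mel = torch.from_numpy(np.load('mel.npy')).unsqueeze(0).cuda().half()
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=0.666)
        audio = denoiser(audio, strength=0.01)
    return audio.squeeze().float().cpu().numpy()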
def load_model(self):
    ####TODO#### 1. Load the trained model
    # Load the path of the trained tacotron checkpoint,
    # then load the hparams and state dict into the model
    checkpoint_path = "/home/ubuntu/test/TTS/checkpoint_28000"
    self.model = train.load_model(self.hparams)
    self.model.load_state_dict(
        torch.load(checkpoint_path, map_location=torch.device("cpu"))['state_dict'])
    ####TODO####

    # _ = self.model.cpu().eval().half()
    _ = self.model.cpu().eval()

    # Load the WaveGlow model
    # waveglow_path = "/home/multicam/checkpoints/waveglow.pt"
    waveglow_path = "/home/ubuntu/test/TTS/waveglow.pt"
    self.waveglow = torch.load(waveglow_path, map_location=torch.device("cpu"))['model']
    self.waveglow.cpu().eval()
    # self.waveglow.cpu().eval().half()
    for k in self.waveglow.convinv:
        k.float()
    self.denoiser = Denoiser(self.waveglow)
class TXTDenoiser(Command):
    """Command to clean TXT files
    """

    def __init__(self, filename, logger, config):
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)
        self.logger.debug("Denoiser initialized")

    def execute(self):
        """Execute the command
        """
        try:
            self.logger.debug("::: Text cleaning :::")
            # super(TXTDenoiser, self).get_file()
            txt_dir = join(self.unzipped, "txt")
            txt_files = [join(txt_dir, f) for f in listdir(txt_dir)
                         if isfile(join(txt_dir, f)) and f.endswith(".txt")]

            if len(txt_files) != 1:
                self.logger.error("Incorrect number of text files")
                self.finalize()
                return -1

            text_data = self.denoiser.cleanse(txt_files[0], False)

            # Write the classified lines
            base_filename = splitext(basename(txt_files[0]))[0]
            clean_filename = join(txt_dir, base_filename + ".clean.txt")
            garbage_filename = join(txt_dir, base_filename + ".grbge.txt")
            unclassified_filename = join(txt_dir, base_filename + ".unclss.txt")

            with codecs.open(clean_filename, "wb", encoding="utf-8") as clean_file:
                for line in text_data.get_clean_lines():
                    clean_file.write(line + "\n")

            with codecs.open(garbage_filename, "wb", encoding="utf-8") as garbage_file:
                for line in text_data.get_garbage_lines():
                    garbage_file.write(line + "\n")

            if len(text_data.get_unclassified_lines()) > 0:
                with codecs.open(unclassified_filename, "wb", encoding="utf-8") as unclassified_file:
                    for line in text_data.get_unclassified_lines():
                        unclassified_file.write(line + "\n")
        except Exception as e:
            print(e)
            self.logger.error("Cleaner has stopped unexpectedly: " + str(e))
            self.finalize()
            return -2

        self.finalize()
        return 0
def check_exec_time(data, noise, rate, out_file):
    start_time = time.time()
    denoised = Denoiser.reduce_by_example(data, noise, rate)
    sf.write(out_file, denoised, rate)
    return (len(denoised) / rate, time.time() - start_time)
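# Usage sketch (an assumption, not part of the original sources): time the
# denoiser on one file and report the real-time factor. 'noisy.wav',
# 'noise.wav' and 'denoised.wav' are hypothetical paths.
def report_exec_time():
    data, rate = sf.read('noisy.wav')
    noise, _ = sf.read('noise.wav')
    audio_sec, exec_sec = check_exec_time(data, noise, rate, 'denoised.wav')
    # RTF < 1 means the denoiser runs faster than real time
    print('audio: {:.2f}s, processing: {:.2f}s, RTF: {:.3f}'.format(
        audio_sec, exec_sec, exec_sec / audio_sec))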
def generate_from_file(tacotron2_path, waveglow_path, text_file, output_directory):
    # Make synthesis paths
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print("Creating directory " + output_directory + "...")

    hparams = create_hparams()
    hparams.sampling_rate = 22050

    print("Loading models...")
    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = model.cuda().eval().half()

    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    genlist = []
    with open(text_file) as file:
        for line in file:
            genlist.append(line.strip())

    for entry in genlist:
        wav_name = "_".join(entry.split(" ")[:4]).lower() + ".wav"
        epi = epitran.Epitran('eng-Latn', ligatures=True)

        if hparams.preprocessing == "ipa":
            entry = ipa.convert(english_cleaners(entry))
            foreign_words = re.findall(r"[^ ]{0,}\*", entry)
            for word in foreign_words:
                entry = entry.replace(word, epi.transliterate(word[0:len(word) - 1]))
        if hparams.preprocessing == "arpabet":
            entry = make_arpabet(entry)

        # Text sequencer
        if hparams.preprocessing is not None:
            sequence = np.array(text_to_sequence(entry, None))[None, :]
        else:
            sequence = np.array(text_to_sequence(entry, ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

        # Synthesis
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        with torch.no_grad():
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio_denoised = denoiser(audio, strength=0.01)[:, 0]

        # Save audio
        print("Saving " + wav_name)
        write(os.path.join(output_directory, wav_name), hparams.sampling_rate,
              audio_denoised[0].data.cpu().numpy())
def test_reducing_by_length(audio_file, noise_file, out_lib):
    data, rate = sf.read(audio_file)
    noise, _ = sf.read(noise_file)

    length = noise.shape[0]
    for i in range(3):
        # Halve the noise profile on each iteration
        cropped_noise = noise[:length >> i]
        denoised = Denoiser.reduce_by_example(data, cropped_noise, rate)
        print(cropped_noise.shape[0] / rate)
        sf.write(out_lib + 'denoised{}.wav'.format(i), denoised, rate)
class TXTDenoiser(Command):
    """Command to clean TXT files
    """

    def __init__(self, filename, logger, config):
        super(TXTDenoiser, self).__init__(filename, logger, config)
        self.denoiser = Denoiser(config)

    def execute(self):
        """Execute the command
        """
        try:
            self.logger.debug("::: Text cleaning :::")
            super(TXTDenoiser, self).get_file()
            txt_dir = join(self.unzipped, "txt")
            txt_files = [join(txt_dir, f) for f in listdir(txt_dir)
                         if isfile(join(txt_dir, f)) and f.endswith(".txt")]

            if len(txt_files) != 1:
                self.logger.error("Incorrect number of text files")
                self.finalize()
                return -1

            text_data = self.denoiser.cleanse(txt_files[0], False)

            # Write the classified lines
            base_filename = splitext(basename(txt_files[0]))[0]
            clean_filename = join(txt_dir, base_filename + ".clean.txt")
            garbage_filename = join(txt_dir, base_filename + ".grbge.txt")
            unclassified_filename = join(txt_dir, base_filename + ".unclss.txt")

            with codecs.open(clean_filename, "wb", encoding="utf-8") as clean_file:
                for line in text_data.get_clean_lines():
                    clean_file.write(line + "\n")

            with codecs.open(garbage_filename, "wb", encoding="utf-8") as garbage_file:
                for line in text_data.get_garbage_lines():
                    garbage_file.write(line + "\n")

            if len(text_data.get_unclassified_lines()) > 0:
                with codecs.open(unclassified_filename, "wb", encoding="utf-8") as unclassified_file:
                    for line in text_data.get_unclassified_lines():
                        unclassified_file.write(line + "\n")
        except Exception as e:
            print(e)
            self.logger.error("Cleaner has stopped unexpectedly: " + str(e))
            self.finalize()
            return -2

        self.finalize()
        return 0
def __init__(self, lang):
    tacotron2 = load_and_setup_model('Tacotron2', parser, args.tacotron2,
                                     args.amp_run, args.cpu_run, forward_is_infer=True)
    waveglow = load_and_setup_model('WaveGlow', parser, args.waveglow,
                                    args.amp_run, args.cpu_run, forward_is_infer=True)

    if args.cpu_run:
        denoiser = Denoiser(waveglow, args.cpu_run)
    else:
        denoiser = Denoiser(waveglow, args.cpu_run).cuda()

    jitted_tacotron2 = torch.jit.script(tacotron2)

    self.language = lang
def __init__(self):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    checkpoint_path = constants.TACOTRON_PT
    self.model = load_model(hparams)
    self.model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = self.model.cuda().eval().half()

    waveglow_path = constants.WAVEGLOW_PT
    self.waveglow = torch.load(waveglow_path)['model']
    self.waveglow.cuda().eval().half()
    for k in self.waveglow.convinv:
        k.float()
    self.denoiser = Denoiser(self.waveglow)
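# Sketch of a companion inference method for the class above (an assumption:
# the original likely exposes something similar). text_to_sequence comes from
# the Tacotron2 repo; the sigma and strength values are illustrative.
def synthesize(self, text):
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()
    _, mel_outputs_postnet, _, _ = self.model.inference(sequence)
    with torch.no_grad():
        audio = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
    audio = self.denoiser(audio, strength=0.01)[:, 0]
    return audio[0].data.cpu().numpy()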
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)
    waveglow = torch.load(waveglow_path)['model']
    for m in waveglow.modules():
        if 'Conv' in str(type(m)):
            setattr(m, 'padding_mode', 'zeros')
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    k = []  # track the peak amplitude of each synthesized file
    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel
        print(torch.min(mel), torch.max(mel))
        with torch.no_grad():
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            k.append(abs(audio).max().item())
            audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir,
                                  "{}_synthesis_sig0.7_d_0.1.wav".format(file_name))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def main(text):
    hparams = create_hparams()
    hparams.sampling_rate = 22050
    hparams.gate_threshold = 0.1
    hparams.max_decoder_steps = 5000

    # Load model from checkpoint
    checkpoint_path = "tacotron2_statedict.pt"
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()

    # Load WaveGlow for mel2audio synthesis and denoiser
    waveglow_path = 'waveglow_256channels.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for m in waveglow.modules():
        if 'Conv' in str(type(m)):
            setattr(m, 'padding_mode', 'zeros')
    for k in waveglow.convinv:
        k.float()
    denoiser = Denoiser(waveglow)

    # Prepare text input
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

    # Decode text input and plot results
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    plot_data((mel_outputs.float().data.cpu().numpy()[0],
               mel_outputs_postnet.float().data.cpu().numpy()[0],
               alignments.float().data.cpu().numpy()[0].T))

    # Synthesize audio from spectrogram using WaveGlow
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

    # (Optional) Remove WaveGlow bias
    audio_denoised = denoiser(audio, strength=0.01)[:, 0]

    # Save, appending to out.wav if it already exists
    denoised_np = audio_denoised[0].data.cpu().numpy().astype(np.float32)
    if os.path.isfile("out.wav"):
        x, sr = librosa.load("out.wav")
        out = np.append(x, denoised_np)
    else:
        out = denoised_np
    librosa.output.write_wav('./out.wav', out, 22050)
def init_model():
    print("init model!!!!")
    global tacotron2_model
    global waveglow_model
    global denoiser

    tacotron2_path = "outdir_finetune/checkpoint_62500"
    waveglow_path = "../waveglow-fix/checkpoints_finetune/waveglow_478000"
    sampling_rate = 22050
    denoiser_strength = 0.0

    hparams = create_hparams()
    hparams.sampling_rate = sampling_rate
    hparams.training = False

    tacotron2_model = load_model(hparams)
    tacotron2_model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = tacotron2_model.cuda().eval().half()

    waveglow_model = torch.load(waveglow_path)['model']
    waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    waveglow_model.cuda().eval().half()
    for k in waveglow_model.convinv:
        k.float()

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow_model)
def load_waveglow(self, vocoder_path, config_fpath):
    # Load config file
    with open(config_fpath) as f:
        data = f.read()
    config = json.loads(data)
    train_config = config["train_config"]
    data_config = config["data_config"]
    dist_config = config["dist_config"]
    vocoder_config = {
        **config["waveglow_config"],
        'win_length': data_config['win_length'],
        'hop_length': data_config['hop_length'],
    }
    print(vocoder_config)
    print(f"Config File from '{config_fpath}' successfully loaded.")

    # Import the correct model core
    if self.is_ax(vocoder_config):
        from efficient_model_ax import WaveGlow
    else:
        if vocoder_config["yoyo"]:
            from efficient_model import WaveGlow
        else:
            from glow import WaveGlow

    # Initialize model
    print("initializing WaveGlow model... ", end="")
    waveglow = WaveGlow(**vocoder_config).cuda()
    print("Done!")

    # Load checkpoint from file
    print("loading WaveGlow checkpoint... ", end="")
    checkpoint = torch.load(vocoder_path)
    # Overwrite the initialized weights with the checkpointed weights
    waveglow.load_state_dict(checkpoint['model'])
    # Move to GPU and convert to half precision
    waveglow.cuda().eval().half()
    print("Done!")

    print("initializing Denoiser... ", end="")
    denoiser = Denoiser(waveglow)
    print("Done!")

    vocoder_iters = checkpoint['iteration']
    print(f"WaveGlow trained for {vocoder_iters} iterations")
    speaker_lookup = checkpoint['speaker_lookup']  # speaker id lookup
    training_sigma = train_config['sigma']

    return waveglow, denoiser, training_sigma, speaker_lookup
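# Usage sketch (an assumption, not from the original source): reuse the
# returned training sigma at inference time. 'waveglow.pt', 'config.json' and
# 'mel.npy' are hypothetical paths; `self` is the surrounding synthesizer
# object, and this custom WaveGlow's infer() may take extra arguments.
waveglow, denoiser, sigma, speaker_lookup = self.load_waveglow('waveglow.pt', 'config.json')
mel = torch.from_numpy(np.load('mel.npy')).unsqueeze(0).cuda().half()
with torch.no_grad():
    audio = waveglow.infer(mel, sigma=sigma)
audio = denoiser(audio, strength=0.01).squeeze().float().cpu().numpy()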
def text2audio(waveglow_path, sigma, output_dir, sampling_rate, mel):
    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    denoiser = Denoiser(waveglow).cuda()

    with torch.no_grad():
        audio = waveglow.infer(mel.cuda(), sigma=sigma)
        # if denoiser_strength > 0:
        #     audio = denoiser(audio, denoiser_strength)
        # audio = audio * MAX_WAV_VALUE
    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    sf.write(os.path.join(output_dir, "pred2.wav"), audio, sampling_rate)
def tacotron2_init(self):
    self.plot_wav_data = False

    # Set parameters
    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050

    # Load tacotron2
    self.model = load_model(self.hparams)
    self.model.load_state_dict(torch.load(TACOTRON_CHECKPOINT_FILE)['state_dict'])
    _ = self.model.cuda().eval().half()

    # Load waveglow
    self.waveglow = torch.load(WAVEGLOW_CHECKPOINT_FILE)['model']
    self.waveglow.cuda().eval().half()
    for k in self.waveglow.convinv:
        k.float()
    self.denoiser = Denoiser(self.waveglow)
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()
    test_loader = DataLoader(testset, num_workers=0, shuffle=False, sampler=None,
                             batch_size=1, pin_memory=False, drop_last=True,
                             collate_fn=collate_fn)

    waveglow = torch.load(waveglow_path)['model']
    # waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, batch in enumerate(test_loader):
        text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(batch)
        enc_outputs, _ = Taco2((text_padded, input_lengths, mel_padded, max_len, output_lengths))

        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)
            # if denoiser_strength > 0:
            #     audio = denoiser(audio, denoiser_strength)
            #     audio = audio * MAX_WAV_VALUE

        print(mel)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        print(mel_path)
def main(mel_files, squeezewave_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    tic_prepare = time.time()

    mel_files = files_to_list(mel_files)
    squeezewave = torch.load(squeezewave_path)['model']
    squeezewave = squeezewave.remove_weightnorm(squeezewave)
    squeezewave.cuda().eval()
    if is_fp16:
        from apex import amp
        squeezewave, _ = amp.initialize(squeezewave, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(squeezewave).cuda()

    dur_prepare = time.time() - tic_prepare
    print("prepare model {:.2f}sec".format(dur_prepare))

    for i, file_path in enumerate(mel_files):
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        mel = torch.load(file_path)
        mel = torch.autograd.Variable(mel.cuda())
        mel = torch.unsqueeze(mel, 0)
        mel = mel.half() if is_fp16 else mel

        tic = time.time()
        with torch.no_grad():
            audio = squeezewave.infer(mel, sigma=sigma).float()
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            audio = audio * MAX_WAV_VALUE
        dur = time.time() - tic

        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        len_wav = len(audio)
        sec_wav = len_wav / sampling_rate
        samples_sec = len_wav / dur
        audio = audio.astype('int16')
        audio_path = os.path.join(output_dir, "{}_s{}.wav".format(file_name, sigma))
        write(audio_path, sampling_rate, audio)
        print("{} it took {:4.3f}sec for a {:4.3f}sec ({:4.2f}K sample) 22kHz audio file: "
              "RTF {:4.3f}, {:4.3f}x, {:4.2f}Ksamples/sec".format(
                  audio_path, dur, sec_wav, len_wav / 1000, dur / sec_wav,
                  sec_wav / dur, samples_sec / 1000))
def main(mel_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    mel_files = files_to_list(mel_files)  # list of test-set mel spectrograms
    waveglow = torch.load(waveglow_path)['model']  # load the model
    waveglow = waveglow.remove_weightnorm(waveglow)  # remove weight normalization
    # cuda() copies the model to the GPU; eval() switches to test mode
    # (dropout and BN behave differently at test time than in training)
    waveglow.cuda().eval()

    # apex acceleration
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    # denoiser_strength = 0
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, file_path in enumerate(mel_files):
        # file_name: name of the corresponding wav
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        # load the MFCC features (80 filters)
        mel = torch.load(file_path)
        # mel = {key: mel[key].cuda() for key in mel}
        # wrap the data
        mel = torch.autograd.Variable(mel.cuda())
        # 80x375 -> 1x80x375
        mel = torch.unsqueeze(mel, 0)
        # convert to fp16 so apex can accelerate
        mel = mel.half() if is_fp16 else mel
        # no gradients are tracked during inference
        with torch.no_grad():
            # produces a 1x96000 tensor; x is the raw audio, z the mel spectrogram
            audio = waveglow.infer(mel, sigma=sigma)
            if denoiser_strength > 0:
                audio = denoiser(audio, denoiser_strength)
            # scale so it can be written as wav
            audio = audio * MAX_WAV_VALUE
        # flatten to 1-D
        audio = audio.squeeze()
        # move to CPU and convert to numpy
        audio = audio.cpu().numpy()
        # change the dtype
        audio = audio.astype('int16')
        # build the output path
        audio_path = os.path.join(output_dir, "{}_synthesis.wav".format(file_name))
        # write the audio
        write(audio_path, sampling_rate, audio)
        print(audio_path)
def inference_plc(mel, waveglow, sigma, is_fp16, denoiser_strength):
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel = torch.autograd.Variable(mel.cuda())
    mel = mel.half() if is_fp16 else mel
    with torch.no_grad():
        audio = waveglow.infer(mel, sigma=sigma)
        if denoiser_strength > 0:
            audio = denoiser(audio, denoiser_strength)
        audio = audio * MAX_WAV_VALUE
    audio = audio.squeeze()
    return audio
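# Usage sketch (an assumption, not from the original source): run
# inference_plc() on a saved mel tensor. 'waveglow.pt' and 'mel.pt' are
# hypothetical paths; the returned tensor is already scaled by MAX_WAV_VALUE.
waveglow = torch.load('waveglow.pt')['model']
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.cuda().eval()
mel = torch.load('mel.pt').unsqueeze(0)
audio = inference_plc(mel, waveglow, sigma=0.6, is_fp16=False, denoiser_strength=0.01)
write('plc_out.wav', 22050, audio.cpu().numpy().astype('int16'))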
def __init__(self):
    for module_path in ('./waveglow/', './waveglow/tacotron2'):
        if module_path not in sys.path:
            sys.path.insert(0, module_path)

    # Disable deprecation warnings
    import warnings
    warnings.simplefilter('ignore')

    self.waveglow = torch.load('waveglow_256channels_ljs_v2.pt')['model']
    self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
    self.waveglow.cuda().eval()

    from denoiser import Denoiser
    self.denoiser = Denoiser(self.waveglow).cuda()

    # Re-enable warnings
    warnings.resetwarnings()
def main(tacotron2_path, waveglow_path, sigma, output_dir, sampling_rate,
         denoiser_strength, text, file_idx, inference_name, zip_file, hparams):
    hparams.sampling_rate = sampling_rate
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)
    random.seed(hparams.seed)

    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = model.cuda().eval().half()

    waveglow = torch.load(waveglow_path)['model']
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow)

    sequence = np.array(text_to_sequence(text, ['transliteration_cleaners']))[None, :]
    print(sequence)
    # sequence2 = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
    # sequence3 = np.array(text_to_sequence(text, ['korean_cleaners']))[None, :]
    # print(np.array_equal(sequence, sequence2))
    # print(np.array_equal(sequence, sequence3))
    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

    # Run inference twice to check whether decoding is deterministic
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    mel_outputs, mel_outputs_postnet2, _, alignments = model.inference(sequence)

    MAX_WAV_VALUE = 32768.0
    print(mel_outputs_postnet.cpu().data.numpy()[0][0][:30])
    print(mel_outputs_postnet2.cpu().data.numpy()[0][0][:30])
    if np.array_equal(mel_outputs_postnet.cpu().data.numpy(),
                      mel_outputs_postnet2.cpu().data.numpy()):
        print("same!!")
    else:
        print("different!!")
def __init__(self, lang):
    self.language = lang
    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050

    with open('config.json', 'r') as f:
        self.config = json.load(f)
    self.waveglow_path = self.config.get('model').get('waveglow')
    self.waveglow = torch.load(self.waveglow_path)['model']
    self.waveglow.cuda().eval().half()
    for m in self.waveglow.modules():
        if 'Conv' in str(type(m)):
            setattr(m, 'padding_mode', 'zeros')
    for k in self.waveglow.convinv:
        k.float()
    self.denoiser = Denoiser(self.waveglow)
    self.update_model(lang)