def init_model():
    print("init model!!!!")
    global tacotron2_model
    global waveglow_model
    global denoiser

    tacotron2_path = "outdir_finetune/checkpoint_62500"
    # tacotron2_path = "outdir_korean/checkpoint_8800"
    # tacotron2_path = "../models/tacotron2/outdir_korean/checkpoint_25000"
    # tacotron2_path = "../tacotron2-pytorch/outdir/checkpoint_15000"
    # tacotron2_path = "../models/tacotron2/outdir_korean/checkpoint_15000"
    # tacotron2_path = "outdir_lj_korean/checkpoint_5000"
    # tacotron2_path = "outdir_longtrain/checkpoint_439500"
    waveglow_path = "../waveglow-fix/checkpoints_finetune/waveglow_478000"
    # waveglow_path = "../waveglow/checkpoints/waveglow_335000"
    # waveglow_path = "../waveglow-fix/checkpoints_longtrain/waveglow_484000"
    sampling_rate = 22050
    denoiser_strength = 0.0

    hparams = create_hparams()
    hparams.sampling_rate = sampling_rate
    hparams.training = False

    tacotron2_model = load_model(hparams)
    tacotron2_model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
    _ = tacotron2_model.cuda().eval().half()

    # with open("waveglow/config.json") as f:
    #     data = f.read()
    # import json
    # config = json.loads(data)
    # waveglow_config = config["waveglow_config"]
    # waveglow_model = glow.WaveGlow(**waveglow_config)
    # checkpoint_dict = torch.load(waveglow_path, map_location='cpu')
    # model_for_loading = checkpoint_dict['model']
    # waveglow_model.load_state_dict(model_for_loading.state_dict())
    # # waveglow_model.load_state_dict(torch.load(waveglow_path)['state_dict'])
    # waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    # waveglow_model.cuda().eval().half()

    waveglow_model = torch.load(waveglow_path)['model']
    waveglow_model = waveglow_model.remove_weightnorm(waveglow_model)
    waveglow_model.cuda().eval().half()
    for k in waveglow_model.convinv:
        k.float()
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow_model)

def test_overlay_first_samples():
    hparams = create_hparams()
    # test 100 random combinations
    for _ in range(100):
        hparams.batch_factor = random.randint(1, 32)
        hparams.horizon = random.randint(1, 10)
        subscale = Subscaler(hparams)
        batch_dim = random.randint(1, 2)
        lensrc = subscale.context_len * random.randint(1, 10)
        indices = torch.arange(lensrc).repeat(batch_dim, 1)
        pos = random.randint(0, 10)
        run_overlay(subscale, indices, pos)

def tacotron2_init(self):
    self.plot_wav_data = False

    # set parameters
    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050

    # load tacotron2
    self.model = load_model(self.hparams)
    self.model.load_state_dict(torch.load(TACOTRON_CHECKPOINT_FILE)['state_dict'])
    _ = self.model.cuda().eval().half()

    # load waveglow
    self.waveglow = torch.load(WAVEGLOW_CHECKPOINT_FILE)['model']
    self.waveglow.cuda().eval().half()
    for k in self.waveglow.convinv:
        k.float()
    self.denoiser = Denoiser(self.waveglow)

def load_tts_vocoder_models(tacotron_checkpoint_path, waveglow_checkpoint_path):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(tacotron_checkpoint_path)['state_dict'])
    _ = model.cuda().eval()

    waveglow = torch.load(waveglow_checkpoint_path)['model']
    waveglow.cuda().eval()
    # for k in waveglow.convinv:
    #     k.float()
    denoiser = Denoiser(waveglow)
    return model, waveglow, denoiser, hparams

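# Illustrative usage of load_tts_vocoder_models (added sketch, not from the
# original source): both checkpoint paths are placeholders, and
# text_to_sequence is the same helper used by the other snippets in this file.
def example_load_and_synthesize(text):
    model, waveglow, denoiser, hparams = load_tts_vocoder_models(
        'tacotron2_statedict.pt',      # placeholder Tacotron 2 checkpoint
        'waveglow_256channels.pt')     # placeholder WaveGlow checkpoint
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).cuda().long()
    _, mel_outputs_postnet, _, _ = model.inference(sequence)
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    return audio[0].data.cpu().numpy()
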
def main(text_files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength):
    hparams = create_hparams()
    Taco2 = load_pretrained_taco('tacotron2.pt', hparams)

    testset = TextMelLoader(text_files, hparams)
    collate_fn = TextMelCollate()
    test_loader = DataLoader(testset, num_workers=0, shuffle=False,
                             sampler=None, batch_size=1, pin_memory=False,
                             drop_last=True, collate_fn=collate_fn)

    waveglow = torch.load(waveglow_path)['model']
    # waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow.cuda().eval()
    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O3")
    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    for i, batch in enumerate(test_loader):
        text_padded, input_lengths, mel_padded, max_len, output_lengths = parse_batch(batch)
        enc_outputs, _ = Taco2(
            (text_padded, input_lengths, mel_padded, max_len, output_lengths))

        # mel = torch.autograd.Variable(mel.cuda())
        # mel = torch.unsqueeze(mel, 0)
        # mel = mel.half() if is_fp16 else mel
        with torch.no_grad():
            mel = waveglow.infer(enc_outputs, input_lengths, sigma=sigma)
            # if denoiser_strength > 0:
            #     audio = denoiser(audio, denoiser_strength)
            # audio = audio * MAX_WAV_VALUE
        # audio = audio.squeeze()
        # mel = mel.cpu().numpy()
        # audio = audio.astype('int16')
        print(mel)
        mel = mel.squeeze()
        print(mel.size())
        mel_path = os.path.join(output_dir, "{}_synthesis.pt".format(i))
        torch.save(mel, mel_path)
        print(mel_path)

def test_stack_flatten_parity():
    hparams = create_hparams()
    for _ in range(100):
        hparams.batch_factor = random.randint(1, 32)
        hparams.horizon = random.randint(1, 10)
        seq_len = random.randint(1, 10)
        n_channels = random.randint(1, 1000)
        subscale = Subscaler(hparams)
        batch_dim = random.randint(1, 16)
        tensor = torch.rand(
            [batch_dim, seq_len * subscale.context_len, n_channels])
        permuted = subscale.stack_substensors(tensor)
        orig = subscale.flatten_subtensors(permuted)
        assert torch.eq(tensor, orig).all()

def main(_):
    fh = FieldHandler(train_file_path=FLAGS.train_file_path,
                      category_columns=FLAGS.category_columns,
                      continuation_columns=FLAGS.continuation_columns)
    features, labels = transformation_data(file_path=FLAGS.train_file_path,
                                           field_hander=fh, label=FLAGS.label)
    # features, labels, files_dict = dataGenerate(FLAGS.train_file_path)
    hparams = create_hparams(fh.field_nums, fh.feature_nums)
    train_input_fn = create_train_input_fn(features, label=labels,
                                           batch_size=hparams.batch_size,
                                           num_epochs=hparams.epoches)

    if hparams.model == "fm":
        model_fn = create_model_fn(FM)
    elif hparams.model == "ffm":
        if hparams.use_deep:
            tf.logging.warning("\n\n>>>>>>>>>>> use ffm model, ignore --use_deep params. <<<<<<<<<<<<<<<\n")
        model_fn = create_model_fn(FFM)
    else:
        raise ValueError("model must be 'fm' or 'ffm'.")

    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=FLAGS.model_path,
        params=hparams,
        config=tf.estimator.RunConfig(
            tf_random_seed=hparams.seed,
            log_step_count_steps=500
        )
    )

    show_dict = {
        "loss": "loss",
        "accuracy": "accuracy/value",
        "auc": "auc/value"
    }
    log_hook = tf.train.LoggingTensorHook(show_dict, every_n_iter=100)
    # estimator.train(input_fn=train_input_fn, hooks=[log_hook])
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, hooks=[log_hook])
    eval_spec = tf.estimator.EvalSpec(input_fn=train_input_fn)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

def main():
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = get_Tacotron2(hparams)
    waveglow = get_WaveGlow()

    # text = "Waveglow is really awesome!"
    texts = [
        "PRIH1NTIH0NG , IH0N TH AO1NLIY0 SEH1NS WIH1TH HHWIH1CH W AA1R AE1T PRIY0ZEH1NT KAH0NSER1ND , DIH1FER0Z FRAH1M MOW2ST IH1F NAA1T FRAH1M AH0L TH AA1RTS AE1ND KRAE1FTS REH2PRIH0ZEH1NTIH0D IH0N TH EH2KSAH0BIH1SHAH0N",
        "AE1ND DIH0TEY1LIH0NG PAH0LIY1S IH0N SAH0VIH1LYAH0N KLOW1DHZ TOW0 B SKAE1TER0D THRUW0AW1T TH SAY1ZAH0BAH0L KRAW1D .",
        "AY1 LAH1V YUW1 VEH1RIY0 MAH1CH",
        "SAY1AH0NTIH0STS AE1T TH SER1N LAE1BRAH0TAO2RIY0 SEY1 DHEY1 HHAE1V DIH0SKAH1VER0D AH0 NUW1 PAA1RTAH0KAH0L .",
        "PREH1ZIH0DAH0NT TRAH1MP MEH1T WIH1TH AH1DHER0 LIY1DER0Z AE1T TH GRUW1P AH1V TWEH1NTIY0 KAA1NFER0AH0NS .",
        "LEH1TS GOW1 AW2T TOW0 TH EH1RPAO2RT . TH PLEY1N LAE1NDAH0D TEH1N MIH1NAH0TS AH0GOW2 .",
        "IH0N BIY1IH0NG KAH0MPEH1RAH0TIH0VLIY0 MAA1DER0N .",
        "VIH1PKIH0D",
        "VIH1P KIH0D"
    ]

    if not os.path.exists("results"):
        os.mkdir("results")

    for text in texts:
        sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).cuda().long()
        mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
        plot_data((mel_outputs.float().data.cpu().numpy()[0],
                   mel_outputs_postnet.float().data.cpu().numpy()[0],
                   alignments.float().data.cpu().numpy()[0].T), text[:10])
        # print("mel_out:", mel_outputs)
        # print("mel_out_postnet:", mel_outputs_postnet)
        # print("alignments:", alignments)
        with torch.no_grad():
            audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
        audio = audio * hparams.max_wav_value
        audio = audio.squeeze()
        audio = audio.cpu().numpy()
        audio = audio.astype('int16')
        write("results/{}_synthesis.wav".format(text), hparams.sampling_rate, audio)
        print("complete:", text)

def main():
    parse = argparse.ArgumentParser()
    parse.add_argument(
        '-f', '--feature_file', type=str,
        default='E:/Research/Synthesis/BZNSYP/ttt/009915.feature.f32',
        help='features file to synthesize from')
    parse.add_argument(
        '-o', '--out_file', type=str,
        default='E:/Research/Synthesis/BZNSYP/ttt/009915.feature.f32.s16',
        help='output file to write')
    args = parse.parse_args()

    hparams = create_hparams()
    synthesis(args, hparams)

def load_latest_model_from(location):
    files = [location + "/" + f for f in os.listdir(location)]
    newest_file = max(files, key=os.path.getctime)
    print("load model " + newest_file)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    hparam = hparams.create_hparams()
    model = WaveNetModel(hparam, device).to(device)

    if torch.cuda.is_available():
        states = torch.load(newest_file)
    else:
        states = torch.load(newest_file, map_location='cpu')
    model.load_state_dict(states['state_dict'])
    return model

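# Illustrative usage of load_latest_model_from (added sketch): 'checkpoints'
# is a placeholder directory that must contain at least one state file saved
# as {'state_dict': ...}; the newest file by creation time is picked up.
def example_load_latest():
    model = load_latest_model_from('checkpoints')
    model.eval()
    return model
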
def load_mel(path):
    hparams = create_hparams()
    stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                        hparams.win_length, hparams.n_mel_channels,
                        hparams.sampling_rate, hparams.mel_fmin,
                        hparams.mel_fmax)
    audio, sampling_rate = librosa.core.load(path, sr=hparams.sampling_rate)
    audio = torch.from_numpy(audio)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = melspec.cpu()
    return melspec

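# Illustrative usage of load_mel (added sketch): 'sample.wav' is a placeholder
# path; librosa resamples to hparams.sampling_rate on load, and the result is
# a mel spectrogram tensor of shape (1, n_mel_channels, n_frames).
def example_load_mel():
    melspec = load_mel('sample.wav')
    print(melspec.shape)
    return melspec
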
def __init__(self, ckpt, wglw, n_speakers=123):
    print("[Loading Model]")
    self.ckpt = ckpt
    self.hparams = create_hparams()
    self.hparams.n_speakers = n_speakers
    self.stft = TacotronSTFT(self.hparams.filter_length, self.hparams.hop_length,
                             self.hparams.win_length, self.hparams.n_mel_channels,
                             self.hparams.sampling_rate, self.hparams.mel_fmin,
                             self.hparams.mel_fmax)
    self.mellotron = load_model(self.hparams).cuda().eval()
    self.waveglow = torch.load(wglw)['model'].cuda().eval()
    self.denoiser = Denoiser(self.waveglow).cuda().eval()
    self.arpabet_dict = cmudict.CMUDict('data/cmu_dictionary')
    self.mellotron.load_state_dict(torch.load(ckpt)['state_dict'])
    print('[Loaded Model]')

def main(unused_arg):
    model_fn = model.create_model_fn(hp.create_hparams())
    estimator = tf.contrib.learn.Estimator(model_fn=model_fn,
                                           model_dir=FLAGS.model_dir,
                                           config=tf.contrib.learn.RunConfig())
    input_fn = input.create_input_fn([TEST_FILE_PATH],
                                     tf.contrib.learn.ModeKeys.EVAL,
                                     FLAGS.test_batch_size, 1)
    eval_metrics = metrics.create_evaluation_metrics()
    estimator.evaluate(input_fn=input_fn,
                       batch_size=FLAGS.test_batch_size,
                       metrics=eval_metrics, steps=None)

def main(text, checkpoint_path, path, name):
    #### Setup hparams
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    #### Load model from checkpoint
    model = get_model(hparams, checkpoint_path)

    #### Prepare text input
    sequence = get_input(get_pinyin(text))

    #### inference
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, drop_prob=0.25)

    #### tacotron result
    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    write(os.path.join(path, name) + '_tacotron.wav', 16000,
          waveform[0].data.cpu().numpy())

    #### transform tacotron mel to wavenet mel
    wavenet_mel = to_wavenet_mel(mel_outputs_postnet.data.cpu().numpy()[0].T)

    #### save
    np.save(os.path.join(path, name) + '_mel.npy',
            mel_outputs_postnet.data.cpu().numpy()[0])
    np.save(os.path.join(path, name) + '_alig.npy',
            alignments.data.cpu().numpy()[0])
    np.save(os.path.join(path, name) + '.npy', wavenet_mel)

def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    index = 1
    tacotron_model = model(create_hparams(), './output/checkpoint_20500')
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for line in tqdm(f):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            futures.append(
                executor.submit(
                    partial(_process_utterance, out_dir, index, wav_path,
                            text, tacotron_model)))
            index += 1
    return [future.result() for future in tqdm(futures)]

def __init__(self, lang):
    self.language = lang
    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050

    with open('config.json', 'r') as f:
        self.config = json.load(f)
    self.waveglow_path = self.config.get('model').get('waveglow')

    self.waveglow = torch.load(self.waveglow_path)['model']
    self.waveglow.cuda().eval().half()
    for m in self.waveglow.modules():
        if 'Conv' in str(type(m)):
            setattr(m, 'padding_mode', 'zeros')
    for k in self.waveglow.convinv:
        k.float()
    self.denoiser = Denoiser(self.waveglow)

    self.update_model(lang)

def getAudio(text):
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    waveglow_path = os.path.join(__location__, 'waveglow_256channel.pt')
    waveglow = torch.load(waveglow_path, map_location='cpu')['model']
    waveglow.cpu().eval()
    for k in waveglow.convinv:
        k.float()
    # denoiser = Denoiser(waveglow)

    checkpoint_path = os.path.join(__location__, "checkpoint_9000")
    model = load_model(hparams)
    # print(model)
    state = torch.load(checkpoint_path, map_location='cpu')['state_dict']
    # print(state)
    model.load_state_dict(state)
    _ = model.cpu().eval()

    # Sample Vietnamese input:
    # text = "Bộ Y tế chỉ đạo Viện Vệ sinh dịch tễ và các địa phương điều tra dịch tễ các trường hợp có kết quả xét nghiệm dương tính, xác minh người tiếp xúc gần với bệnh nhân dương tính, khoanh vùng xử lý ổ dịch và cách ly theo dõi sức khỏe những người tiếp xúc."
    text = TTSnorm(text)
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(torch.from_numpy(sequence)).long()
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    # plot_data((mel_outputs.float().data.cpu().numpy()[0],
    #            mel_outputs_postnet.float().data.cpu().numpy()[0],
    #            alignments.float().data.cpu().numpy()[0].T))
    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)

    #########
    # The denoiser step is skipped because it has to run on a GPU.
    #########
    text_hashed = abs(hash(text)) % (10 ** 8)
    sd.write("static/audio/" + str(text_hashed) + '.wav',
             audio[0].data.cpu().numpy(), 22050)
    return text

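# Illustrative usage of getAudio (added sketch): the Vietnamese input below is
# a placeholder; the wav is written under static/audio/ keyed by a hash of the
# normalized text, and the normalized text is returned.
def example_get_audio():
    normalized = getAudio("Xin chào")
    print(normalized)
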
def __init__(self, model_choice):
    self.model_choice = model_choice
    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050

    with open('config.json', 'r') as f:
        self.config = json.load(f)
    self.max_duration_s = self.config.get('max_duration_s')
    self.hparams.max_decoder_steps = int(86.0 * self.max_duration_s)

    self.waveglow = torch.load('models/waveglow',
                               map_location=torch.device('cpu'))['model']
    self.waveglow.eval()
    for m in self.waveglow.modules():
        if 'Conv' in str(type(m)):
            setattr(m, 'padding_mode', 'zeros')
    for k in self.waveglow.convinv:
        k.float()
    # self.denoiser = Denoiser(self.waveglow)

    self.update_model(model_choice, self.max_duration_s)

def load(self, tacotron_model, waveglow_model):
    # setting
    self.project_name = 'tacotron2'
    sys.path.append(self.project_name)
    sys.path.append(join(self.project_name, 'waveglow/'))

    # initialize Tacotron2
    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050
    self.hparams.max_decoder_steps = 1000
    self.hparams.fp16_run = True
    self.tacotron = Tacotron2(self.hparams)
    self.tacotron.load_state_dict(torch.load(tacotron_model)['state_dict'])
    _ = self.tacotron.cuda().eval()

    self.waveglow = torch.load(waveglow_model)['model']
    self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
    _ = self.waveglow.cuda().eval()
    for k in self.waveglow.convinv:
        k.float()

def synth(models, text, out):
    hparams = create_hparams()

    checkpoint_path = models + '/tacotron2'
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.eval()

    waveglow_path = models + '/waveglow'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda()

    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    with torch.no_grad():
        audio = 32768.0 * waveglow.infer(mel_outputs_postnet, sigma=0.666)[0]
    audio = audio.cpu().numpy()
    audio = audio.astype('int16')
    write(out, 8000, audio)

def infer(checkpoint_path, griffin_iters, text, out_filename):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval()  # .half()

    sequence = np.array(text_to_sequence(text, ['chinese_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                        taco_stft.stft_fn, griffin_iters)
    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    # audio = audio.astype('int16')

    audio_path = os.path.join('samples', "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)
    plot_alignment_to_numpy(
        alignments.squeeze().cpu().detach().numpy().T,
        os.path.join('samples', "{}_attention.png".format(out_filename)))

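# Illustrative usage of the Griffin-Lim infer above (added sketch): the
# checkpoint path and Chinese input text are placeholders; 60 iterations
# matches the Griffin-Lim setting used elsewhere in this file.
def example_griffin_lim_infer():
    infer('checkpoint_10000', 60, '你好，世界。', 'hello_world')
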
def main(argv):
    args = utils.parse_args("Train a transformer model")
    utils.redirect_log_to_file(args.model_dir)
    hparams = create_hparams(args.model_dir, args.configs, initialize=True)
    utils.check_git_hash(args.model_dir)

    # Prepare data
    data.load_vocab(hparams)
    train_input_fn = data.InputPipeline(None, None, hparams.record_train_file,
                                        tf.estimator.ModeKeys.TRAIN, hparams)
    eval_input_fn = data.InputPipeline(None, None, hparams.record_eval_file,
                                       tf.estimator.ModeKeys.EVAL, hparams)

    # Training
    log_samples_hook = tf.train.LoggingTensorHook(
        ['targets', 'predictions'], at_end=True,
        formatter=tensors_to_string(hparams))
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=hparams.train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn,
                                      steps=hparams.eval_steps,
                                      hooks=[log_samples_hook])

    distribution = tf.contrib.distribute.MirroredStrategy()
    run_config = tf.estimator.RunConfig(
        model_dir=args.model_dir,
        train_distribute=distribution,
        save_summary_steps=hparams.save_summary_steps,
        save_checkpoints_steps=hparams.save_checkpoints_steps,
        keep_checkpoint_max=hparams.n_checkpoints)
    estimator = tf.estimator.Estimator(model_fn=model.build_model_fn(hparams),
                                       config=run_config,
                                       model_dir=args.model_dir)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)

def infer(checkpoint_path, waveglow_path, text, save_path):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    # checkpoint_path = "tacotron2_statedict.pt"
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()

    # waveglow_path = 'waveglow_256channels.pt'
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    for k in waveglow.convinv:
        k.float()
    # denoiser = Denoiser(waveglow)

    # text = "Waveglow is really awesome!"
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    plot_data((mel_outputs.float().data.cpu().numpy()[0],
               mel_outputs_postnet.float().data.cpu().numpy()[0],
               alignments.float().data.cpu().numpy()[0].T),
              save_path=save_path)

    with torch.no_grad():
        audio = waveglow.infer(mel_outputs_postnet.half(), sigma=0.666)
    # ipd.Audio(audio[0].data.cpu().numpy(), rate=hparams.sampling_rate)
    audio = audio.cpu().numpy()[0]
    # normalize audio for now
    audio = audio / np.abs(audio).max()
    print(audio.shape)
    write(os.path.join(save_path, 'test{}.wav'.format(1)),
          hparams.sampling_rate, audio)

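# Illustrative usage of the WaveGlow infer above (added sketch): both
# checkpoint paths are placeholders, and 'samples' must be an existing
# directory that receives the plot and the wav output.
def example_waveglow_infer():
    infer('tacotron2_statedict.pt', 'waveglow_256channels.pt',
          'Waveglow is really awesome!', 'samples')
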
def reload_model(self):
    TTmodel_fpath = self.get_current_TTmodel_dir()
    WGmodel_fpath = self.get_current_WGmodel_dir()

    # Setup hparams
    self.hparams = create_hparams()
    self.hparams.sampling_rate = 22050

    # Load Tacotron 2 from checkpoint
    self.model = load_model(self.hparams, self.use_cuda)
    device = torch.device('cuda' if self.use_cuda else 'cpu')
    self.model.load_state_dict(
        torch.load(TTmodel_fpath, map_location=device)['state_dict'])
    if self.use_cuda:
        _ = self.model.cuda().eval().half()
    else:
        _ = self.model.eval()

    # Load WaveGlow for mel2audio synthesis and denoiser
    self.waveglow = torch.load(WGmodel_fpath, map_location=device)['model']
    self.waveglow.use_cuda = self.use_cuda
    if self.use_cuda:
        self.waveglow.cuda().eval().half()
    else:
        self.waveglow.eval()
    for k in self.waveglow.convinv:
        k.float()

parser.add_argument('--rank', type=int, default=0, required=False,
                    help='rank of current gpu')
parser.add_argument('--group_name', type=str, default='group_name',
                    required=False, help='Distributed group name')
parser.add_argument('--hparams', type=str, required=False,
                    help='comma separated name=value pairs')
args = parser.parse_args()

hparams = create_hparams(args.hparams)
torch.backends.cudnn.enabled = hparams.cudnn_enabled
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark

print("FP16 Run:", hparams.fp16_run)
print("Dynamic Loss Scaling:", hparams.dynamic_loss_scaling)
print("Distributed Run:", hparams.distributed_run)
print("cuDNN Enabled:", hparams.cudnn_enabled)
print("cuDNN Benchmark:", hparams.cudnn_benchmark)
print("Time warping: ", hparams.mel_time_warping)
print("Freq warping: ", hparams.mel_freq_warping)

# train(args.output_directory, args.log_directory, args.checkpoint_path,
#       args.warm_start, args.n_gpus, args.rank, args.group_name, hparams)
train("./check_point", "./logs", None, args.warm_start, 4, args.rank,
      args.group_name, hparams)

from pathlib import Path
from functools import partial
from multiprocessing.pool import Pool
from matplotlib import pyplot as plt
from tqdm import tqdm
import collections as clt
import os
import re
import json
import numpy as np
import shutil

from data_utils import TextMelLoader
from hparams import create_hparams

hp = create_hparams()

metadata_path = None
text_mel_loader = None
output_dir = None


def format_index(index):
    return '{:06d}'.format(index)


def process_one(index, skip_existing=False):
    global text_mel_loader
    global metadata_path
    global output_dir
    if text_mel_loader is None:

def run(output_dir, ckpt_path):
    model = load_model(hparams)
    checkpoint_dict = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(checkpoint_dict['state_dict'])

    train_loader, valset, collate_fn, train_sampler = prepare_dataloaders(hparams)

    model.eval()
    for batch in tqdm(train_loader):
        text, _, mel, _, _, _, fname = batch
        mel_pred, attn = model.inference((text.cuda(), mel.cuda()))
        output_fname = fname[0].replace('.wav', '-kkr2.mel')
        mel = mel_pred[0].data.cpu().numpy()
        np.save(output_fname, mel)


if __name__ == '__main__':
    output_dir = 'data-bin/mel_train-clean-100'
    ckpt_path = 'models/gst_tacotron_baseline_pretrained/checkpoint_45000'
    hparams = create_hparams()
    hparams.batch_size = 1
    run(output_dir, ckpt_path)

default="", required=False, help="gpu's indices for distributed run (separated by commas)") parser.add_argument("--gpu_idx", type=int, default=0, required=False, help="device index for the current run") parser.add_argument("--group_name", type=str, default="group_name", required=False, help="Distributed group name") args = parser.parse_args() hparams = create_hparams(args.hparams_path) hparams.path = args.hparams_path n_gpus = 0 rank = 0 if args.distributed_run: assert args.gpus_ranks gpus_ranks = { elem: i for i, elem in enumerate( int(elem) for elem in args.gpus_ranks.split(",")) } n_gpus = len(gpus_ranks) rank = gpus_ranks[args.gpu_idx]
tf.flags.DEFINE_string("test_file", "./data/test.tfrecords", "Path of test data in TFRecords format") tf.flags.DEFINE_string("model_dir", None, "Directory to load model checkpoints from") tf.flags.DEFINE_integer("loglevel", 20, "Tensorflow log level") tf.flags.DEFINE_integer("test_batch_size", 16, "Batch size for testing") FLAGS = tf.flags.FLAGS if not FLAGS.model_dir: print("You must specify a model directory") sys.exit(1) tf.logging.set_verbosity(FLAGS.loglevel) if __name__ == "__main__": hparams = hparams.create_hparams() model_fn = model.create_model_fn(hparams, model_impl=dual_encoder_model) estimator = tf.contrib.learn.Estimator(model_fn=model_fn, model_dir=FLAGS.model_dir, config=tf.contrib.learn.RunConfig()) input_fn_test = inputs.create_input_fn(mode=tf.contrib.learn.ModeKeys.EVAL, input_files=[FLAGS.test_file], batch_size=FLAGS.test_batch_size, num_epochs=1) eval_metrics = metrics.create_evaluation_metrics() estimator.evaluate(input_fn=input_fn_test, steps=None, metrics=eval_metrics)
def main():
    tf.logging.set_verbosity(tf.logging.INFO)
    hparams = create_hparams()

    for path in [args.train_log_dir]:
        if not tf.gfile.Exists(path):
            tf.gfile.MakeDirs(path)
    hparams_filename = os.path.join(args.train_log_dir, 'hparams.json')
    with tf.gfile.FastGFile(hparams_filename, 'w') as f:
        f.write(hparams.to_json())

    with tf.Graph().as_default():
        with tf.device(tf.train.replica_device_setter(args.task_id)):
            global_step = tf.train.get_or_create_global_step()

            colors, depths, labels, label_augs = get_dataset(
                args.dataset_dir, args.num_readers,
                args.num_preprocessing_threads, hparams)

            net, end_points = model(colors, depths,
                                    num_classes=3,
                                    num_channels=1000,
                                    is_training=True,
                                    global_pool=False,
                                    output_stride=16,
                                    spatial_squeeze=False,
                                    color_scope='color_tower',
                                    depth_scope='depth_tower',
                                    scope='arcnet')
            loss = create_loss(net, labels, hparams.lamb)
            # loss = create_loss_without_background(net, labels)

            learning_rate = hparams.learning_rate
            if hparams.lr_decay_step:
                learning_rate = tf.train.exponential_decay(
                    hparams.learning_rate,
                    tf.train.get_or_create_global_step(),
                    decay_steps=hparams.lr_decay_step,
                    decay_rate=hparams.lr_decay_rate,
                    staircase=True)
            tf.summary.scalar('Learning_rate', learning_rate)
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
            train_op = slim.learning.create_train_op(loss, optimizer)

            add_summary(colors, depths, labels, end_points, loss, scope='arcnet')
            summary_op = tf.summary.merge_all()

            if not args.from_arcnet_checkpoint:
                color_variable_map, depth_variable_map = restore_from_classification_checkpoint(
                    color_scope='color_tower',
                    depth_scope='depth_tower',
                    model_name=hparams.model_name,
                    checkpoint_exclude_scopes=['arcnet'])
                color_saver = tf.train.Saver(color_variable_map)
                depth_saver = tf.train.Saver(depth_variable_map)

                def initializer_fn(sess):
                    color_saver.restore(
                        sess, os.path.join(args.checkpoint_dir,
                                           hparams.model_name + '.ckpt'))
                    depth_saver.restore(
                        sess, os.path.join(args.checkpoint_dir,
                                           hparams.model_name + '.ckpt'))
                    tf.logging.info('Successfully load pretrained checkpoint.')

                init_fn = initializer_fn
            else:
                variable_map = restore_map()
                init_saver = tf.train.Saver(variable_map)

                def initializer_fn(sess):
                    init_saver.restore(
                        sess, tf.train.latest_checkpoint(args.checkpoint_dir))
                    tf.logging.info('Successfully load pretrained checkpoint.')

                init_fn = initializer_fn

            session_config = tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=False)
            session_config.gpu_options.allow_growth = True
            saver = tf.train.Saver(
                keep_checkpoint_every_n_hours=args.save_interval_secs,
                max_to_keep=100)

            slim.learning.train(train_op,
                                logdir=args.train_log_dir,
                                master=args.master,
                                global_step=global_step,
                                session_config=session_config,
                                init_fn=init_fn,
                                summary_op=summary_op,
                                number_of_steps=args.num_steps,
                                startup_delay_steps=15,
                                save_summaries_secs=args.save_summaries_steps,
                                saver=saver)