def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cpu()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)
def save_checkpoint(model, optimizer, epoch, filepath):
    print(f'Saving model and optimizer state at epoch {epoch} to {filepath}')
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'epoch': epoch,
                'optimizer': optimizer.state_dict()}, filepath)
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({"model": model_for_saving,
                "iteration": iteration,
                "optimizer": optimizer.state_dict(),
                "learning_rate": learning_rate}, filepath)
def save_checkpoint(model, optimizer, amp, iteration, filepath):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    # Also capture the RNG state so a resumed run can continue deterministically.
    checkpoint = {'model': model_for_saving,
                  'iteration': iteration,
                  'optimizer': optimizer.state_dict(),
                  'cuda_rng_state_all': torch.cuda.get_rng_state_all(),
                  'random_rng_state': torch.random.get_rng_state()}
    if amp is not None:
        checkpoint['amp'] = amp.state_dict()
    torch.save(checkpoint, filepath)
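# A matching loader for the checkpoint format above might look like the
# following sketch. It assumes the same module-level WaveGlow setup and apex
# `amp` object; the function name, argument order, and return values are
# illustrative, not taken from the original source.
def load_checkpoint(filepath, model, optimizer, amp=None):
    checkpoint = torch.load(filepath, map_location='cpu')
    model.load_state_dict(checkpoint['model'].state_dict())
    optimizer.load_state_dict(checkpoint['optimizer'])
    # Restore RNG state saved by save_checkpoint.
    torch.random.set_rng_state(checkpoint['random_rng_state'])
    torch.cuda.set_rng_state_all(checkpoint['cuda_rng_state_all'])
    if amp is not None and 'amp' in checkpoint:
        amp.load_state_dict(checkpoint['amp'])
    return model, optimizer, checkpoint['iteration']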
os.mkdir("results") audio.save_wav(wav[0].data.cpu().numpy(), os.path.join("results", str(num) + ".wav")) if __name__ == "__main__": # Test device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') torch.manual_seed(hp.seed) torch.cuda.manual_seed(hp.seed) model = WaveGlow().cuda() checkpoint = torch.load('test/TTSglow_130000') model.load_state_dict(checkpoint['model'].state_dict()) dataset = FastSpeechDataset() testing_loader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn, drop_last=True, num_workers=4) model = model.eval() for i, data_of_batch in enumerate(testing_loader): src_seq = data_of_batch["texts"] src_pos = data_of_batch["pos"] src_seq = torch.from_numpy(src_seq).long().to(device)
def main(files, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    # Hard-coded test clip (overrides the `files` argument).
    files = ['/local-scratch/fuyang/cmpt726/final_project/cremad/1091_WSI_SAD_XX.wav']

    with open('config.json') as f:
        config = json.loads(f.read())
    waveglow_config = config["waveglow_config"]

    # Load the pretrained universal WaveGlow. Unlike the stock inference path,
    # weight norm is kept (no remove_weightnorm) because the forward
    # (analysis) pass is run below.
    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model.load_state_dict(checkpoint_dict['model'].state_dict())
    model.cuda()
    waveglow = model

    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)

    for i, file_path in enumerate(files):
        audio, rate = load_wav_to_torch(file_path)
        if rate != sampling_rate:
            audio = resampy.resample(audio.numpy(), rate, sampling_rate)
            audio = torch.from_numpy(audio).float()

        mel = mel_extractor.get_mel(audio)
        audio = audio / MAX_WAV_VALUE
        mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
        audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
        audio = audio.half() if is_fp16 else audio
        mel = mel.half() if is_fp16 else mel

        # Forward (analysis) pass: encode the real audio into latents
        # conditioned on its own mel. outputs[0] holds all 8 group channels;
        # the last 4 are the ones emitted by the final flows.
        outputs = waveglow((mel, audio))
        z = outputs[0][:, 4:]

        # Upsample the mel to audio rate and fold it into groups of n_group,
        # mirroring WaveGlow.infer().
        mel_up = waveglow.upsample(mel)
        time_cutoff = (waveglow.upsample.kernel_size[0]
                       - waveglow.upsample.stride[0])
        mel_up = mel_up[:, :, :-time_cutoff]
        mel_up = mel_up.unfold(2, waveglow.n_group,
                               waveglow.n_group).permute(0, 2, 1, 3)
        mel_up = mel_up.contiguous().view(mel_up.size(0), mel_up.size(1),
                                          -1).permute(0, 2, 1)

        audio = z
        mel_up = mel_up[:, :, :audio.size(2)]
        sigma = 0.7  # overrides the sigma argument
        z_i = 0
        # Invert the flows to decode the latents back into a waveform.
        for k in reversed(range(waveglow.n_flows)):
            n_half = int(audio.size(1) / 2)
            audio_0 = audio[:, :n_half, :]
            audio_1 = audio[:, n_half:, :]

            output = waveglow.WN[k]((audio_0, mel_up))
            s = output[:, n_half:, :]
            b = output[:, :n_half, :]
            audio_1 = (audio_1 - b) / torch.exp(s)
            audio = torch.cat([audio_0, audio_1], 1)
            audio = waveglow.convinv[k](audio, reverse=True)

            if k % waveglow.n_early_every == 0 and k > 0:
                # Standard inference would sample fresh Gaussian noise here;
                # instead, re-inject the early-output latents recorded during
                # the forward pass.
                z = outputs[0][:, 2 - z_i:4 - z_i]
                audio = torch.cat((sigma * z, audio), 1)
                # Bug fix: advance to the next early slice (z_i was never
                # incremented, so the same slice was reused).
                z_i += 2

        audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0),
                                                         -1).data
        audio = audio * MAX_WAV_VALUE
        audio = audio.squeeze().cpu().numpy().astype('int16')

        audio_path = os.path.join(output_dir,
                                  "{}_synthesis.wav".format('fuyangz'))
        write(audio_path, sampling_rate, audio)
        print(audio_path)
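# The original entry point for this script is not shown; a minimal argparse
# wrapper for main() might look like this sketch (flag names and defaults
# are assumptions).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--files', nargs='*', default=[])
    parser.add_argument('-w', '--waveglow_path',
                        default='waveglow_256channels_universal_v5.pt')
    parser.add_argument('-s', '--sigma', type=float, default=0.7)
    parser.add_argument('-o', '--output_dir', default='.')
    parser.add_argument('-r', '--sampling_rate', type=int, default=22050)
    parser.add_argument('--is_fp16', action='store_true')
    parser.add_argument('-d', '--denoiser_strength', type=float, default=0.0)
    args = parser.parse_args()

    main(args.files, args.waveglow_path, args.sigma, args.output_dir,
         args.sampling_rate, args.is_fp16, args.denoiser_strength, args)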
def main(style, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    dataset = voice_dataset(dataBase={'ravdess': './our_data/ravdess',
                                      'cremad': './our_data/cremad'},
                            style=('happy', 'sad', 'angry'))
    files = dataset.final_data[style]

    with open('config.json') as f:
        config = json.loads(f.read())
    waveglow_config = config["waveglow_config"]

    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model.load_state_dict(checkpoint_dict['model'].state_dict())
    model.cuda()
    waveglow = model

    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)

    # Sum of per-clip latent means over up to 50 clips of the requested
    # style (one entry per group channel); averaged at the end.
    avg_z = np.zeros(8)
    _count = 0
    for i, (_, file_path) in enumerate(files):
        if i > 50:
            break
        try:
            audio, rate = load_wav_to_torch(file_path)
            if rate != sampling_rate:
                audio = resampy.resample(audio.numpy(), rate, sampling_rate)
                audio = torch.from_numpy(audio).float()

            mel = mel_extractor.get_mel(audio)
            audio = audio / MAX_WAV_VALUE
            mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
            audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
            audio = audio.half() if is_fp16 else audio
            mel = mel.half() if is_fp16 else mel

            # Forward (analysis) pass, then invert the flows as in the
            # single-file script above.
            outputs = waveglow((mel, audio))
            avg_z += outputs[0].squeeze(0).mean(1).detach().cpu().numpy()
            _count += 1
            z = outputs[0][:, 4:]

            mel_up = waveglow.upsample(mel)
            time_cutoff = (waveglow.upsample.kernel_size[0]
                           - waveglow.upsample.stride[0])
            mel_up = mel_up[:, :, :-time_cutoff]
            mel_up = mel_up.unfold(2, waveglow.n_group,
                                   waveglow.n_group).permute(0, 2, 1, 3)
            mel_up = mel_up.contiguous().view(mel_up.size(0), mel_up.size(1),
                                              -1).permute(0, 2, 1)

            audio = z
            mel_up = mel_up[:, :, :audio.size(2)]
            sigma = 0.7  # overrides the sigma argument
            z_i = 0
            for k in reversed(range(waveglow.n_flows)):
                n_half = int(audio.size(1) / 2)
                audio_0 = audio[:, :n_half, :]
                audio_1 = audio[:, n_half:, :]

                output = waveglow.WN[k]((audio_0, mel_up))
                s = output[:, n_half:, :]
                b = output[:, :n_half, :]
                audio_1 = (audio_1 - b) / torch.exp(s)
                audio = torch.cat([audio_0, audio_1], 1)
                audio = waveglow.convinv[k](audio, reverse=True)

                if k % waveglow.n_early_every == 0 and k > 0:
                    # Re-inject the recorded early-output latents instead of
                    # sampling fresh noise.
                    z = outputs[0][:, 2 - z_i:4 - z_i]
                    audio = torch.cat((sigma * z, audio), 1)
                    z_i += 2  # bug fix: advance to the next early slice

            audio = audio.permute(0, 2, 1).contiguous().view(audio.size(0),
                                                             -1).data
            audio = audio * MAX_WAV_VALUE
            audio = audio.squeeze().cpu().numpy().astype('int16')

            audio_path = os.path.join(
                output_dir, "{}_synthesis.wav".format(file_path[:-4]))
            out_dir = os.path.join(*audio_path.split('/')[:-1])
            if not os.path.exists(out_dir):
                os.makedirs(out_dir, exist_ok=True)
            write(audio_path, sampling_rate, audio)
            print(audio_path)
        except Exception:
            # Skip clips that fail to load or process.
            continue

    avg_z = avg_z / _count
    np.save(style, avg_z)
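# The per-style average latent saved above (np.save writes '<style>.npy')
# could be applied at synthesis time by nudging a clip's latents toward the
# style mean before the reverse pass. Purely illustrative: `alpha` is a
# made-up mixing knob and `z` is the (1, 4, T) tensor from the loop above.
style_mean = np.load('happy.npy')  # shape (8,): one mean per group channel
shift = torch.from_numpy(style_mean).float().cuda().view(1, -1, 1)
alpha = 0.5
z_styled = z + alpha * shift[:, 4:]  # z carries group channels 4..7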
class TTSModel(object):
    """Text-to-speech wrapper around Tacotron2 with a WaveGlow or
    Griffin-Lim vocoder."""

    def __init__(self, tacotron2_path, waveglow_path, **kwargs):
        super(TTSModel, self).__init__()
        hparams = HParams(**kwargs)
        self.hparams = hparams
        self.model = Tacotron2(hparams)
        if torch.cuda.is_available():
            self.model.load_state_dict(
                torch.load(tacotron2_path)["state_dict"])
            self.model.cuda().eval()
        else:
            self.model.load_state_dict(
                torch.load(tacotron2_path, map_location="cpu")["state_dict"])
            self.model.eval()
        self.k_cache = klepto.archives.file_archive(cached=False)
        if waveglow_path:
            if torch.cuda.is_available():
                wave_params = torch.load(waveglow_path)
            else:
                wave_params = torch.load(waveglow_path, map_location="cpu")
            try:
                # The checkpoint may hold a bare state dict...
                self.waveglow = WaveGlow(**WAVEGLOW_CONFIG)
                self.waveglow.load_state_dict(wave_params)
            except Exception:
                # ...or the fully serialized model.
                self.waveglow = wave_params["model"]
            self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
            if torch.cuda.is_available():
                self.waveglow.cuda().eval()
            else:
                self.waveglow.eval()
            # workaround from
            # https://github.com/NVIDIA/waveglow/issues/127
            for m in self.waveglow.modules():
                if "Conv" in str(type(m)):
                    setattr(m, "padding_mode", "zeros")
            for k in self.waveglow.convinv:
                k.float().half()
            self.denoiser = Denoiser(self.waveglow,
                                     n_mel_channels=hparams.n_mel_channels)
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech)
        else:
            # No vocoder checkpoint: fall back to Griffin-Lim.
            self.synth_speech = klepto.safe.inf_cache(cache=self.k_cache)(
                self._synth_speech_fast)
        self.taco_stft = TacotronSTFT(
            hparams.filter_length,
            hparams.hop_length,
            hparams.win_length,
            n_mel_channels=hparams.n_mel_channels,
            sampling_rate=hparams.sampling_rate,
            mel_fmax=4000,
        )

    def _generate_mel_postnet(self, text):
        sequence = np.array(text_to_sequence(text,
                                             ["english_cleaners"]))[None, :]
        if torch.cuda.is_available():
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).cuda().long()
        else:
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).long()
        with torch.no_grad():
            mel_outputs, mel_outputs_postnet, _, alignments = \
                self.model.inference(sequence)
        return mel_outputs_postnet

    def synth_speech_array(self, text, vocoder):
        mel_outputs_postnet = self._generate_mel_postnet(text)

        if vocoder == VOCODER_WAVEGLOW:
            with torch.no_grad():
                audio_t = self.waveglow.infer(mel_outputs_postnet, sigma=0.666)
                audio_t = self.denoiser(audio_t, 0.1)[0]
            audio = audio_t[0].data
        elif vocoder == VOCODER_GL:
            # Invert the mel spectrogram with Griffin-Lim instead of a
            # neural vocoder.
            mel_decompress = self.taco_stft.spectral_de_normalize(
                mel_outputs_postnet)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0],
                                     self.taco_stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            spec_from_mel = (spec_from_mel.cuda()
                             if torch.cuda.is_available() else spec_from_mel)
            audio = griffin_lim(
                torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                self.taco_stft.stft_fn,
                GL_ITERS,
            )
            audio = audio.squeeze()
        else:
            raise ValueError("vocoder arg should be one of [waveglow|gl]")

        audio = audio.cpu().numpy()
        return audio

    def _synth_speech(self, text, speed: float = 1.0,
                      sample_rate: int = OUTPUT_SAMPLE_RATE):
        audio = self.synth_speech_array(text, VOCODER_WAVEGLOW)
        return postprocess_audio(
            audio,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
            tempo=speed,
        )

    def _synth_speech_fast(self, text, speed: float = 1.0,
                           sample_rate: int = OUTPUT_SAMPLE_RATE):
        audio = self.synth_speech_array(text, VOCODER_GL)
        return postprocess_audio(
            audio,
            tempo=speed,
            src_rate=self.hparams.sampling_rate,
            dst_rate=sample_rate,
        )
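# Minimal usage sketch; the checkpoint paths are placeholders and any HParams
# overrides go through **kwargs. `synth_speech` is the klepto-cached
# _synth_speech bound in __init__ above.
tts = TTSModel("tacotron2_statedict.pt", "waveglow_256channels.pt")
pcm = tts.synth_speech("Hello world.", speed=1.0, sample_rate=22050)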
def train(num_gpus, rank, group_name, output_directory, epochs, learning_rate,
          sigma, iters_per_checkpoint, batch_size, seed, fp16_run,
          checkpoint_path, with_tensorboard):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        init_distributed(rank, num_gpus, group_name, **dist_config)
    #=====END: ADDED FOR DISTRIBUTED======

    criterion = WaveGlowLoss(sigma)
    model = WaveGlow(**waveglow_config).cuda()

    #=====START: ADDED FOR DISTRIBUTED======
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)
    #=====END: ADDED FOR DISTRIBUTED======

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    # Load checkpoint if one exists
    iteration = 0
    if checkpoint_path == "waveglow_256channels_.pt":
        # Warm-start from the published pretrained weights (fresh optimizer).
        checkpoint_dict = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint_dict['model'].state_dict())
        iteration += 1  # next iteration is iteration + 1
    elif checkpoint_path != "":
        model, optimizer, iteration = load_checkpoint(checkpoint_path, model,
                                                      optimizer)
        iteration += 1  # next iteration is iteration + 1

    trainset = Mel2Samp(**data_config)
    # =====START: ADDED FOR DISTRIBUTED======
    train_sampler = DistributedSampler(trainset) if num_gpus > 1 else None
    # =====END: ADDED FOR DISTRIBUTED======
    train_loader = DataLoader(trainset, num_workers=1, shuffle=False,
                              sampler=train_sampler,
                              batch_size=batch_size,
                              pin_memory=False,
                              drop_last=True)

    # Get shared output_directory ready
    if rank == 0:
        if not os.path.isdir(output_directory):
            os.makedirs(output_directory)
            os.chmod(output_directory, 0o775)
        print("output directory", output_directory)

    if with_tensorboard and rank == 0:
        from tensorboardX import SummaryWriter
        logger = SummaryWriter(os.path.join(output_directory, 'logs'))

    model.train()
    epoch_offset = max(0, int(iteration / len(train_loader)))
    # ================ MAIN TRAINING LOOP ===================
    for epoch in range(epoch_offset, epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            model.zero_grad()

            mel, audio = batch
            mel = torch.autograd.Variable(mel.cuda())
            audio = torch.autograd.Variable(audio.cuda())
            outputs = model((mel, audio))

            loss = criterion(outputs)
            if num_gpus > 1:
                reduced_loss = reduce_tensor(loss.data, num_gpus).item()
            else:
                reduced_loss = loss.item()

            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()

            print("{}:\t{:.9f}".format(iteration, reduced_loss))
            if with_tensorboard and rank == 0:
                logger.add_scalar('training_loss', reduced_loss,
                                  i + len(train_loader) * epoch)

            if (iteration % iters_per_checkpoint == 0):
                if rank == 0:
                    checkpoint_path = "{}/waveglow_{}".format(
                        output_directory, iteration)
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
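# train() reads waveglow_config, data_config, and dist_config from
# module-level globals, typically parsed from the same config.json the
# inference scripts open. A plausible shape, following the upstream NVIDIA
# WaveGlow repo (values illustrative; data_config omitted for brevity):
config = {
    "dist_config": {"dist_backend": "nccl",
                    "dist_url": "tcp://localhost:54321"},
    "waveglow_config": {"n_mel_channels": 80,
                        "n_flows": 12,
                        "n_group": 8,
                        "n_early_every": 4,
                        "n_early_size": 2,
                        "WN_config": {"n_layers": 8,
                                      "n_channels": 256,
                                      "kernel_size": 3}},
}
waveglow_config = config["waveglow_config"]
dist_config = config["dist_config"]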
def main(style, waveglow_path, sigma, output_dir, sampling_rate, is_fp16,
         denoiser_strength, args):
    dataset = voice_dataset(dataBase={'ravdess': './our_data/ravdess',
                                      'cremad': './our_data/cremad'},
                            style=('happy', 'sad', 'angry'))
    styles = ['happy', 'sad', 'angry']

    with open('config.json') as f:
        config = json.loads(f.read())
    waveglow_config = config["waveglow_config"]

    model = WaveGlow(**waveglow_config)
    checkpoint_dict = torch.load('waveglow_256channels_universal_v5.pt',
                                 map_location='cpu')
    model.load_state_dict(checkpoint_dict['model'].state_dict())
    model.cuda()
    waveglow = model

    if is_fp16:
        from apex import amp
        waveglow, _ = amp.initialize(waveglow, [], opt_level="O1")

    if denoiser_strength > 0:
        denoiser = Denoiser(waveglow).cuda()

    mel_extractor = Get_mel(1024, 256, 1024, args.sampling_rate, 0.0, 8000.0)

    # Collect per-clip mean latent vectors for up to 200 random clips
    # of each style.
    vector_all = {}
    for style in styles:
        files = dataset.final_data[style].copy()
        random.shuffle(files)
        vectors = []
        for i, (_, file_path) in enumerate(files):
            if i > 200:
                break
            try:
                audio, rate = load_wav_to_torch(file_path)
                if rate != sampling_rate:
                    audio = resampy.resample(audio.numpy(), rate,
                                             sampling_rate)
                    audio = torch.from_numpy(audio).float()

                mel = mel_extractor.get_mel(audio)
                audio = audio / MAX_WAV_VALUE
                mel = torch.autograd.Variable(mel.cuda().unsqueeze(0))
                audio = torch.autograd.Variable(audio.cuda().unsqueeze(0))
                audio = audio.half() if is_fp16 else audio
                mel = mel.half() if is_fp16 else mel

                # Forward (analysis) pass only; record the mean latent.
                outputs = waveglow((mel, audio))
                vectors.append(
                    outputs[0].squeeze(0).mean(1).detach().cpu().numpy())
                print(style, i)
            except Exception:
                continue
        vector_all[style] = vectors
    np.save('all_style_vector', vector_all)
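# np.save was handed a dict, so reading it back needs allow_pickle=True and
# .item() to recover the mapping (np.save appends the .npy suffix):
vector_all = np.load('all_style_vector.npy', allow_pickle=True).item()
happy_mean = np.stack(vector_all['happy']).mean(axis=0)  # per-channel style mean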
def save_checkpoint(model, optimizer, learning_rate, iteration, filepath,
                    drive_fid):
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    model_for_saving = WaveGlow(**waveglow_config).cuda()
    model_for_saving.load_state_dict(model.state_dict())
    torch.save({'model': model_for_saving,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, filepath)

    # Mirror the checkpoint to Google Drive, retrying up to 10 times.
    uploaded = False
    attempt = 0
    file_title = filepath[filepath.find("/") + 1:]
    while not uploaded and attempt < 10:
        attempt += 1
        try:
            if gauth.credentials is None:
                # Authenticate if credentials are missing
                gauth.LocalWebserverAuth()
            elif gauth.access_token_expired:
                # Refresh them if expired
                print("Google Drive Token Expired, Refreshing")
                gauth.Refresh()
            else:
                # Initialize the saved creds
                gauth.Authorize()
            # Save the current credentials to a file
            # gauth.SaveCredentialsFile("GoogleDriveCredentials.txt")
            f = drive.CreateFile({'title': file_title,
                                  "parents": [{"kind": "drive#fileLink",
                                               "id": drive_fid}]})
            f.SetContentFile(filepath)
            f.Upload()
            uploaded = True
            break
        except Exception:
            print("Failed uploading to drive at attempt #{}".format(attempt))
            sleep(30)

    if uploaded:
        try:
            ok = False
            for file in drive.ListFile(
                    {'q': "'" + drive_fid + "' in parents"}).GetList():
                if file['title'] == file_title:
                    # Drive reports fileSize as a string; compare numerically.
                    if int(file["fileSize"]) > 4000000:
                        ok = True
                        print("File was successfully uploaded")
                    else:
                        file.Delete()
                        uploaded = False
                        print("File was not uploaded normally. Deleting")
                        sleep(30)
                    break
            if ok:
                # Remove older checkpoints so only the newest stays on Drive.
                for file in drive.ListFile(
                        {'q': "'" + drive_fid + "' in parents"}).GetList():
                    if file['title'] != file_title:
                        file.Delete()
                        sleep(30)  # make sure the file is deleted from Drive first
        except Exception:
            pass
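# This function relies on module-level `gauth` and `drive` objects. With
# PyDrive they would be created roughly as below (a client_secrets.json in
# the working directory is assumed for the OAuth flow):
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from time import sleep

gauth = GoogleAuth()
drive = GoogleDrive(gauth)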