def main():
    """Build the four data loaders and the model, train, then print test results."""
    loader_english = DataLoader('data/sentences/', 25, (32, 256))
    loader_german = DataLoader('data/GT4HistOCR/', 25, (32, 256))
    loader_english_gen = DataLoader('data/english_generated/', 25, (32, 256))
    loader_german_gen = DataLoader('data/something/', 25, (32, 256))
    net = Model()
    train(net, loader_english, loader_german, loader_english_gen, loader_german_gen)
    result = test(net, loader_english, loader_german, loader_english_gen, loader_german_gen)
    print(result)
def get_vhrd_embeddings(self, data_fname, mode, save_name, use_saved_embeddings):
    """Return (x, y) built from pretrainer embeddings, caching them on disk.

    If ``use_saved_embeddings`` is true and ``save_name`` exists, the pickled
    embeddings are loaded from disk; otherwise the raw data is loaded via
    DataLoader, embedded with ``self.pretrainer``, and pickled to ``save_name``
    for next time.

    Args:
        data_fname: path passed to DataLoader when no cache is used.
        mode: DataLoader mode flag (semantics defined by DataLoader).
        save_name: path of the pickle cache file.
        use_saved_embeddings: whether a pre-existing cache may be reused.

    Returns:
        Tuple (x, y) as produced by ``self._build_data``.

    Raises:
        AssertionError: if embeddings must be computed but no pretrainer is set.
    """
    if use_saved_embeddings and os.path.exists(save_name):
        # Cache hit: reuse previously computed embeddings.
        with open(save_name, 'rb') as handle:
            data = cPickle.load(handle)
    else:
        data_loader = DataLoader(data_fname, mode)
        data = data_loader.load_data()
        # Idiomatic identity check (was `not self.pretrainer is None`).
        assert self.pretrainer is not None
        data = self.pretrainer.get_embeddings(data)
        with open(save_name, 'wb') as handle:
            cPickle.dump(data, handle)
    x, y = self._build_data(data)
    return x, y
def init_classes():
    """Resolve the split file paths from globals and publish the class list."""
    paths = (
        gol.get_val("trainFile"),
        gol.get_val("validationFile"),
        gol.get_val("testFile"),
    )
    class_list = DataLoader.loadClasses(*paths)  # a list
    gol.set_val("classes", class_list)
def main():
    """Train the post-net (mel -> linear magnitude) model.

    Builds the dataset and a DataParallel ModelPostNet, then runs MSE training
    with learning-rate warm-down, TensorBoard logging, gradient clipping, and
    periodic checkpointing.
    """
    dataset = get_post_dataset()
    global_step = 0

    m = nn.DataParallel(ModelPostNet().cuda())
    m.train()
    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)
    writer = SummaryWriter()
    # Hoisted: the criterion is stateless, no need to rebuild it every step.
    criterion = nn.MSELoss()

    for epoch in range(hp.epochs):
        dataloader = DataLoader(dataset,
                                batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_postnet,
                                drop_last=True,
                                num_workers=0)
        pbar = tqdm(dataloader)
        for i, data in enumerate(pbar):
            pbar.set_description("Processing at epoch %d" % epoch)
            global_step += 1
            # LR schedule only applies during the first 400k steps.
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)

            mel, mag = data
            mel = mel.cuda()
            mag = mag.cuda()

            mag_pred = m.forward(mel)
            loss = criterion(mag_pred, mag)

            if global_step % 10 == 0:
                print('total_loss==', loss.item())
            # Log a detached float, not the live tensor, so the autograd graph
            # is not retained by the writer.
            writer.add_scalars('training_loss', {
                'loss': loss.item(),
            }, global_step)

            optimizer.zero_grad()
            # Calculate gradients
            loss.backward()
            nn.utils.clip_grad_norm_(m.parameters(), 1.)
            # Update weights
            optimizer.step()

            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(hp.checkpoint_path,
                                 'checkpoint_postnet_%d.pth.tar' % global_step))
def main():
    """Train the (module-level) transformer `modelo` with TensorBoard logging.

    Fix: the checkpoint condition was `epoch + 1 % hp.save_step == 0`, which
    parses as `epoch + (1 % hp.save_step)` because `%` binds tighter than `+`,
    so for any save_step > 1 the condition was never true and no checkpoint was
    ever written. It is now `(epoch + 1) % hp.save_step == 0`.
    """
    dataset = get_dataset()
    modelo.train()
    # NOTE(review): "tranformer" typo kept — it is a runtime log-dir path.
    writer = SummaryWriter("runs/tranformer")
    estep = 0
    for epoch in range(NUM_EPOCHS):
        dataloader = DataLoader(dataset,
                                batch_size=hp.batch_size,
                                collate_fn=collate_fn_transformer,
                                drop_last=True,
                                shuffle=True)
        pbar = tqdm(dataloader)
        losses = 0
        for i, data in enumerate(pbar):
            estep = estep + 1
            pbar.set_description("Processing at epoch %d" % epoch)
            character, mel_input, pos_text, pos_mel, _ = data
            character = character.to(DEVICE)
            mel_input = mel_input.to(DEVICE)
            pos_text = pos_text.to(DEVICE)
            pos_mel = pos_mel.to(DEVICE)
            output = modelo(character, mel_input, pos_text, pos_mel)
            # Log the model graph once, on the very first step.
            if estep == 1:
                writer.add_graph(
                    modelo,
                    input_to_model=[character, mel_input, pos_text, pos_mel])
            optimizer.zero_grad()
            loss = loss_fn(output.reshape(-1, output.shape[-1]),
                           character.reshape(-1))
            output = output.transpose(0, 1)
            loss2 = loss.item()
            writer.add_scalar("loss :", loss2, estep)
            print("loss..........." + str(loss2))
            loss.backward()
            optimizer.step()
            losses += loss.item()
        writer.add_scalar("loss2 :", losses, epoch)
        # BUG FIX: parenthesize so checkpoints are actually saved every
        # hp.save_step epochs ('%' has higher precedence than '+').
        if (epoch + 1) % hp.save_step == 0:
            t.save(
                {
                    'model': modelo.state_dict(),
                    'optimizer': optimizer.state_dict()
                },
                os.path.join(hp.checkpoint_path,
                             'checkpoint_transformer_%d.pth.tar' % epoch))
    writer.close()
def train():
    """Start the training procedure

    Loads the fetal-health CSV, standardizes the baseline column, builds a
    small MLP and runs SGD training on the training split.
    """
    num_epochs = 1
    learning_rate = 0.05
    batch_size = 8

    loader = DataLoader(os.path.join("data", "fetal_health.csv"))
    loader.standardize_column("baseline value")
    x_train, y_train = loader.load_data(subset="train")
    x_valid, y_valid = loader.load_data(subset="valid")

    num_classes = len(np.unique(y_train))
    num_samples, num_features = x_train.shape

    # Sanity checks on the split shapes.
    assert x_train.shape[1] == x_valid.shape[1], "Number of features should be equal!"
    assert x_train.shape[0] == y_train.shape[0], "Number of training samples should be equal!"
    assert x_valid.shape[0] == y_valid.shape[0], "Number of validation samples should be equal!"

    dev = get_default_device()
    tx = tensor.Tensor((num_samples, num_features), dev, tensor.float32)
    ty = tensor.Tensor((num_samples, ), dev, tensor.int32)

    sgd = opt.SGD(learning_rate)
    model = create_MLP_model(perceptron_size=10, num_classes=num_classes)
    model.set_optimizer(sgd)
    model.compile([tx], is_train=True, use_graph=True, sequential=False)
    model.train()

    for epoch_idx in range(num_epochs):
        tx.copy_from_numpy(x_train.astype(np.float32))
        ty.copy_from_numpy(y_train.astype(np.int32))
        out, loss = model(tx, ty, 'fp32', spars=None)
        # TODO: Add metric evaluation on validation data
        if epoch_idx % 10 == 0:
            print("training loss = {:.3f}".format(tensor.to_numpy(loss)[0]))
def init_dataset():
    """Load all three dataset splits and publish them through the `gol` globals."""
    train_file = gol.get_val("trainFile")
    test_file = gol.get_val("testFile")
    validation_file = gol.get_val("validationFile")
    (train_x, train_y,
     val_x, val_y,
     test_x, test_y) = DataLoader.loadDataset(train_file, validation_file, test_file)
    # NOTE(review): these two are computed but never published/used here —
    # kept for parity with the original; confirm whether they should be set.
    class_num = len(np.unique(train_y))
    length = len(train_y) + len(val_y) + len(test_y)
    for key, value in (("Train_X", train_x), ("Train_Y", train_y),
                       ("validation_X", val_x), ("validation_Y", val_y),
                       ("Test_X", test_x), ("Test_Y", test_y)):
        gol.set_val(key, value)
def synthesis(args):
    """Synthesize wavs for every utterance in the test CSV.

    Restores the transformer, post-net and stop-token checkpoints, decodes
    mel frames autoregressively (stop decided by the stop-token model),
    converts the post-net magnitude output to a waveform, and saves wav files
    plus attention-alignment figures. Reference (style) utterances come from
    a shuffled loader over the same CSV.
    """
    # Build the three sub-models and restore their checkpoints.
    m = Model()
    m_post = ModelPostNet()
    m_stop = ModelStopToken()
    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m_stop.load_state_dict(load_checkpoint(args.restore_step3, "stop_token"))
    m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))
    m=m.cuda()
    m_post = m_post.cuda()
    m_stop = m_stop.cuda()
    # Inference mode: disable dropout / training-only behavior.
    m.train(False)
    m_post.train(False)
    m_stop.train(False)
    test_dataset = get_dataset(hp.test_data_csv)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False,
                                 collate_fn=collate_fn_transformer,
                                 drop_last=True, num_workers=1)
    ref_dataset = get_dataset(hp.test_data_csv)
    # Shuffled, so each test utterance is paired with a random reference.
    ref_dataloader = DataLoader(ref_dataset, batch_size=1, shuffle=True,
                                collate_fn=collate_fn_transformer,
                                drop_last=True, num_workers=1)
    writer = get_writer(hp.checkpoint_path, hp.log_directory)
    ref_dataloader_iter = iter(ref_dataloader)
    for i, data in enumerate(test_dataloader):
        character, mel, mel_input, pos_text, pos_mel, text_length, mel_length, fname = data
        ref_character, ref_mel, ref_mel_input, ref_pos_text, ref_pos_mel, ref_text_length, ref_mel_length, ref_fname = next(ref_dataloader_iter)
        stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)
        # Decoding starts from a single all-zero mel frame (GO frame).
        mel_input = t.zeros([1,1,80]).cuda()
        stop=[]
        character = character.cuda()
        mel = mel.cuda()
        mel_input = mel_input.cuda()
        pos_text = pos_text.cuda()
        pos_mel = pos_mel.cuda()
        ref_character = ref_character.cuda()
        ref_mel = ref_mel.cuda()
        ref_mel_input = ref_mel_input.cuda()
        ref_pos_text = ref_pos_text.cuda()
        ref_pos_mel = ref_pos_mel.cuda()
        with t.no_grad():
            start=time.time()
            # Autoregressive frame-by-frame decoding, capped at args.max_len.
            for i in range(args.max_len):
                pos_mel = t.arange(1,mel_input.size(1)+1).unsqueeze(0).cuda()
                mel_pred, postnet_pred, attn_probs, decoder_output, attns_enc, attns_dec, attns_style = m.forward(character, mel_input, pos_text, pos_mel, ref_mel, ref_pos_mel)
                stop_token = m_stop.forward(decoder_output)
                # Append the newly predicted frame and re-run on the longer input.
                mel_input = t.cat([mel_input, postnet_pred[:,-1:,:]], dim=1)
                stop.append(t.sigmoid(stop_token).squeeze(-1)[0,-1])
                if stop[-1] > 0.5:
                    print("stop token at " + str(i) + " is :", stop[-1])
                    print("model inference time: ", time.time() - start)
                    break
            # No stop signal was ever emitted: skip this utterance.
            if stop[-1] == 0:
                continue
            mag_pred = m_post.forward(postnet_pred)
            inf_time = time.time() - start
            print("inference time: ", inf_time)
        wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
        print("rtx : ", (len(wav)/hp.sr) / inf_time)
        wav_path = os.path.join(hp.sample_path, 'wav')
        if not os.path.exists(wav_path):
            os.makedirs(wav_path)
        write(os.path.join(wav_path, "text_{}_ref_{}_synth.wav".format(fname, ref_fname)), hp.sr, wav)
        print("written as text{}_ref_{}_synth.wav".format(fname, ref_fname))
        # Stack the per-layer attention tensors into single tensors for plotting.
        attns_enc_new=[]
        attns_dec_new=[]
        attn_probs_new=[]
        attns_style_new=[]
        for i in range(len(attns_enc)):
            attns_enc_new.append(attns_enc[i].unsqueeze(0))
            attns_dec_new.append(attns_dec[i].unsqueeze(0))
            attn_probs_new.append(attn_probs[i].unsqueeze(0))
            attns_style_new.append(attns_style[i].unsqueeze(0))
        attns_enc = t.cat(attns_enc_new, 0)
        attns_dec = t.cat(attns_dec_new, 0)
        attn_probs = t.cat(attn_probs_new, 0)
        attns_style = t.cat(attns_style_new, 0)
        # Reshape to (1, layers, n_heads, T_q, T_k) for the alignment writer.
        attns_enc = attns_enc.contiguous().view(attns_enc.size(0), 1, hp.n_heads, attns_enc.size(2), attns_enc.size(3))
        attns_enc = attns_enc.permute(1,0,2,3,4)
        attns_dec = attns_dec.contiguous().view(attns_dec.size(0), 1, hp.n_heads, attns_dec.size(2), attns_dec.size(3))
        attns_dec = attns_dec.permute(1,0,2,3,4)
        attn_probs = attn_probs.contiguous().view(attn_probs.size(0), 1, hp.n_heads, attn_probs.size(2), attn_probs.size(3))
        attn_probs = attn_probs.permute(1,0,2,3,4)
        attns_style = attns_style.contiguous().view(attns_style.size(0), 1, hp.n_heads, attns_style.size(2), attns_style.size(3))
        attns_style = attns_style.permute(1,0,2,3,4)
        save_dir = os.path.join(hp.sample_path, 'figure', "text_{}_ref_{}_synth.wav".format(fname, ref_fname))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        writer.add_alignments(attns_enc.detach().cpu(), attns_dec.detach().cpu(),
                              attn_probs.detach().cpu(), attns_style.detach().cpu(),
                              mel_length, text_length, args.restore_step1,
                              'Validation', save_dir)
def main():
    """Train the duration-based transformer TTS model.

    Optionally restores from hp.restore_step, wraps the model in DataParallel,
    loads a frozen SmartVocoder for validation synthesis, and runs training
    with masked L1 losses (mel, post-mel, post-linear) plus a duration loss,
    gradient accumulation (hp.accum), periodic validation and checkpointing.
    """
    train_dataset = get_dataset(hp.train_data_csv)
    val_dataset = get_dataset(hp.val_data_csv)
    restore_step = hp.restore_step
    global_step = restore_step
    if restore_step != 0:
        restore_flag = True
    else:
        restore_flag = False
    m = Model()
    if os.path.exists('./checkpoints/checkpoint_%s_%d.pth.tar' % ('transformer', global_step)):
        state_dict = t.load('./checkpoints/checkpoint_%s_%d.pth.tar' % ('transformer', global_step))
        new_state_dict = OrderedDict()
        # Strip the 'module.' prefix added by DataParallel when saving.
        for k, value in state_dict['model'].items():
            key = k[7:]
            new_state_dict[key] = value
        m.load_state_dict(new_state_dict)
    m = nn.DataParallel(m.cuda())
    m.train()
    # Frozen vocoder, used only inside validate() to render audio samples.
    vocoder = SmartVocoder(Hyperparameters(parse_args()))
    vocoder.load_state_dict(
        t.load('./mel2audio/checkpoint_step000588458.pth')["state_dict"])
    vocoder = vocoder.cuda()
    vocoder.eval()
    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)
    writer = get_writer(hp.checkpoint_path, hp.log_directory)
    cur_epoch = 0
    for epochs in range(hp.epochs):
        train_dataloader = DataLoader(train_dataset, batch_size=hp.batch_size,
                                      shuffle=True,
                                      collate_fn=collate_fn_transformer,
                                      drop_last=True, num_workers=1)
        val_dataloader = DataLoader(val_dataset, batch_size=hp.batch_size,
                                    shuffle=True,
                                    collate_fn=collate_fn_transformer,
                                    drop_last=True)
        if restore_flag:
            # Recover the epoch counter implied by the restored step, once.
            cur_epoch = int(restore_step / len(train_dataloader))
            restore_flag = not restore_flag
        for i, data in enumerate(train_dataloader):
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)
            character, mel, mag, mel_input, pos_text, pos_mel, text_length, mel_length, fname = data
            # Per-sample copy of the batch max mel length (consumed by forward).
            mel_max_length_array = t.zeros(mel_length.size(0)).long()
            mel_max_length_array = t.LongTensor(mel_max_length_array)
            mel_max_length_array[:] = t.max(mel_length)
            mel_max_length_array = mel_max_length_array.cuda()
            character = character.cuda()
            mel = mel.cuda()
            mag = mag.cuda()
            mel_input = mel_input.cuda()
            pos_text = pos_text.cuda()
            pos_mel = pos_mel.cuda()
            text_length = text_length.cuda()
            mel_length = mel_length.cuda()
            loading_time = time.time()
            mask = get_mask_from_lengths(mel_length).cuda()
            mel_pred, postnet_pred, attn_probs, decoder_outputs, attns_enc, attns_dec, attns_style, post_linear, duration_predictor_output, duration, weights = m.forward(
                character, mel_input, pos_text, pos_mel, mel, pos_mel,
                mel_max_length_array=mel_max_length_array)
            # Masked L1 losses: padding frames are excluded via mask.
            mel_loss = t.mean(
                t.abs(mel_pred - mel).masked_select(mask.unsqueeze(-1)))
            post_mel_loss = t.mean(
                t.abs(postnet_pred - mel).masked_select(mask.unsqueeze(-1)))
            # Bins below 2 kHz get an extra 0.5 weight (priority frequencies).
            n_priority_freq = int(2000 / (hp.sr * 0.5) * (hp.n_fft / 2 + 1))
            post_linear_loss = 0.5 * t.mean(
                t.abs(post_linear - mag).masked_select(mask.unsqueeze(-1))
            ) + 0.5 * t.mean(
                t.abs(post_linear - mag)[:, :, :n_priority_freq].masked_select(
                    mask.unsqueeze(-1)))
            duration_loss = nn.L1Loss()(t.sum(
                duration_predictor_output, -1,
                keepdim=True), mel_length) / t.sum(text_length)
            # Scaled for gradient accumulation over hp.accum steps.
            loss = (mel_loss + post_mel_loss + 0.3 * post_linear_loss +
                    duration_loss) / hp.accum
            # NOTE(review): the last two args are live tensors, unlike the
            # first two (.item()) — confirm add_losses detaches them.
            writer.add_losses(mel_loss.item(), post_mel_loss.item(),
                              0.3 * post_linear_loss, duration_loss,
                              global_step, 'Train')
            # Calculate gradients
            loss.backward()
            msg = "| Epoch: {}, {}/{}th loss : {:.4f} + {:.4f} + {:.4f} + {:.4f} = {:.4f}".format(
                cur_epoch, i, len(train_dataloader), mel_loss, post_mel_loss,
                0.3 * post_linear_loss, duration_loss, loss)
            stream(msg)
            # Step/zero only every hp.accum mini-batches (accumulation).
            if global_step % hp.accum == 0:
                nn.utils.clip_grad_norm_(m.parameters(), 1.)
                # Update weights
                optimizer.step()
                optimizer.zero_grad()
            if global_step % hp.val_step == 0 or global_step == 1:
                validate(m, vocoder, val_dataloader, global_step, writer)
            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_transformer_%d.pth.tar' % global_step))
        if cur_epoch == hp.stop_epoch:
            break
        cur_epoch += 1
        print(' ')
# Top-level script: restore a trained transformer checkpoint and iterate the
# dataset to dump attention alignments into ./alignments.
if not os.path.exists('alignments'):
    os.mkdir('alignments')
check_point = './checkpoint/checkpoint_transformer_820000.pth.tar'
# Remap tensors saved on cuda:5 onto cuda:0 so the checkpoint loads here.
para_file = t.load(check_point, map_location={'cuda:5': 'cuda:0'})
model = nn.DataParallel(Model().cuda())
model.load_state_dict(para_file['model'])
model.eval()
for epoch in range(1):
    dataset = get_dataset()
    dataloader = DataLoader(dataset, batch_size=1, shuffle=False,
                            collate_fn=collate_fn_transformer,
                            drop_last=False, num_workers=1)
    k = 0
    # pbar = tqdm(dataloader)
    # for i, data in enumerate(pbar):
    for character, mel, mel_input, pos_text, pos_mel, _ in dataloader:
        # pbar.set_description("Processing at epoch %d"%epoch)
        # character, mel, mel_input, pos_text, pos_mel, _ = data
        stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)
        character = character.cuda()
        mel = mel.cuda()
        mel_input = mel_input.cuda()
        # NOTE(review): the loop body appears truncated in this chunk — the
        # forward pass and alignment saving presumably follow; confirm
        # against the full file.
def main():
    """Train the style-conditioned transformer TTS model.

    Optionally restores from hp.restore_step, wraps the model in DataParallel,
    and trains with L1 mel / post-mel losses, gradient accumulation
    (hp.accum), periodic validation and checkpointing.
    """
    train_dataset = get_dataset(hp.train_data_csv)
    val_dataset = get_dataset(hp.val_data_csv)
    restore_step = hp.restore_step
    global_step = restore_step
    if restore_step != 0:
        restore_flag = True
    else:
        restore_flag = False
    m = Model()
    if os.path.exists('./checkpoints/checkpoint_%s_%d.pth.tar' % ('transformer', global_step)):
        state_dict = t.load('./checkpoints/checkpoint_%s_%d.pth.tar' % ('transformer', global_step))
        new_state_dict = OrderedDict()
        # Strip the 'module.' prefix added by DataParallel when saving.
        for k, value in state_dict['model'].items():
            key = k[7:]
            new_state_dict[key] = value
        m.load_state_dict(new_state_dict)
    m = nn.DataParallel(m.cuda())
    m.train()
    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)
    writer = get_writer(hp.checkpoint_path, hp.log_directory)
    cur_epoch = 0
    for epochs in range(hp.epochs):
        train_dataloader = DataLoader(train_dataset, batch_size=hp.batch_size,
                                      shuffle=True,
                                      collate_fn=collate_fn_transformer,
                                      drop_last=True, num_workers=1)
        val_dataloader = DataLoader(val_dataset, batch_size=hp.batch_size,
                                    shuffle=True,
                                    collate_fn=collate_fn_transformer,
                                    drop_last=True)
        if restore_flag:
            # Recover the epoch counter implied by the restored step, once.
            cur_epoch = int(restore_step / len(train_dataloader))
            restore_flag = not restore_flag
        for i, data in enumerate(train_dataloader):
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)
            character, mel, mel_input, pos_text, pos_mel, text_length, mel_length, fname = data
            character = character.cuda()
            mel = mel.cuda()
            mel_input = mel_input.cuda()
            pos_text = pos_text.cuda()
            pos_mel = pos_mel.cuda()
            text_length = text_length.cuda()
            mel_length = mel_length.cuda()
            loading_time = time.time()
            mel_pred, postnet_pred, attn_probs, decoder_output, attns_enc, attns_dec, attns_style = m.forward(
                character, mel_input, pos_text, pos_mel, mel, pos_mel)
            mel_loss = nn.L1Loss()(mel_pred, mel)
            post_mel_loss = nn.L1Loss()(postnet_pred, mel)
            # Scaled for gradient accumulation over hp.accum steps.
            loss = (mel_loss + post_mel_loss) / hp.accum
            writer.add_losses(mel_loss.item(), post_mel_loss.item(),
                              global_step, 'Train')
            # Calculate gradients
            loss.backward()
            msg = "| Epoch: {}, {}/{}th loss : {:.4f} + {:.4f} = {:.4f}".format(
                cur_epoch, i, len(train_dataloader), mel_loss, post_mel_loss,
                loss)
            stream(msg)
            # Step/zero only every hp.accum mini-batches (accumulation).
            if global_step % hp.accum == 0:
                nn.utils.clip_grad_norm_(m.parameters(), 1.)
                # Update weights
                optimizer.step()
                optimizer.zero_grad()
            if global_step % hp.val_step == 0 or global_step == 1:
                validate(m, val_dataloader, global_step, writer)
            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_transformer_%d.pth.tar' % global_step))
        if cur_epoch == hp.stop_epoch:
            break
        cur_epoch += 1
        print(' ')
def synthesis(args):
    """Synthesize speech with duration-regulated decoding and three vocoders.

    Restores the transformer, a SmartVocoder and a HiFi-GAN generator, predicts
    phoneme durations for length regulation, decodes mel frames with an
    expanding key/value mask, then renders audio three ways (mel->SmartVocoder,
    linear->mel->SmartVocoder, linear->mel->HiFi-GAN) and writes mels, linears,
    wavs and attention figures. args.rhythm_scale resamples the output tempo.
    """
    m = Model()
    m.load_state_dict(load_checkpoint(args.restore_step1, "transformer"))
    m = m.cuda()
    m.train(False)
    vocoder = SmartVocoder(Hyperparameters(parse_args()))
    vocoder.load_state_dict(
        t.load('./mel2audio/merged_STFT_checkpoint.pth')["state_dict"])
    vocoder = vocoder.cuda()
    vocoder.eval()
    # HiFi-GAN generator, configured from its JSON config.
    with open('./hifi_gan/config.json') as f:
        data = f.read()
    json_config = json.loads(data)
    h = AttrDict(json_config)
    hifi_gan = Generator(h).cuda()
    state_dict_g = t.load('./hifi_gan/g_00334000', map_location='cuda')
    hifi_gan.load_state_dict(state_dict_g['generator'])
    hifi_gan.eval()
    hifi_gan.remove_weight_norm()
    test_dataset = get_dataset(hp.test_data_csv)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False,
                                 collate_fn=collate_fn_transformer,
                                 drop_last=True, num_workers=1)
    ref_dataset = get_dataset(hp.test_data_csv_shuf)
    ref_dataloader = DataLoader(ref_dataset, batch_size=1, shuffle=False,
                                collate_fn=collate_fn_transformer,
                                drop_last=True, num_workers=1)
    writer = get_writer(hp.checkpoint_path, hp.log_directory)
    mel_basis = t.from_numpy(
        librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels, 50,
                            11000)).unsqueeze(0)  # (n_mels, 1+n_fft//2)
    ref_dataloader_iter = iter(ref_dataloader)
    # One fixed reference utterance conditions every synthesized sentence.
    _, ref_mel, _, _, _, ref_pos_mel, _, _, ref_fname = next(
        ref_dataloader_iter)
    for i, data in enumerate(test_dataloader):
        character, _, _, _, pos_text, _, text_length, _, fname = data
        # Decoding starts from a single all-zero mel frame (GO frame).
        mel_input = t.zeros([1, 1, 80]).cuda()
        character = character.cuda()
        ref_mel = ref_mel.cuda()
        mel_input = mel_input.cuda()
        pos_text = pos_text.cuda()
        with t.no_grad():
            start = time.time()
            # Encode text; fuse the reference coarse embedding into memory.
            memory, c_mask, attns_enc, duration_mask = m.encoder(character,
                                                                 pos=pos_text)
            style, coarse_emb = m.ref_encoder(ref_mel)
            memory = t.cat((memory, coarse_emb.expand(-1, memory.size(1), -1)),
                           -1)
            memory = m.memory_coarse_layer(memory)
            # Predict per-token durations and regulate length with them.
            duration_predictor_output = m.duration_predictor(
                memory, duration_mask)
            duration = t.ceil(duration_predictor_output)
            duration = duration * duration_mask
            monotonic_interpolation, pos_mel_, weights = m.length_regulator(
                memory, duration, duration_mask)
            # Attention key/value mask, initially open on the first 3 tokens.
            kv_mask = t.zeros([1, mel_input.size(1), character.size(1)]).cuda()  # B, t', N
            kv_mask[:, :, :3] = 1
            kv_mask = kv_mask.eq(0)
            stop_flag = False
            ctr = 0
            # Autoregressive decoding, hard cap of 1200 frames; after the stop
            # condition triggers, decode 10 extra frames before breaking.
            for j in range(1200):
                pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
                mel_pred, postnet_pred, attn_probs, decoder_output, attns_dec, attns_style = m.decoder(
                    memory, style, mel_input, c_mask, pos=pos_mel,
                    ref_pos=ref_pos_mel,
                    mono_inter=monotonic_interpolation[:, :mel_input.shape[1]],
                    kv_mask=kv_mask)
                mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)
                if stop_flag and ctr == 10:
                    break
                elif stop_flag:
                    ctr += 1
                kv_mask, stop_flag = update_kv_mask(
                    kv_mask, attn_probs)  # B, t', N --> B, t'+1, N
            # Pad 5 silent frames, then map to linear and resample the tempo.
            postnet_pred = t.cat((postnet_pred,
                                  t.zeros(postnet_pred.size(0), 5,
                                          postnet_pred.size(-1)).cuda()), 1)
            gen_length = mel_input.size(1)
            post_linear = m.postnet(postnet_pred)
            post_linear = resample(post_linear, seq_len=mel_input.size(1),
                                   scale=args.rhythm_scale)
            postnet_pred = resample(mel_input, seq_len=mel_input.size(1),
                                    scale=args.rhythm_scale)
            inf_time = time.time() - start
            print("inference time: ", inf_time)
            # --- Render 1: mel -> SmartVocoder (flow-based, noise input z) ---
            postnet_pred_v = postnet_pred.transpose(2, 1)
            postnet_pred_v = (postnet_pred_v * 100 + 20 - 100) / 20
            B, C, T = postnet_pred_v.shape
            z = t.randn(1, 1, T * hp.hop_length).cuda()
            z = z * 0.6  # Temp
            t.cuda.synchronize()
            timestemp = time.time()
            with t.no_grad():
                y_gen = vocoder.reverse(z, postnet_pred_v).squeeze()
            t.cuda.synchronize()
            print('{} seconds'.format(time.time() - timestemp))
            wav = y_gen.to(t.device("cpu")).data.numpy()
            wav = np.pad(wav, [0, 4800], mode='constant',
                         constant_values=0)  # pad 0 for 0.21 sec silence at the end
            # --- Render 2: linear -> mel (via mel filterbank) -> SmartVocoder ---
            post_linear_v = post_linear.transpose(1, 2)
            post_linear_v = 10**((post_linear_v * 100 + 20 - 100) / 20)
            mel_basis = mel_basis.repeat(post_linear_v.shape[0], 1, 1)
            post_linear_mel_v = t.log10(t.bmm(mel_basis.cuda(), post_linear_v))
            B, C, T = post_linear_mel_v.shape
            z = t.randn(1, 1, T * hp.hop_length).cuda()
            z = z * 0.6  # Temp
            t.cuda.synchronize()
            timestemp = time.time()
            with t.no_grad():
                y_gen_linear = vocoder.reverse(z, post_linear_mel_v).squeeze()
            t.cuda.synchronize()
            wav_linear = y_gen_linear.to(t.device("cpu")).data.numpy()
            wav_linear = np.pad(wav_linear, [0, 4800], mode='constant',
                                constant_values=0)  # pad 0 for 0.21 sec silence at the end
            # --- Render 3: linear-derived mel -> HiFi-GAN ---
            wav_hifi = hifi_gan(post_linear_mel_v).squeeze().clamp(
                -1, 1).detach().cpu().numpy()
            wav_hifi = np.pad(wav_hifi, [0, 4800], mode='constant',
                              constant_values=0)  # pad 0 for 0.21 sec silence at the end
        # Persist mels, linears, and the three waveforms.
        mel_path = os.path.join(hp.sample_path + '_' + str(args.rhythm_scale),
                                'mel')
        if not os.path.exists(mel_path):
            os.makedirs(mel_path)
        np.save(
            os.path.join(
                mel_path,
                'text_{}_ref_{}_synth_{}.mel'.format(i, ref_fname,
                                                     str(args.rhythm_scale))),
            postnet_pred.cpu())
        linear_path = os.path.join(
            hp.sample_path + '_' + str(args.rhythm_scale), 'linear')
        if not os.path.exists(linear_path):
            os.makedirs(linear_path)
        np.save(
            os.path.join(
                linear_path,
                'text_{}_ref_{}_synth_{}.linear'.format(
                    i, ref_fname, str(args.rhythm_scale))), post_linear.cpu())
        wav_path = os.path.join(hp.sample_path + '_' + str(args.rhythm_scale),
                                'wav')
        if not os.path.exists(wav_path):
            os.makedirs(wav_path)
        write(
            os.path.join(
                wav_path,
                "text_{}_ref_{}_synth_{}.wav".format(i, ref_fname,
                                                     str(args.rhythm_scale))),
            hp.sr, wav)
        print("rtx : ", (len(wav) / hp.sr) / inf_time)
        wav_linear_path = os.path.join(
            hp.sample_path + '_' + str(args.rhythm_scale), 'wav_linear')
        if not os.path.exists(wav_linear_path):
            os.makedirs(wav_linear_path)
        write(
            os.path.join(
                wav_linear_path,
                "text_{}_ref_{}_synth_{}.wav".format(i, ref_fname,
                                                     str(args.rhythm_scale))),
            hp.sr, wav_linear)
        wav_hifi_path = os.path.join(
            hp.sample_path + '_' + str(args.rhythm_scale), 'wav_hifi')
        if not os.path.exists(wav_hifi_path):
            os.makedirs(wav_hifi_path)
        write(
            os.path.join(
                wav_hifi_path,
                "text_{}_ref_{}_synth_{}.wav".format(i, ref_fname,
                                                     str(args.rhythm_scale))),
            hp.sr, wav_hifi)
        # Stack per-layer attentions to (1, layers, n_heads, T_q, T_k) and plot.
        show_weights = weights.contiguous().view(weights.size(0), 1, 1,
                                                 weights.size(1),
                                                 weights.size(2))
        attns_enc_new = []
        attns_dec_new = []
        attn_probs_new = []
        attns_style_new = []
        for i in range(len(attns_enc)):
            attns_enc_new.append(attns_enc[i].unsqueeze(0))
            attns_dec_new.append(attns_dec[i].unsqueeze(0))
            attn_probs_new.append(attn_probs[i].unsqueeze(0))
            attns_style_new.append(attns_style[i].unsqueeze(0))
        attns_enc = t.cat(attns_enc_new, 0)
        attns_dec = t.cat(attns_dec_new, 0)
        attn_probs = t.cat(attn_probs_new, 0)
        attns_style = t.cat(attns_style_new, 0)
        attns_enc = attns_enc.contiguous().view(attns_enc.size(0), 1,
                                                hp.n_heads, attns_enc.size(2),
                                                attns_enc.size(3))
        attns_enc = attns_enc.permute(1, 0, 2, 3, 4)
        attns_dec = attns_dec.contiguous().view(attns_dec.size(0), 1,
                                                hp.n_heads, attns_dec.size(2),
                                                attns_dec.size(3))
        attns_dec = attns_dec.permute(1, 0, 2, 3, 4)
        attn_probs = attn_probs.contiguous().view(attn_probs.size(0), 1,
                                                  hp.n_heads,
                                                  attn_probs.size(2),
                                                  attn_probs.size(3))
        attn_probs = attn_probs.permute(1, 0, 2, 3, 4)
        attns_style = attns_style.contiguous().view(attns_style.size(0), 1,
                                                    hp.n_heads,
                                                    attns_style.size(2),
                                                    attns_style.size(3))
        attns_style = attns_style.permute(1, 0, 2, 3, 4)
        save_dir = os.path.join(
            hp.sample_path + '_' + str(args.rhythm_scale), 'figure',
            "text_{}_ref_{}_synth_{}.wav".format(fname, ref_fname,
                                                 str(args.rhythm_scale)))
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        writer.add_alignments(attns_enc.detach().cpu(),
                              attns_dec.detach().cpu(),
                              attn_probs.detach().cpu(),
                              attns_style.detach().cpu(),
                              show_weights.detach().cpu(),
                              [t.tensor(gen_length).type(t.LongTensor)],
                              text_length, args.restore_step1, 'Inference',
                              save_dir)
def main():
    """Train Transformer-TTS, logging losses to plain-text files in ./logger.

    TensorBoard logging is disabled (commented out in the original); losses
    are appended to logger/*.txt every step and a human-readable summary is
    printed every hp.log_step steps.
    """
    if not os.path.exists("logger"):
        os.mkdir("logger")
    dataset = get_dataset()
    global_step = 0
    m = nn.DataParallel(Model().cuda())
    num_param = sum(param.numel() for param in m.parameters())
    print('Number of Transformer-TTS Parameters:', num_param)
    m.train()
    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)
    # NOTE(review): pos_weight is computed but unused here — presumably left
    # over from a stop-token BCE loss; confirm before removing.
    pos_weight = t.FloatTensor([5.]).cuda()
    # writer = SummaryWriter()
    for epoch in range(hp.epochs):
        dataloader = DataLoader(dataset, batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_transformer,
                                drop_last=True, num_workers=16)
        for i, data in enumerate(dataloader):
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)
            character, mel, mel_input, pos_text, pos_mel, _ = data
            # stop_tokens is computed but not consumed in this loop.
            stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)
            character = character.cuda()
            mel = mel.cuda()
            mel_input = mel_input.cuda()
            pos_text = pos_text.cuda()
            pos_mel = pos_mel.cuda()
            mel_pred, postnet_pred, attn_probs, stop_preds, attns_enc, attns_dec = m.forward(
                character, mel_input, pos_text, pos_mel)
            mel_loss = nn.L1Loss()(mel_pred, mel)
            post_mel_loss = nn.L1Loss()(postnet_pred, mel)
            loss = mel_loss + post_mel_loss
            t_l = loss.item()
            m_l = mel_loss.item()
            m_p_l = post_mel_loss.item()
            # Append per-step scalars to flat text logs.
            with open(os.path.join("logger", "total_loss.txt"),
                      "a") as f_total_loss:
                f_total_loss.write(str(t_l) + "\n")
            with open(os.path.join("logger", "mel_loss.txt"),
                      "a") as f_mel_loss:
                f_mel_loss.write(str(m_l) + "\n")
            with open(os.path.join("logger", "mel_postnet_loss.txt"),
                      "a") as f_mel_postnet_loss:
                f_mel_postnet_loss.write(str(m_p_l) + "\n")
            # Print
            if global_step % hp.log_step == 0:
                str1 = "Epoch [{}/{}], Step [{}], Mel Loss: {:.4f}, Mel PostNet Loss: {:.4f};".format(
                    epoch + 1, hp.epochs, global_step, mel_loss.item(),
                    post_mel_loss.item())
                str2 = "Total Loss: {:.4f}.".format(loss.item())
                # All param groups share one LR; the last group's value wins.
                current_learning_rate = 0
                for param_group in optimizer.param_groups:
                    current_learning_rate = param_group['lr']
                str3 = "Current Learning Rate is {:.6f}.".format(
                    current_learning_rate)
                print("\n" + str1)
                print(str2)
                print(str3)
                with open(os.path.join("logger", "logger.txt"),
                          "a") as f_logger:
                    f_logger.write(str1 + "\n")
                    f_logger.write(str2 + "\n")
                    f_logger.write(str3 + "\n")
                    f_logger.write("\n")
            optimizer.zero_grad()
            # Calculate gradients
            loss.backward()
            nn.utils.clip_grad_norm_(m.parameters(), 1.)
            # Update weights
            optimizer.step()
            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_transformer_%d.pth.tar' % global_step))
def main():
    """Train the stop-token predictor on top of a frozen transformer.

    Loads a fixed transformer checkpoint (step 100000), freezes all of its
    weights, and trains ModelStopToken on the transformer's decoder outputs
    with a positively-weighted BCE loss against end-of-utterance targets.
    """
    dataset = get_dataset(hp.train_data_csv)
    global_step = 0
    m = nn.DataParallel(ModelStopToken().cuda())
    trans_model = Model()
    trans_model.load_state_dict(load_checkpoint(100000, "transformer"))
    # Freeze every transformer parameter — only the stop-token head trains.
    for name, param in trans_model.named_parameters():
        param.requires_grad = False
        print(name, " : weight frozen")
    trans_model = nn.DataParallel(trans_model.cuda())
    m.train()
    trans_model.train(False)
    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)
    writer = SummaryWriter()
    for epoch in range(hp.epochs):
        dataloader = DataLoader(dataset, batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_transformer,
                                drop_last=True, num_workers=8)
        for i, data in enumerate(dataloader):
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)
            character, mel, mel_input, pos_text, pos_mel, text_length, mel_length, fname = data
            character = character.cuda()
            mel = mel.cuda()
            mel_input = mel_input.cuda()
            pos_text = pos_text.cuda()
            pos_mel = pos_mel.cuda()
            mel_length = mel_length.cuda()
            # Targets: 1 on padding frames; add 1 at the true last frame so it
            # is marked as the stop position.
            stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1).cuda()
            for j, length in enumerate(mel_length):
                stop_tokens[j, length - 1] += 1
            mel_pred, postnet_pred, attn, decoder_output, _, attn_dec, attn_style = trans_model.forward(
                character, mel_input, pos_text, pos_mel, mel, pos_mel)
            stop_preds = m.forward(decoder_output)
            if global_step % 100 == 0:
                print("pos_mel", pos_mel[0])
                print("stop_pred", t.sigmoid(stop_preds.squeeze()[0]))
                print("stop_tokens", stop_tokens[0])
            # Restrict the loss to real (non-padding) frames.
            mask = get_mask_from_lengths(mel_length)
            stop_preds = stop_preds.squeeze().masked_select(mask)
            stop_tokens = stop_tokens.masked_select(mask)
            loss = nn.BCEWithLogitsLoss(
                pos_weight=t.tensor(hp.bce_pos_weight))(stop_preds,
                                                        stop_tokens)
            print("| Epoch: {}, {}/{}th loss : {:.4f}".format(
                epoch, i, len(dataloader), loss))
            writer.add_scalars('training_loss', {
                'loss': loss,
            }, global_step)
            optimizer.zero_grad()
            # Calculate gradients
            loss.backward()
            nn.utils.clip_grad_norm_(m.parameters(), 1.)
            # Update weights
            optimizer.step()
            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_stop_token_%d.pth.tar' % global_step))
        if epoch == hp.stop_epoch:
            break
def main():
    """Train the Transformer-TTS acoustic model (mel + postnet L1 losses).

    Logs losses, positional-encoding alphas, and attention images to
    TensorBoard, and prints the mean training loss after each epoch.
    """
    dataset = get_dataset()
    global_step = 0
    sum_loss = 0

    m = nn.DataParallel(Model().cuda())  # TODO: dataparallel
    # m = Model().cuda()

    m.train()
    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)
    pos_weight = t.FloatTensor([5.]).cuda()
    writer = SummaryWriter()
    l1 = nn.L1Loss()  # loop-invariant criterion, built once

    for epoch in range(hp.epochs):
        dataloader = DataLoader(dataset,
                                batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_transformer,
                                drop_last=True,
                                num_workers=16)
        pbar = tqdm(dataloader)
        sum_loss = 0
        for i, data in enumerate(pbar):
            pbar.set_description("Processing at epoch %d" % epoch)
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)

            character, mel, mel_input, pos_text, pos_mel, _ = data
            stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)
            character = character.cuda()
            mel = mel.cuda()
            mel_input = mel_input.cuda()
            pos_text = pos_text.cuda()
            pos_mel = pos_mel.cuda()

            mel_pred, postnet_pred, attn_probs, stop_preds, attns_enc, attns_dec = m.forward(
                character, mel_input, pos_text, pos_mel)

            mel_loss = l1(mel_pred, mel)
            post_mel_loss = l1(postnet_pred, mel)
            loss = mel_loss + post_mel_loss

            writer.add_scalars('training_loss', {
                'mel_loss': mel_loss.item(),
                'post_mel_loss': post_mel_loss.item(),
            }, global_step)
            writer.add_scalars('alphas', {
                'encoder_alpha': m.module.encoder.alpha.data,
                'decoder_alpha': m.module.decoder.alpha.data,
            }, global_step)

            if global_step % hp.image_step == 1:
                # BUG FIX: these loops previously reused ``i`` as index,
                # clobbering the batch counter used for the epoch average
                # below; ``layer`` avoids the shadowing.
                for layer, prob in enumerate(attn_probs):
                    num_h = prob.size(0)
                    for j in range(4):
                        x = vutils.make_grid(prob[j * 16] * 255)
                        writer.add_image('Attention_%d_0' % global_step, x,
                                         layer * 4 + j)
                for layer, prob in enumerate(attns_enc):
                    num_h = prob.size(0)
                    for j in range(4):
                        x = vutils.make_grid(prob[j * 16] * 255)
                        writer.add_image('Attention_enc_%d_0' % global_step, x,
                                         layer * 4 + j)
                for layer, prob in enumerate(attns_dec):
                    num_h = prob.size(0)
                    for j in range(4):
                        x = vutils.make_grid(prob[j * 16] * 255)
                        writer.add_image('Attention_dec_%d_0' % global_step, x,
                                         layer * 4 + j)

            optimizer.zero_grad()
            # Calculate gradients
            loss.backward()
            nn.utils.clip_grad_norm_(m.parameters(), 1.)
            # Update weights
            optimizer.step()

            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_transformer_%d.pth.tar' % global_step))
            sum_loss += loss.item()
        # Mean loss over the epoch (i is the last batch index).
        print(f'epoch:{epoch}, sum_loss: {sum_loss / (i + 1)}')
def main():
    """Resume-capable trainer for the magnitude-prediction PostNet.

    ``--step`` restores model/optimizer state from the matching checkpoint
    and continues the global step counter from there; 0 starts fresh.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--step',
                        type=int,
                        help='Global step to restore checkpoint',
                        default=0)
    args = parser.parse_args()

    dataset = get_post_dataset()
    global_step = args.step

    # Train on GPUs 1..7 with outputs gathered on cuda:1, keeping GPU 0 free.
    m = nn.DataParallel(ModelPostNet().cuda(1),
                        device_ids=[i + 1 for i in range(7)])

    if not os.path.exists(hp.checkpoint_path):
        os.makedirs(hp.checkpoint_path)

    if args.step > 0:
        ckpt_path = os.path.join(
            hp.checkpoint_path,
            'checkpoint_postnet_%d.pth.tar' % global_step)
        # Load to CPU first so restoring never allocates on the wrong GPU
        # (the checkpoint was saved from CUDA tensors).
        ckpt = torch.load(ckpt_path, map_location='cpu')
        m.load_state_dict(ckpt['model'])

    m.train()
    optimizer = torch.optim.Adam(m.parameters(), lr=hp.lr)
    if args.step > 0:
        optimizer.load_state_dict(ckpt['optimizer'])

    writer = SummaryWriter()
    l1 = nn.L1Loss()  # loop-invariant criterion, built once

    for epoch in range(hp.epochs):
        dataloader = DataLoader(dataset,
                                batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_postnet,
                                drop_last=True,
                                num_workers=8)
        pbar = tqdm(dataloader)
        for i, data in enumerate(pbar):
            pbar.set_description("Processing at epoch %d" % epoch)
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)

            mel, mag = data
            mel = mel.cuda(1)
            mag = mag.cuda(1)

            mag_pred = m.forward(mel)
            loss = l1(mag_pred, mag)

            writer.add_scalars('training_loss', {
                'loss': loss.item(),
            }, global_step)

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(m.parameters(), 1.)
            optimizer.step()

            if global_step % hp.save_step_post == 0:
                torch.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_postnet_%d.pth.tar' % global_step))
def main():
    """Resume-capable Transformer-TTS trainer.

    ``--step`` restores model/optimizer state from the matching checkpoint
    and continues the global step counter from there; 0 starts fresh.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--step',
                        type=int,
                        help='Global step to restore checkpoint',
                        default=0)
    args = parser.parse_args()

    dataset = get_dataset()
    global_step = args.step

    m = Model().cuda()
    m = nn.DataParallel(m, device_ids=[i for i in range(8)])

    if not os.path.exists(hp.checkpoint_path):
        os.makedirs(hp.checkpoint_path)

    if args.step > 0:
        ckpt_path = os.path.join(
            hp.checkpoint_path,
            'checkpoint_transformer_%d.pth.tar' % global_step)
        # Load to CPU first so restoring never allocates on the wrong GPU
        # (the checkpoint was saved from CUDA tensors).
        ckpt = torch.load(ckpt_path, map_location='cpu')
        m.load_state_dict(ckpt['model'])

    m.train()
    optimizer = torch.optim.Adam(m.parameters(), lr=hp.lr)
    if args.step > 0:
        optimizer.load_state_dict(ckpt['optimizer'])

    pos_weight = torch.FloatTensor([5.]).cuda()
    writer = SummaryWriter()
    l1 = nn.L1Loss()  # loop-invariant criterion, built once

    for epoch in range(hp.epochs):
        dataloader = DataLoader(dataset,
                                batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_transformer,
                                drop_last=True,
                                num_workers=16)
        pbar = tqdm(dataloader)
        for i, data in enumerate(pbar):
            pbar.set_description("Processing at epoch %d" % epoch)
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)

            character, mel, mel_input, pos_text, pos_mel, _ = data
            stop_tokens = torch.abs(pos_mel.ne(0).type(torch.float) - 1)
            character = character.cuda()
            mel = mel.cuda()
            mel_input = mel_input.cuda()
            pos_text = pos_text.cuda()
            pos_mel = pos_mel.cuda()

            mel_pred, postnet_pred, attn_probs, stop_preds, attns_enc, attns_dec = m.forward(
                character, mel_input, pos_text, pos_mel)

            mel_loss = l1(mel_pred, mel)
            post_mel_loss = l1(postnet_pred, mel)
            loss = mel_loss + post_mel_loss

            writer.add_scalars('training_loss', {
                'mel_loss': mel_loss.item(),
                'post_mel_loss': post_mel_loss.item(),
            }, global_step)
            writer.add_scalars(
                'alphas', {
                    'encoder_alpha': m.module.encoder.alpha.data,
                    'decoder_alpha': m.module.decoder.alpha.data,
                }, global_step)

            if global_step % hp.image_step == 1:
                # ``layer``/``head`` names so the batch index ``i`` is not
                # shadowed by these logging loops.
                for layer, prob in enumerate(attn_probs):
                    num_h = prob.size(0)
                    for head in range(4):
                        x = vutils.make_grid(prob[head * 16] * 255)
                        writer.add_image('Attention_%d_0' % global_step, x,
                                         layer * 4 + head)
                for layer, prob in enumerate(attns_enc):
                    num_h = prob.size(0)
                    for head in range(4):
                        x = vutils.make_grid(prob[head * 16] * 255)
                        writer.add_image('Attention_enc_%d_0' % global_step, x,
                                         layer * 4 + head)
                for layer, prob in enumerate(attns_dec):
                    num_h = prob.size(0)
                    for head in range(4):
                        x = vutils.make_grid(prob[head * 16] * 255)
                        writer.add_image('Attention_dec_%d_0' % global_step, x,
                                         layer * 4 + head)

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(m.parameters(), 1.)
            optimizer.step()

            if global_step % hp.save_step == 0:
                torch.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_transformer_%d.pth.tar' % global_step))
def main():
    """Train Transformer-TTS on EEG input sequences (mel + postnet L1 losses).

    Same training loop as the text variant, but the encoder input is an EEG
    signal array with its own positional encoding.
    """
    print('starting here...')
    dataset = get_dataset()
    global_step = 0

    m = nn.DataParallel(Model().cuda())
    m.train()

    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)
    pos_weight = t.FloatTensor([5.]).cuda()
    writer = SummaryWriter()
    l1 = nn.L1Loss()  # loop-invariant criterion, built once

    for epoch in range(hp.epochs):
        print('at epoch', epoch)
        dataloader = DataLoader(dataset,
                                batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_transformer,
                                drop_last=True,
                                num_workers=1)
        pbar = tqdm(dataloader)
        for i, data in enumerate(pbar):
            pbar.set_description("Processing at epoch %d" % epoch)
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)

            eeg_array, mel, mel_input, pos_eeg_signal, pos_mel, _ = data
            stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)
            eeg_array = eeg_array.cuda()
            mel = mel.cuda()
            mel_input = mel_input.cuda()
            pos_eeg_signal = pos_eeg_signal.cuda()
            pos_mel = pos_mel.cuda()

            # (removed leftover per-iteration "before m.forward()..." debug print)
            mel_pred, postnet_pred, attn_probs, stop_preds, attns_enc, attns_dec = m.forward(
                eeg_array, mel_input, pos_eeg_signal, pos_mel)

            mel_loss = l1(mel_pred, mel)
            post_mel_loss = l1(postnet_pred, mel)
            loss = mel_loss + post_mel_loss

            writer.add_scalars('training_loss', {
                'mel_loss': mel_loss.item(),
                'post_mel_loss': post_mel_loss.item(),
            }, global_step)
            writer.add_scalars(
                'alphas', {
                    'encoder_alpha': m.module.encoder.alpha.data,
                    'decoder_alpha': m.module.decoder.alpha.data,
                }, global_step)

            if global_step % hp.image_step == 1:
                # SummaryWriter add_image params: sample one attention map per
                # batch-sized stride.
                num_images_per_loop = 4
                writer_start_val = int(hp.batch_size / 2)
                writer_end_val = int(hp.batch_size * num_images_per_loop)
                writer_step_val = int(hp.batch_size)
                # ``layer`` avoids shadowing the batch index ``i``.
                for layer, prob in enumerate(attn_probs):
                    num_h = prob.size(0)
                    for j in range(writer_start_val, writer_end_val,
                                   writer_step_val):
                        x = vutils.make_grid([prob[j] * 255])
                        writer.add_image('Attention_%d_0' % global_step, x,
                                         layer * num_images_per_loop + j)
                for layer, prob in enumerate(attns_enc):
                    num_h = prob.size(0)
                    for j in range(writer_start_val, writer_end_val,
                                   writer_step_val):
                        x = vutils.make_grid([prob[j] * 255])
                        writer.add_image('Attention_enc_%d_0' % global_step, x,
                                         layer * num_images_per_loop + j)
                for layer, prob in enumerate(attns_dec):
                    num_h = prob.size(0)
                    for j in range(writer_start_val, writer_end_val,
                                   writer_step_val):
                        x = vutils.make_grid([prob[j] * 255])
                        writer.add_image('Attention_dec_%d_0' % global_step, x,
                                         layer * num_images_per_loop + j)

            optimizer.zero_grad()
            # Calculate gradients
            loss.backward()
            nn.utils.clip_grad_norm_(m.parameters(), 1.)
            # Update weights
            optimizer.step()

            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_transformer_%d.pth.tar' % global_step))
# --- data preparation: load (or build) the pickled dataset, then size workers ---

# Ensure the log directory exists before any handlers write into it.
if not os.path.isdir(log_dir):
    os.mkdir(log_dir)

if args.p:
    logger.info("Multiple process mode")
else:
    logger.info("Single process mode")

# Cache the dataset as a pickle next to this file so later runs skip the
# slower JSON load.
data_pkl = os.path.join(pwd(__file__), './data/data.pkl')
if os.path.isfile(data_pkl):
    logger.info("Exsiting pkl,loading...")
    datas = load_pickle(data_pkl)
else:
    logger.info("Loading from json")
    loader = DataLoader()
    datas = loader()
    logger.info("Shuffle data")
    # NOTE(review): shuffled twice in a row — presumably intentional extra
    # mixing, but a single shuffle is already uniform; confirm intent.
    random.shuffle(datas)
    random.shuffle(datas)
    logger.info("Serialize data")
    dump_pickle(datas, data_pkl)
logger.info("Loaded data {}".format(len(datas)))

# Partition the data across workers: each worker receives ``args.s`` items.
# data_per_worker = int(1e2)*6
data_per_worker = int(args.s)
num_worker = round(len(datas) / data_per_worker)
logger.info("Data per worker={}, num worker={}".format(
    data_per_worker, num_worker))
def main():
    """Train the Transformer-TTS model (L1 mel + postnet losses), logging to TensorBoard."""
    dataset = get_dataset()
    global_step = 0

    # inference: https://blog.csdn.net/weixin_40087578/article/details/87186613
    # DataParallel spreads each batch across GPUs (GPU 0 by default); for
    # multi-GPU runs, set device ids / environment variables beforehand.
    m = nn.DataParallel(
        Model().cuda())

    m.train()
    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)  # Adam
    pos_weight = t.FloatTensor([5.]).cuda()
    writer = SummaryWriter()

    for epoch in range(hp.epochs):
        dataloader = DataLoader(dataset,
                                batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_transformer,
                                drop_last=True,
                                num_workers=16)
        pbar = tqdm(dataloader)
        for i, data in enumerate(pbar):
            pbar.set_description("Processing at epoch %d" % epoch)
            global_step += 1
            if global_step < 400000:
                # LR schedule; arguably unnecessary with Adam.
                adjust_learning_rate(optimizer, global_step)

            # pos_text / pos_mel carry global (sequence-position) orderings.
            character, mel, mel_input, pos_text, pos_mel, _ = data  # unpack the batch
            stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)
            character = character.cuda()  # move batch tensors to the GPU
            mel = mel.cuda()
            mel_input = mel_input.cuda()
            pos_text = pos_text.cuda()
            pos_mel = pos_mel.cuda()

            mel_pred, postnet_pred, attn_probs, stop_preds, attns_enc, attns_dec = m.forward(
                character, mel_input, pos_text, pos_mel)
            # stop_token was meant to mark the end of the audio, but the code
            # author reports that adding its loss (as in the paper) prevents
            # convergence, so generation length is chosen empirically later.

            mel_loss = nn.L1Loss()(mel_pred, mel)  # L1 loss
            post_mel_loss = nn.L1Loss()(postnet_pred, mel)
            loss = mel_loss + post_mel_loss

            writer.add_scalars('training_loss', {
                'mel_loss': mel_loss,
                'post_mel_loss': post_mel_loss,
            }, global_step)
            writer.add_scalars(
                'alphas', {
                    'encoder_alpha': m.module.encoder.alpha.data,
                    'decoder_alpha': m.module.decoder.alpha.data,
                }, global_step)

            # Periodically dump attention maps (self/enc/dec) as images.
            if global_step % hp.image_step == 1:
                for i, prob in enumerate(attn_probs):
                    num_h = prob.size(0)
                    for j in range(4):
                        x = vutils.make_grid(prob[j * 16] * 255)
                        writer.add_image('Attention_%d_0' % global_step, x,
                                         i * 4 + j)
                for i, prob in enumerate(attns_enc):
                    num_h = prob.size(0)
                    for j in range(4):
                        x = vutils.make_grid(prob[j * 16] * 255)
                        writer.add_image('Attention_enc_%d_0' % global_step, x,
                                         i * 4 + j)
                for i, prob in enumerate(attns_dec):
                    num_h = prob.size(0)
                    for j in range(4):
                        x = vutils.make_grid(prob[j * 16] * 255)
                        writer.add_image('Attention_dec_%d_0' % global_step, x,
                                         i * 4 + j)

            optimizer.zero_grad()  # clear accumulated gradients before backprop
            # Calculate gradients
            loss.backward()  # backpropagation
            nn.utils.clip_grad_norm_(m.parameters(), 1.)  # gradient clipping
            # Update weights
            optimizer.step()

            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_transformer_%d.pth.tar' % global_step))
def main(args):
    """Resume-capable Transformer-TTS trainer.

    Tries to restore model/optimizer state from the checkpoint matching
    ``args.restore_step``; on failure it starts fresh. ``args.start_epoch``
    selects the first epoch to run.
    """
    dataset = get_dataset()
    global_step = args.restore_step
    m = nn.DataParallel(Model().cuda())

    m.train()
    optimizer = t.optim.Adam(m.parameters(), lr=hp.lr)

    ckpt_file = os.path.join(
        hp.checkpoint_path,
        'checkpoint_transformer_%d.pth.tar' % args.restore_step)
    try:
        print(ckpt_file)
        # Load to CPU first so restoring works regardless of saving device.
        checkpoint = torch.load(ckpt_file, map_location='cpu')
        m.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n---Model Restored at Step %d---\n" % args.restore_step)
    except (OSError, RuntimeError, KeyError):
        # Narrowed from a bare ``except`` (which also swallowed
        # KeyboardInterrupt/SystemExit): a missing or corrupt checkpoint
        # means we simply start a new training run.
        print("\n---Start New Training---\n")
        if not os.path.exists(hp.checkpoint_path):
            os.mkdir(hp.checkpoint_path)

    pos_weight = t.FloatTensor([5.]).cuda()
    writer = SummaryWriter()
    l1 = nn.L1Loss()  # loop-invariant criterion, built once

    for epoch in range(args.start_epoch, hp.epochs):
        dataloader = DataLoader(dataset,
                                batch_size=hp.batch_size,
                                shuffle=True,
                                collate_fn=collate_fn_transformer,
                                drop_last=True,
                                num_workers=0)
        pbar = tqdm(dataloader)
        for i, data in enumerate(pbar):
            pbar.set_description("Processing at epoch %d" % epoch)
            global_step += 1
            if global_step < 400000:
                adjust_learning_rate(optimizer, global_step)

            character, mel, mel_input, pos_text, pos_mel, _ = data
            stop_tokens = t.abs(pos_mel.ne(0).type(t.float) - 1)
            character = character.cuda()
            mel = mel.cuda()
            mel_input = mel_input.cuda()
            pos_text = pos_text.cuda()
            pos_mel = pos_mel.cuda()

            mel_pred, postnet_pred, attn_probs, stop_preds, attns_enc, attns_dec = m.forward(
                character, mel_input, pos_text, pos_mel)

            mel_loss = l1(mel_pred, mel)
            post_mel_loss = l1(postnet_pred, mel)
            loss = mel_loss + post_mel_loss

            writer.add_scalars('training_loss', {
                'mel_loss': mel_loss.item(),
                'post_mel_loss': post_mel_loss.item(),
            }, global_step)
            writer.add_scalars(
                'alphas', {
                    'encoder_alpha': m.module.encoder.alpha.data,
                    'decoder_alpha': m.module.decoder.alpha.data,
                }, global_step)

            if global_step % hp.image_step == 1:
                # ``layer``/``head`` names so the batch index ``i`` is not
                # shadowed by these logging loops.
                for layer, prob in enumerate(attn_probs):
                    num_h = prob.size(0)
                    for head in range(4):
                        x = vutils.make_grid(prob[head * 16] * 255)
                        writer.add_image('Attention_%d_0' % global_step, x,
                                         layer * 4 + head)
                for layer, prob in enumerate(attns_enc):
                    num_h = prob.size(0)
                    for head in range(4):
                        x = vutils.make_grid(prob[head * 16] * 255)
                        writer.add_image('Attention_enc_%d_0' % global_step, x,
                                         layer * 4 + head)
                for layer, prob in enumerate(attns_dec):
                    num_h = prob.size(0)
                    for head in range(4):
                        x = vutils.make_grid(prob[head * 16] * 255)
                        writer.add_image('Attention_dec_%d_0' % global_step, x,
                                         layer * 4 + head)

            optimizer.zero_grad()
            # Calculate gradients
            loss.backward()
            nn.utils.clip_grad_norm_(m.parameters(), 1.)
            # Update weights
            optimizer.step()

            if global_step % hp.save_step == 0:
                t.save(
                    {
                        'model': m.state_dict(),
                        'optimizer': optimizer.state_dict()
                    },
                    os.path.join(
                        hp.checkpoint_path,
                        'checkpoint_transformer_%d.pth.tar' % global_step))