def _make_text_list(self, txt_origin, txt_prepro):
    # Use the preprocessed text when it is a valid string,
    # otherwise fall back to the original transcript.
    text_list = []
    for i in range(len(txt_prepro)):
        if isinstance(txt_prepro[i], str):
            t = txt_prepro[i]
        else:
            t = txt_origin[i]
        text_list.append(text_normalize(t))
    return text_list
def prepare_minibatch(self, audio_paths, texts, durations, arpabets):
    """ Featurize a minibatch of audio, zero pad them and return a dictionary
    Params:
        audio_paths (list(str)): List of paths to audio files
        texts (list(str)): List of texts corresponding to the audio files
        durations (list(float)): Duration of each audio file, in seconds
        arpabets (list(str)): Arpabet transcriptions of the texts
    Returns:
        dict: See the keys of `res` below for contents
    """
    assert len(audio_paths) == len(texts), \
        "Inputs and outputs to the network must be of the same number"
    # Features is a list of (timesteps, feature_dim) arrays
    # Calculate the features for each audio clip, as the log of the
    # Fourier Transform of the audio
    features = [self.featurize(a) for a in audio_paths]
    input_lengths = [f.shape[0] for f in features]
    max_length = max(input_lengths)
    feature_dim = features[0].shape[1]
    mb_size = len(features)
    # Pad all the inputs so that they are all the same length
    x = np.zeros((mb_size, max_length, feature_dim))
    y = []
    label_lengths = []
    for i in range(mb_size):
        feat = features[i]
        feat = self.normalize(feat)  # Center using means and std
        x[i, :feat.shape[0], :] = feat
        text = text_normalize(texts[i])
        label = text_to_int_sequence(text)
        y.append(label)
        label_lengths.append(len(label))
    y = pad_sequences(y, maxlen=len(max(texts, key=len)), dtype='int32',
                      padding='post', truncating='post', value=-1)
    res = {
        'x': x,  # (ndarray) 0-padded features, shape (mb_size, max_length, feature_dim)
        'y': y,  # (ndarray) Label integer sequences, padded with -1
        'texts': texts,  # list(str) Original texts
        'input_lengths': input_lengths,  # list(int) Length of each input
        'label_lengths': label_lengths  # list(int) Length of each label
        # 'durations' [if use_durations] list(float) Duration of each sample
        # 'phonemes'  [if use_arpabets] (ndarray) Padded arpabet int sequences
    }
    if self.use_durations:
        res['durations'] = durations
    if self.use_arpabets:
        arpints, arpaint_lengths = [], []
        for i in range(mb_size):
            arpaint_seq = arpabet_to_int_sequence(arpabets[i])
            arpints.append(arpaint_seq)
            arpaint_lengths.append(len(arpaint_seq))
        maxlen = len(max(arpints, key=len))
        res['phonemes'] = pad_sequences(arpints, maxlen=maxlen, dtype='int32',
                                        padding='post', truncating='post',
                                        value=-1)
        res['phoneme_lengths'] = arpaint_lengths
    return res
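# A minimal usage sketch, not part of the original code: it assumes a
# DataGenerator-style class (hypothetical name) exposing the methods in this
# section, and a JSON-line description file at 'train_corpus.json'
# (also hypothetical), to illustrate the shapes prepare_minibatch returns.
gen = DataGenerator(use_durations=False, use_arpabets=False)
gen.load_metadata_from_desc_file('train_corpus.json', partition='train')
batch = gen.prepare_minibatch(gen.train_audio_paths[:16],
                              gen.train_texts[:16],
                              gen.train_durations[:16],
                              gen.train_arpabets[:16])
print(batch['x'].shape)   # (16, max_timesteps, feature_dim), zero-padded
print(batch['y'].shape)   # (16, max_label_length), padded with -1
print(batch['input_lengths'][:3], batch['label_lengths'][:3])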
def evaluation(model, step, device, args):
    # Evaluation
    model.eval()
    with torch.no_grad():
        # Preprocess the evaluation texts
        print('Start generating evaluation speeches...')
        n_eval = len(hps.eval_texts)
        for i in range(n_eval):
            sys.stdout.write('\rProgress: {}/{}'.format(i + 1, n_eval))
            sys.stdout.flush()
            text = hps.eval_texts[i]
            text = text_normalize(text)
            txt_id = sent2idx(text) + [hps.vocab.find('E')]
            txt_len = len(txt_id)
            GO_frame = torch.zeros(1, 1, hps.n_mels)
            # Shape: (1, seq_length)
            txt = torch.LongTensor([txt_id])
            txt_len = torch.LongTensor([txt_len])
            if args.cuda:
                GO_frame = GO_frame.cuda()
                txt = txt.cuda()
                txt_len = txt_len.cuda()
            _batch = model(text=txt, frames=GO_frame, text_length=txt_len)
            mel = _batch['mel'][0]
            mag = _batch['mag'][0]
            attn = _batch['attn'][0]
            if args.cuda:
                mel = mel.cpu()
                mag = mag.cpu()
                attn = attn.cpu()
            mel = mel.numpy()
            mag = mag.numpy()
            attn = attn.numpy()
            wav = mag2wav(mag)
            save_alignment(attn, step, 'eval/plots/attn_{}.png'.format(text))
            save_spectrogram(mag, 'eval/plots/spectrogram_[{}].png'.format(text))
            save_wav(wav, 'eval/results/wav_{}.wav'.format(text))
        sys.stdout.write('\n')
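# A small sketch (with a made-up character set) of the text preprocessing done
# above: normalize, map characters to indices, then append the index of the
# end-of-sequence symbol 'E'. The vocab string and sent2idx_demo are stand-ins
# for hps.vocab and sent2idx, not the repo's actual values.
vocab = "PE abcdefghijklmnopqrstuvwxyz'.?"   # hypothetical character set

def sent2idx_demo(sentence):
    return [vocab.find(c) for c in sentence]

demo_text = 'hello world'
demo_txt_id = sent2idx_demo(demo_text) + [vocab.find('E')]
print(demo_txt_id)   # index sequence fed to the model, ending with the 'E' token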
def run(args):
    # Check cuda device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Data
    if hps.bucket:
        dataset = LJSpeech_Dataset(meta_file=hps.meta_path,
                                   wav_dir=hps.wav_dir,
                                   batch_size=hps.batch_size,
                                   do_bucket=True,
                                   bucket_size=20)
        loader = DataLoader(
            dataset, batch_size=1, shuffle=True, num_workers=4)
    else:
        dataset = LJSpeech_Dataset(meta_file=hps.meta_path,
                                   wav_dir=hps.wav_dir)
        loader = DataLoader(
            dataset, batch_size=hps.batch_size, shuffle=True,
            num_workers=4, drop_last=True, collate_fn=collate_fn)
    # Network
    model = Tacotron()
    criterion = nn.L1Loss()
    if args.cuda:
        model = nn.DataParallel(model.to(device))
        criterion = criterion.to(device)
    # The learning rate scheduling mechanism in "Attention is all you need"
    lr_lambda = lambda step: hps.warmup_step ** 0.5 * min(
        (step + 1) * (hps.warmup_step ** -1.5), (step + 1) ** -0.5)
    optimizer = optim.Adam(model.parameters(), lr=hps.lr)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    step = 1
    epoch = 1
    # Load model
    if args.ckpt:
        ckpt = load(args.ckpt)
        step = ckpt['step']
        epoch = ckpt['epoch']
        model.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])
        scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda, last_epoch=step)

    if args.eval:
        # Evaluation
        model.eval()
        with torch.no_grad():
            # Preprocess the evaluation texts
            print('Start generating evaluation speeches...')
            n_eval = len(hps.eval_texts)
            for i in range(n_eval):
                sys.stdout.write('\rProgress: {}/{}'.format(i + 1, n_eval))
                sys.stdout.flush()
                text = hps.eval_texts[i]
                text = text_normalize(text)
                txt_id = sent2idx(text) + [hps.char_set.find('E')]
                GO_frame = torch.zeros(1, 1, hps.n_mels)
                # Shape: (1, seq_length)
                txt = torch.LongTensor(txt_id).unsqueeze(0)
                if args.cuda:
                    GO_frame = GO_frame.cuda()
                    txt = txt.cuda()
                _batch = model(text=txt, frames=GO_frame)
                mel = _batch['mel'][0]
                mag = _batch['mag'][0]
                attn = _batch['attn'][0]
                if args.cuda:
                    mel = mel.cpu()
                    mag = mag.cpu()
                    attn = attn.cpu()
                mel = mel.numpy()
                mag = mag.numpy()
                attn = attn.numpy()
                wav = mag2wav(mag)
                save_alignment(attn, step, 'eval/plots/attn_{}.png'.format(text))
                save_spectrogram(mag, 'eval/plots/spectrogram_[{}].png'.format(text))
                save_wav(wav, 'eval/results/wav_{}.wav'.format(text))
            sys.stdout.write('\n')

    if args.train:
        before_load = time.time()
        # Start training
        model.train()
        while True:
            for batch in loader:
                # torch.LongTensor, (batch_size, seq_length)
                txt = batch['text']
                # torch.Tensor, (batch_size, max_time, hps.n_mels)
                mel = batch['mel']
                # torch.Tensor, (batch_size, max_time, hps.n_fft)
                mag = batch['mag']
                if hps.bucket:
                    # If bucketing, the shape will be (1, batch_size, ...)
                    txt = txt.squeeze(0)
                    mel = mel.squeeze(0)
                    mag = mag.squeeze(0)
                # GO frame
                GO_frame = torch.zeros(mel[:, :1, :].size())
                if args.cuda:
                    txt = txt.to(device)
                    mel = mel.to(device)
                    mag = mag.to(device)
                    GO_frame = GO_frame.to(device)
                # Model prediction
                decoder_input = torch.cat(
                    [GO_frame,
                     mel[:, hps.reduction_factor::hps.reduction_factor, :]],
                    dim=1)
                load_time = time.time() - before_load
                before_step = time.time()

                _batch = model(text=txt, frames=decoder_input)
                _mel = _batch['mel']
                _mag = _batch['mag']
                _attn = _batch['attn']

                # Optimization
                optimizer.zero_grad()
                loss_mel = criterion(_mel, mel)
                loss_mag = criterion(_mag, mag)
                loss = loss_mel + loss_mag
                loss.backward()
                # Gradient clipping
                total_norm = clip_grad_norm_(model.parameters(),
                                             max_norm=hps.clip_norm)
                # Apply gradient
                optimizer.step()
                # Adjust learning rate
                scheduler.step()
                process_time = time.time() - before_step

                if step % hps.log_every_step == 0:
                    lr_curr = optimizer.param_groups[0]['lr']
                    log = '[{}-{}] loss: {:.3f}, grad: {:.3f}, lr: {:.3e}, time: {:.2f} + {:.2f} sec'.format(
                        epoch, step, loss.item(), total_norm, lr_curr,
                        load_time, process_time)
                    print(log)
                if step % hps.save_model_every_step == 0:
                    save(filepath='tmp/ckpt/ckpt_{}.pth.tar'.format(step),
                         model=model.state_dict(),
                         optimizer=optimizer.state_dict(),
                         step=step,
                         epoch=epoch)
                if step % hps.save_result_every_step == 0:
                    sample_idx = random.randint(0, hps.batch_size - 1)
                    attn_sample = _attn[sample_idx].detach().cpu().numpy()
                    mag_sample = _mag[sample_idx].detach().cpu().numpy()
                    wav_sample = mag2wav(mag_sample)
                    # Save results
                    save_alignment(attn_sample, step,
                                   'tmp/plots/attn_{}.png'.format(step))
                    save_spectrogram(mag_sample,
                                     'tmp/plots/spectrogram_{}.png'.format(step))
                    save_wav(wav_sample, 'tmp/results/wav_{}.wav'.format(step))
                before_load = time.time()
                step += 1
            epoch += 1
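# A small standalone sketch (not from the original repo) of the
# "Attention is All You Need" warmup schedule implemented by lr_lambda above;
# base_lr and warmup_step here are illustrative assumptions, not hps values.
base_lr, warmup_step = 1e-3, 4000

def noam_scale(step):
    # Linear warmup for the first warmup_step updates, then 1/sqrt(step) decay.
    return warmup_step ** 0.5 * min((step + 1) * warmup_step ** -1.5,
                                    (step + 1) ** -0.5)

for s in (0, 1000, 4000, 16000, 64000):
    # LambdaLR multiplies the base learning rate by this factor at each step.
    print(s, base_lr * noam_scale(s))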
def load_metadata_from_desc_file(self, desc_file, partition='train',
                                 max_duration=10.0):
    """ Read metadata from the description file
        (possibly takes long, depending on the filesize)
    Params:
        desc_file (str): Path to a JSON-line file that contains labels and
            paths to the audio files
        partition (str): One of 'train', 'validation' or 'test'
        max_duration (float): In seconds, the maximum duration of
            utterances to train or test on
    """
    logger.info('Reading description file: {} for partition: {}'
                .format(desc_file, partition))
    audio_paths, durations, texts, arpabets = [], [], [], []
    with open(desc_file, encoding='utf-8') as json_line_file:
        for line_num, json_line in enumerate(json_line_file):
            try:
                spec = json.loads(json_line)
                if float(spec['duration']) > max_duration:
                    continue
                textlen = len(text_to_int_sequence(
                    text_normalize(spec['text'])))
                speclen = len(spectrogram_from_file(spec['key']))
                if textlen > speclen:
                    print('Label longer than features; ignoring sentence')
                    continue
                if textlen < 2:
                    print('Label too short; ignoring sentence')
                    continue
                audio_paths.append(spec['key'])
                durations.append(float(spec['duration']))
                texts.append(spec['text'])
                if self.use_arpabets:
                    arpabets.append(spec['arpabet'])
            except Exception as e:
                # Change to (KeyError, ValueError) or
                # (KeyError, json.decoder.JSONDecodeError), depending on
                # json module version
                logger.warn('Error reading line #{}: {}'
                            .format(line_num, json_line))
                logger.warn(str(e))
    if not self.use_arpabets:
        arpabets = [''] * len(audio_paths)

    if partition == 'train':
        self.train_audio_paths = audio_paths
        self.train_durations = durations
        self.train_texts = texts
        self.train_arpabets = arpabets
    elif partition == 'validation':
        self.val_audio_paths = audio_paths
        self.val_durations = durations
        self.val_texts = texts
        self.val_arpabets = arpabets
    elif partition == 'test':
        self.test_audio_paths = audio_paths
        self.test_durations = durations
        self.test_texts = texts
        self.test_arpabets = arpabets
    else:
        raise Exception("Invalid partition to load metadata. "
                        "Must be train/validation/test")
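# A minimal sketch of the JSON-line description file this method expects: one
# JSON object per line with 'key' (audio path), 'duration' (seconds), 'text',
# and, when use_arpabets is set, 'arpabet'. The file name, audio path, and
# sample values below are hypothetical, written only to show the format.
import json

samples = [
    {'key': 'wavs/LJ001-0001.wav',
     'duration': 3.47,
     'text': 'Printing, in the only sense with which we are at present concerned.',
     'arpabet': 'P R IH1 N T IH0 NG'},
]
with open('train_corpus.json', 'w', encoding='utf-8') as f:
    for s in samples:
        f.write(json.dumps(s) + '\n')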