Example #1
 def _make_text_list(self, txt_origin, txt_prepro):
     text_list = []
     for i in range(len(txt_prepro)):
         # Prefer the preprocessed text; fall back to the original
         # transcript when preprocessing produced a non-string entry.
         if isinstance(txt_prepro[i], str):
             t = txt_prepro[i]
         else:
             t = txt_origin[i]
         text_list.append(text_normalize(t))
     return text_list
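
A minimal usage sketch for `_make_text_list`: it prefers the preprocessed text and falls back to the original transcript when preprocessing produced a non-string (e.g. None or NaN). The `text_normalize` stand-in below is hypothetical; the real normalizer is defined elsewhere in the project.

def text_normalize(t):
    # Hypothetical stand-in for the project's normalizer.
    return t.strip().lower()

txt_origin = ['Hello, World!', 'Second utterance.']
txt_prepro = ['hello world', None]  # preprocessing failed for the 2nd entry

text_list = [text_normalize(p if isinstance(p, str) else o)
             for o, p in zip(txt_origin, txt_prepro)]
print(text_list)  # ['hello world', 'second utterance.']
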
 def prepare_minibatch(self, audio_paths, texts, durations, arpabets):
     """ Featurize a minibatch of audio, zero pad them and return a dictionary
     Params:
         audio_paths (list(str)): List of paths to audio files
         texts (list(str)): List of texts corresponding to the audio files
     Returns:
         dict: See below for contents
     """
     assert len(audio_paths) == len(texts),\
         "Inputs and outputs to the network must be of the same number"
     # Features is a list of (timesteps, feature_dim) arrays
     # Calculate the features for each audio clip, as the log of the
     # Fourier Transform of the audio
     features = [self.featurize(a) for a in audio_paths]
     input_lengths = [f.shape[0] for f in features]
     max_length = max(input_lengths)
     feature_dim = features[0].shape[1]
     mb_size = len(features)
     # Pad all the inputs so that they are all the same length
     x = np.zeros((mb_size, max_length, feature_dim))
     y = []
     label_lengths = []
     for i in range(mb_size):
         feat = features[i]
         feat = self.normalize(feat)  # Center using means and std
         x[i, :feat.shape[0], :] = feat
         text = text_normalize(texts[i])
         label = text_to_int_sequence(text)
         y.append(label)
         label_lengths.append(len(label))
     # Pad labels to the longest label in the batch; padding to the longest
     # raw text could truncate labels if normalization lengthens the text
     y = pad_sequences(y, maxlen=max(label_lengths), dtype='int32',
                       padding='post', truncating='post', value=-1)
     res = {
         'x': x,  # np.ndarray: 0-padded features, shape (mb_size, max_length, feature_dim)
         'y': y,  # np.ndarray: padded label sequences (value=-1), shape (mb_size, max_label_len)
         'texts': texts,  # list(str): Original texts
         'input_lengths': input_lengths,  # list(int): Length of each input
         'label_lengths': label_lengths  # list(int): Length of each label
         # 'durations' [if use_durations]: list(float), duration of each sample
         # 'phonemes' [if use_arpabets]: padded arpabet int sequences
     }
     if self.use_durations:
         res['durations'] = durations
     if self.use_arpabets:
         arpints, arpaint_lengths = [], []
         for i in range(mb_size):
             arpaint_seq = arpabet_to_int_sequence(arpabets[i])
             arpints.append(arpaint_seq)
             arpaint_lengths.append(len(arpaint_seq))
         maxlen = len(max(arpints, key=len))
         res['phonemes'] = pad_sequences(arpints, maxlen=maxlen,
                                         dtype='int32', padding='post',
                                         truncating='post', value=-1)
         res['phoneme_lengths'] = arpaint_lengths
     return res
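
A rough sketch of the zero-padding step in isolation, with illustrative shapes only (two clips of 3 and 5 timesteps, 2-dim features):

import numpy as np

features = [np.ones((3, 2)), np.ones((5, 2))]   # (timesteps, feature_dim)
input_lengths = [f.shape[0] for f in features]  # [3, 5]
x = np.zeros((len(features), max(input_lengths), 2))
for i, feat in enumerate(features):
    x[i, :feat.shape[0], :] = feat              # zero-pad the tail
print(x.shape)  # (2, 5, 2) -> (mb_size, max_length, feature_dim)
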
Example #3
def evaluation(model, step, device, args):
    # Evaluation
    model.eval()
    with torch.no_grad():
        # Preprocessing eval texts
        print('Start generating evaluation speeches...')
        n_eval = len(hps.eval_texts)
        for i in range(n_eval):
            sys.stdout.write('\rProgress: {}/{}'.format(i + 1, n_eval))
            sys.stdout.flush()
            text = hps.eval_texts[i]
            text = text_normalize(text)

            txt_id = sent2idx(text) + [hps.vocab.find('E')]
            txt_len = len(txt_id)
            GO_frame = torch.zeros(1, 1, hps.n_mels)

            # Shape: (1, seq_length)
            txt = torch.LongTensor([txt_id])
            txt_len = torch.LongTensor([txt_len])
            if args.cuda:
                GO_frame = GO_frame.cuda()
                txt = txt.cuda()
                txt_len = txt_len.cuda()  # .cuda() is not in-place; rebind the result
            _batch = model(text=txt, frames=GO_frame, text_length=txt_len)
            mel = _batch['mel'][0]
            mag = _batch['mag'][0]
            attn = _batch['attn'][0]
            if args.cuda:
                mel = mel.cpu()
                mag = mag.cpu()
                attn = attn.cpu()
            mel = mel.numpy()
            mag = mag.numpy()
            attn = attn.numpy()

            wav = mag2wav(mag)
            save_alignment(attn, step, 'eval/plots/attn_{}.png'.format(text))
            save_spectrogram(mag,
                             'eval/plots/spectrogram_{}.png'.format(text))
            save_wav(wav, 'eval/results/wav_{}.wav'.format(text))
        sys.stdout.write('\n')
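
The text front end used above, sketched standalone. `sent2idx` and `hps.vocab` come from the project; the vocabulary string below is a made-up example, with 'E' acting as the end-of-sentence symbol appended to every id sequence:

import torch

vocab = 'PE abcdefghijklmnopqrstuvwxyz'  # hypothetical; the real one is hps.vocab

def sent2idx(text):
    # Map each character to its index in the vocabulary string.
    return [vocab.find(ch) for ch in text]

txt_id = sent2idx('hello world') + [vocab.find('E')]
txt = torch.LongTensor([txt_id])           # shape: (1, seq_length)
txt_len = torch.LongTensor([len(txt_id)])  # shape: (1,)
print(txt.shape, txt_len.item())
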
Example #4
def run(args):
    # Check cuda device
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Data
    if hps.bucket:
        dataset = LJSpeech_Dataset(meta_file=hps.meta_path,
                                   wav_dir=hps.wav_dir,
                                   batch_size=hps.batch_size,
                                   do_bucket=True,
                                   bucket_size=20)
        loader = DataLoader(
            dataset,
            batch_size=1,
            shuffle=True,
            num_workers=4)
    else:
        dataset = LJSpeech_Dataset(meta_file=hps.meta_path, wav_dir=hps.wav_dir)
        loader = DataLoader(
            dataset,
            batch_size=hps.batch_size,
            shuffle=True,
            num_workers=4,
            drop_last=True,
            collate_fn=collate_fn)

    # Network
    model = Tacotron()
    criterion = nn.L1Loss()
    if args.cuda:
        model = nn.DataParallel(model.to(device))
        criterion = criterion.to(device)
    # The learning-rate schedule from "Attention Is All You Need":
    # linear warmup for warmup_step steps, then inverse-square-root decay
    lr_lambda = lambda step: hps.warmup_step ** 0.5 * min(
        (step + 1) * hps.warmup_step ** -1.5, (step + 1) ** -0.5)
    optimizer = optim.Adam(model.parameters(), lr=hps.lr)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
        
    step = 1
    epoch = 1
    # Load model
    if args.ckpt:
        ckpt = load(args.ckpt)
        step = ckpt['step']
        epoch = ckpt['epoch']
        model.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optimizer'])
        scheduler = optim.lr_scheduler.LambdaLR(
            optimizer, 
            lr_lambda, 
            last_epoch=step)

    if args.eval:
        # Evaluation
        model.eval()
        with torch.no_grad():
            # Preprocessing eval texts
            print('Start generating evaluation speeches...')
            n_eval = len(hps.eval_texts)
            for i in range(n_eval):
                sys.stdout.write('\rProgress: {}/{}'.format(i+1, n_eval))
                sys.stdout.flush()
                text = hps.eval_texts[i]
                text = text_normalize(text)
                txt_id = sent2idx(text) + [hps.char_set.find('E')]
                GO_frame = torch.zeros(1, 1, hps.n_mels)

                # Shape: (1, seq_length)
                txt = torch.LongTensor(txt_id).unsqueeze(0)
                if args.cuda:
                    GO_frame = GO_frame.cuda()
                    txt = txt.cuda()
                _batch = model(text=txt, frames=GO_frame)
                mel = _batch['mel'][0]
                mag = _batch['mag'][0]
                attn = _batch['attn'][0]
                if args.cuda:
                    mel = mel.cpu()
                    mag = mag.cpu()
                    attn = attn.cpu()
                mel = mel.numpy()
                mag = mag.numpy()
                attn = attn.numpy()

                wav = mag2wav(mag)
                save_alignment(attn, step, 'eval/plots/attn_{}.png'.format(text))
                save_spectrogram(mag, 'eval/plots/spectrogram_{}.png'.format(text))
                save_wav(wav, 'eval/results/wav_{}.wav'.format(text))
            sys.stdout.write('\n')

    if args.train:
        before_load = time.time()
        # Start training
        model.train()
        while True:
            for batch in loader:
                # torch.LongTensor, (batch_size, seq_length)
                txt = batch['text']
                # torch.Tensor, (batch_size, max_time, hps.n_mels)
                mel = batch['mel']
                # torch.Tensor, (batch_size, max_time, hps.n_fft)
                mag = batch['mag']
                if hps.bucket:
                    # If bucketing, the shape will be (1, batch_size, ...)
                    txt = txt.squeeze(0)
                    mel = mel.squeeze(0)
                    mag = mag.squeeze(0)
                # GO frame
                GO_frame = torch.zeros(mel[:, :1, :].size())
                if args.cuda:
                    txt = txt.to(device)
                    mel = mel.to(device)
                    mag = mag.to(device)
                    GO_frame = GO_frame.to(device)

                # Model prediction
                decoder_input = torch.cat(
                    [GO_frame,
                     mel[:, hps.reduction_factor::hps.reduction_factor, :]],
                    dim=1)

                load_time = time.time() - before_load
                before_step = time.time()

                _batch = model(text=txt, frames=decoder_input)
                _mel = _batch['mel']
                _mag = _batch['mag']
                _attn = _batch['attn']

                # Optimization
                optimizer.zero_grad()
                loss_mel = criterion(_mel, mel)
                loss_mag = criterion(_mag, mag)
                loss = loss_mel + loss_mag
                loss.backward()
                # Gradient clipping
                total_norm = clip_grad_norm_(model.parameters(), max_norm=hps.clip_norm)
                # Apply gradient
                optimizer.step()
                # Adjust learning rate
                scheduler.step()
                process_time = time.time() - before_step 
                if step % hps.log_every_step == 0:
                    lr_curr = optimizer.param_groups[0]['lr']
                    log = ('[{}-{}] loss: {:.3f}, grad: {:.3f}, lr: {:.3e}, '
                           'time: {:.2f} + {:.2f} sec').format(
                               epoch, step, loss.item(), total_norm,
                               lr_curr, load_time, process_time)
                    print(log)
                if step % hps.save_model_every_step == 0:
                    save(filepath='tmp/ckpt/ckpt_{}.pth.tar'.format(step),
                         model=model.state_dict(),
                         optimizer=optimizer.state_dict(),
                         step=step, 
                         epoch=epoch)

                if step % hps.save_result_every_step == 0:
                    sample_idx = random.randint(0, hps.batch_size-1)
                    attn_sample = _attn[sample_idx].detach().cpu().numpy()
                    mag_sample = _mag[sample_idx].detach().cpu().numpy()
                    wav_sample = mag2wav(mag_sample)
                    # Save results
                    save_alignment(attn_sample, step, 'tmp/plots/attn_{}.png'.format(step))
                    save_spectrogram(mag_sample, 'tmp/plots/spectrogram_{}.png'.format(step))
                    save_wav(wav_sample, 'tmp/results/wav_{}.wav'.format(step))
                before_load = time.time()
                step += 1
            epoch += 1
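
For reference, the `lr_lambda` defined near the top of `run` is the Noam schedule from "Attention Is All You Need": the multiplier rises linearly for `warmup_step` steps, peaks at 1.0, then decays as the inverse square root of the step; `scheduler` multiplies it onto the base rate `hps.lr`. A standalone sketch (the warmup value is illustrative):

def noam_scale(step, warmup=4000):
    # Linear warmup, then inverse-square-root decay; the two branches
    # intersect at step + 1 == warmup, where the multiplier equals 1.0.
    return warmup ** 0.5 * min((step + 1) * warmup ** -1.5,
                               (step + 1) ** -0.5)

for s in (0, 1999, 3999, 15999):
    print(s, round(noam_scale(s), 5))  # 0.00025, 0.5, 1.0, 0.5
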
    def load_metadata_from_desc_file(self, desc_file, partition='train',
                                     max_duration=10.0):
        """ Read metadata from the description file
            (possibly takes long, depending on the filesize)
        Params:
            desc_file (str):  Path to a JSON-line file that contains labels and
                paths to the audio files
            partition (str): One of 'train', 'validation' or 'test'
            max_duration (float): In seconds, the maximum duration of
                utterances to train or test on
        """
        logger.info('Reading description file: {} for partition: {}'
                    .format(desc_file, partition))
        audio_paths, durations, texts, arpabets = [], [], [], []
        with open(desc_file, encoding='utf-8') as json_line_file:
            for line_num, json_line in enumerate(json_line_file):
                try:
                    spec = json.loads(json_line)
                    if float(spec['duration']) > max_duration:
                        continue
                    textlen = len(text_to_int_sequence(
                        text_normalize(spec['text'])))
                    speclen = len(spectrogram_from_file(spec['key']))
                    if textlen > speclen:
                        # Skip utterances whose label is longer than the
                        # feature sequence (cannot be aligned)
                        print('label longer than features; ignoring sentence')
                        continue
                    if textlen < 2:
                        print('label too short; ignoring sentence')
                        continue
                    audio_paths.append(spec['key'])
                    durations.append(float(spec['duration']))
                    texts.append(spec['text'])
                    if self.use_arpabets:
                        arpabets.append(spec['arpabet'])
                except Exception as e:
                    # Change to (KeyError, ValueError) or
                    # (KeyError,json.decoder.JSONDecodeError), depending on
                    # json module version
                    logger.warning('Error reading line #{}: {}'
                                   .format(line_num, json_line))
                    logger.warning(str(e))

        if not self.use_arpabets:
            arpabets = [''] * len(audio_paths)

        if partition == 'train':
            self.train_audio_paths = audio_paths
            self.train_durations = durations
            self.train_texts = texts
            self.train_arpabets = arpabets
        elif partition == 'validation':
            self.val_audio_paths = audio_paths
            self.val_durations = durations
            self.val_texts = texts
            self.val_arpabets = arpabets
        elif partition == 'test':
            self.test_audio_paths = audio_paths
            self.test_durations = durations
            self.test_texts = texts
            self.test_arpabets = arpabets
        else:
            raise ValueError("Invalid partition to load metadata. "
                             "Must be train/validation/test")