Example #1
def create_gta_features(model: Tacotron, train_set: DataLoader,
                        val_set: DataLoader, save_path: Path):
    model.eval()
    device = next(
        model.parameters()).device  # use same device as model parameters
    iters = len(train_set) + len(val_set)
    dataset = itertools.chain(train_set, val_set)
    for i, (x, mels, ids, mel_lens, dur) in enumerate(dataset, 1):
        x, mels, dur = x.to(device), mels.to(device), dur.to(device)
        with torch.no_grad():
            _, gta, _ = model(x, mels, dur)
        gta = gta.cpu().numpy()
        for j, item_id in enumerate(ids):
            mel = gta[j][:, :mel_lens[j]]
            mel = (mel + 4) / 8
            np.save(str(save_path / f'{item_id}.npy'), mel, allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
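Note: the (mel + 4) / 8 step rescales the model's mel output from roughly [-4, 4] into [0, 1] before saving. A minimal driver sketch follows, assuming a trained model is in scope; paths, get_tts_datasets and its argument list are placeholders borrowed from this codebase's conventions, not a fixed API.

from pathlib import Path

# Hedged usage sketch — get_tts_datasets and paths.data are assumptions.
save_path = Path('gta')
save_path.mkdir(parents=True, exist_ok=True)
train_set, val_set = get_tts_datasets(paths.data, batch_size=8, r=model.r)
create_gta_features(model, train_set, val_set, save_path)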
Example #2
class Synthesizer(object):
    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.ap = AudioProcessor(**config.audio)
        self.model = Tacotron(61, config.embedding_size, self.ap.num_freq,
                              self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def tts(self, text):
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip()
            sen += '.'
            print(sen)
            seq = np.array(
                phoneme_to_sequence(sen, text_cleaner,
                                    self.config.phoneme_language))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        return out
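A hedged usage sketch for this Synthesizer; the directory and file names are placeholders:

# Assumed file layout: a checkpoint and its JSON config in one directory.
synth = Synthesizer()
synth.load_model('/models/tts', 'best_model.pth.tar', 'config.json', use_cuda=False)
wav_io = synth.tts('Hello world. This is a test.')  # io.BytesIO holding the rendered wav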
Example #3
    def test_train_step(self):
        input = torch.randint(0, 24, (8, 128)).long().to(device)
        input_lengths = torch.randint(100, 129, (8, )).long().to(device)
        input_lengths[-1] = 128
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        linear_spec = torch.rand(8, 30, c.audio['num_freq']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(input.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) >
                        0.0).unsqueeze(2).float().squeeze()

        criterion = L1LossMasked().to(device)
        criterion_st = nn.BCEWithLogitsLoss().to(device)
        model = Tacotron(32,
                         c.audio['num_freq'],
                         c.audio['num_mels'],
                         c.r,
                         memory_size=c.memory_size).to(device)
        model.train()
        model_ref = copy.deepcopy(model)
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            assert (param - param_ref).sum() == 0, param
            count += 1
        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        for i in range(5):
            mel_out, linear_out, align, stop_tokens = model.forward(
                input, input_lengths, mel_spec)
            optimizer.zero_grad()
            loss = criterion(mel_out, mel_spec, mel_lengths)
            stop_loss = criterion_st(stop_tokens, stop_targets)
            loss = loss + criterion(linear_out, linear_spec,
                                    mel_lengths) + stop_loss
            loss.backward()
            optimizer.step()
        # check parameter changes
        count = 0
        for param, param_ref in zip(model.parameters(),
                                    model_ref.parameters()):
            # ignore the pre-highway layer since it is applied conditionally
            # if count not in [145, 59]:
            assert (param != param_ref).any(
            ), "param {} with shape {} not updated!! \n{}\n{}".format(
                count, param.shape, param, param_ref)
            count += 1
Example #4
def create_align_features(model: Tacotron, train_set: DataLoader,
                          val_set: DataLoader, save_path_alg: Path,
                          save_path_pitch: Path):
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    model.eval()
    device = next(
        model.parameters()).device  # use same device as model parameters
    iters = len(val_set) + len(train_set)
    dataset = itertools.chain(train_set, val_set)
    att_score_dict = {}

    if hp.extract_durations_with_dijkstra:
        print('Extracting durations using Dijkstra...')
        dur_extraction_func = extract_durations_with_dijkstra
    else:
        print('Extracting durations using attention peak counts...')
        dur_extraction_func = extract_durations_per_count

    for i, (x, mels, ids, x_lens, mel_lens) in enumerate(dataset, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, _, att_batch = model(x, mels)
        align_score, sharp_score = attention_score(att_batch, mel_lens, r=1)
        att_batch = np_now(att_batch)
        seq, att, mel_len, item_id = x[0], att_batch[0], mel_lens[0], ids[0]
        align_score, sharp_score = float(align_score[0]), float(sharp_score[0])
        att_score_dict[item_id] = (align_score, sharp_score)
        durs = dur_extraction_func(seq, att, mel_len)
        if np.sum(durs) != mel_len:
            print(
                f'WARNING: Sum of durations did not match mel length for item {item_id}!'
            )
        np.save(str(save_path_alg / f'{item_id}.npy'),
                durs,
                allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
    pickle_binary(att_score_dict, paths.data / 'att_score_dict.pkl')
    print('Extracting Pitch Values...')
    extract_pitch(save_path_pitch)
Example #5
def tacotron(pretrained=True, **kwargs):
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold)
    if pretrained:
        state_dict = fetch_and_load_state_dict("tacotron")
        state_dict["decoder.r"] = state_dict.pop("r")
        state_dict["stop_threshold"] = torch.tensor(hp.tts_stop_threshold,
                                                    dtype=torch.float32)
        model.load_state_dict(state_dict)
    return model
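The loader renames the checkpoint's 'r' entry to 'decoder.r' and injects stop_threshold from hparams so the state dict matches the current module layout. Using the factory might look like this (the generate() call is an assumption about this fork's inference API):

model = tacotron(pretrained=True)  # weights come via fetch_and_load_state_dict
model.eval()
# mel, linear, attention = model.generate(seq)  # assumed inference entry point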
Example #6
def train(args):
    save_dir = './checkpoint/1'
    checkpoint_path = os.path.join(save_dir, 'model.ckpt')

    coord = tf.train.Coordinator()
    feeder = DataFeeder(coord, mode=1)

    model = Tacotron()
    model.initialize(feeder.enc_input, feeder.sequence_length,
                     feeder.dec_input, feeder.mel_target)
    saver = tf.train.Saver(max_to_keep=5, keep_checkpoint_every_n_hours=2)

    with tf.Session() as sess:
        try:
            sess.run(tf.global_variables_initializer())

            if args.step:
                load_dir = '{}-{}'.format(checkpoint_path, args.step)
                saver.restore(sess, load_dir)

            feeder.start_in_session(sess)

            while not coord.should_stop():
                step, loss, optimize = sess.run(
                    [model.global_step, model.loss, model.optimize])
                print('Step: {}, Loss: {:.5f}'.format(step, loss))

                if step % checkpoint_step == 0:
                    saver.save(sess, checkpoint_path, global_step=step)
                    input_seq, alignment, pred, target = \
                        sess.run([model.enc_input[0], model.alignment[0], model.mel_output[0], model.mel_target[0]])

                    input_seq = sequence_to_text(input_seq)
                    alignment_dir = os.path.join(
                        save_dir, 'step-{}-align.png'.format(step))
                    plot_alignment(alignment, alignment_dir, input_seq)

        except Exception as e:
            traceback.print_exc()
            coord.request_stop(e)
Example #7
class Synthesizer:
    def load(self, checkpoint_path, model_name='tacotron'):
        print('Constructing model: %s' % model_name)
        inputs = tf.placeholder(tf.int32, [1, None], 'inputs')
        input_lengths = tf.placeholder(tf.int32, [1], 'input_lengths')
        with tf.variable_scope('model') as scope:
            self.model = Tacotron(hparams)
            self.model.initialize(inputs, input_lengths)
            pprint('>>> Model Linear Outputs:')
            pprint(self.model.linear_outputs[0])
            self.wav_output = audio.inv_spectrogram_tensorflow(
                self.model.linear_outputs[0])

        print('Loading checkpoint: %s' % checkpoint_path)
        self.session = tf.Session()
        self.session.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(self.session, checkpoint_path)

    def synthesize(self, text):
        cleaner_names = [x.strip() for x in hparams.cleaners.split(',')]
        seq = text_to_sequence(text, cleaner_names)
        pprint('Text: ' + text)
        #pprint('Seq')
        #pprint(seq)
        feed_dict = {
            self.model.inputs: [np.asarray(seq, dtype=np.int32)],
            self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
        }
        pprint(self.wav_output)
        pprint('>>> Getting wav')
        wav = self.session.run(self.wav_output, feed_dict=feed_dict)
        pprint('>>> Got wav')
        #wav = audio.inv_preemphasis(wav)
        # The audio is typically ~13 seconds unless truncated:
        #wav = wav[:audio.find_endpoint(wav)]
        out = io.BytesIO()
        audio.save_wav(wav, out)
        return out.getvalue()
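A hedged usage sketch; the checkpoint path is a placeholder:

synth = Synthesizer()
synth.load('/models/tacotron/model.ckpt-185000')  # hypothetical TF checkpoint path
wav_bytes = synth.synthesize('Hello world.')      # raw wav bytes, ready to write to a file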
Example #9
 def train(self, model: Tacotron, optimizer: Optimizer) -> None:
     for i, session_params in enumerate(hp.tts_schedule, 1):
         r, lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set = get_tts_datasets(path=self.paths.data,
                                                   batch_size=bs,
                                                   r=r,
                                                   model_type='tacotron')
             session = TTSSession(index=i,
                                  r=r,
                                  lr=lr,
                                  max_step=max_step,
                                  bs=bs,
                                  train_set=train_set,
                                  val_set=val_set)
             self.train_session(model, optimizer, session)
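Each hp.tts_schedule entry unpacks as (reduction factor r, learning rate, max step, batch size), and any session whose max_step the model has already passed is skipped. An illustrative schedule, with values that are an assumption rather than the repo's defaults:

# (r, lr, max_step, bs)
tts_schedule = [(7, 1e-3, 10_000, 32),
                (5, 3e-4, 80_000, 32),
                (2, 1e-4, 180_000, 16),
                (1, 1e-4, 350_000, 8)]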
Example #10
def create_align_features(model: Tacotron, train_set, save_path: Path):
    assert model.r == 1, f'Reduction factor of tacotron must be 1 for creating alignment features! ' \
                         f'Reduction factor was: {model.r}'
    device = next(model.parameters()).device  # use same device as model parameters
    iters = len(train_set)
    for i, (x, mels, ids, mel_lens, _) in enumerate(train_set, 1):
        x, mels = x.to(device), mels.to(device)
        with torch.no_grad():
            _, _, attn = model(x, mels)
        attn = np_now(attn)
        bs, chars = attn.shape[0], attn.shape[2]
    argmax = np.argmax(attn, axis=2)
    mel_counts = np.zeros(shape=(bs, chars), dtype=np.int32)
    for b in range(bs):
        count = np.bincount(argmax[b, :])
        mel_counts[b, :len(count)] = count
        for j, item_id in enumerate(ids):
            np.save(save_path / f'{item_id}.npy', mel_counts[j, :], allow_pickle=False)
        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
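The duration estimate here is just a count of how many decoder frames argmax-attend to each input character. A tiny worked example with made-up attention argmaxes:

import numpy as np

argmax_b = np.array([0, 0, 1, 1, 1, 2])  # frame t attends to character argmax_b[t]
durs = np.bincount(argmax_b)             # -> array([2, 3, 1])
assert durs.sum() == len(argmax_b)       # every mel frame is assigned to a character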
Example #11
 def train(self, model: Tacotron, optimizer: Optimizer) -> None:
     tts_schedule = self.train_cfg['schedule']
     tts_schedule = parse_schedule(tts_schedule)
     for i, session_params in enumerate(tts_schedule, 1):
         r, lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set = get_tts_datasets(
                 path=self.paths.data,
                 batch_size=bs,
                 r=r,
                 model_type='tacotron',
                 max_mel_len=self.train_cfg['max_mel_len'],
                 filter_attention=False)
             session = TTSSession(index=i,
                                  r=r,
                                  lr=lr,
                                  max_step=max_step,
                                  bs=bs,
                                  train_set=train_set,
                                  val_set=val_set)
             self.train_session(model, optimizer, session=session)
Example #12
def create_gta_features(model: Tacotron, train_set, save_path: Path):
    save_path.mkdir(parents=False, exist_ok=True)
    device = next(
        model.parameters()).device  # use same device as model parameters

    iters = len(train_set)

    for i, (x, mels, ids, mel_lens) in enumerate(train_set, 1):

        x, mels = x.to(device), mels.to(device)

        with torch.no_grad():
            _, gta, _ = model(x, mels)

        gta = gta.cpu().numpy()

        for j, item_id in enumerate(ids):
            mel = gta[j][:, :mel_lens[j]]
            mel = (mel + 4) / 8
            np.save(save_path / f'{item_id}.npy', mel, allow_pickle=False)

        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
Example #13
def main(args):
    # DISTRIBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)
    model = Tacotron(num_chars=num_chars,
                     embedding_dim=c.embedding_size,
                     linear_dim=ap.num_freq,
                     mel_dim=ap.num_mels,
                     r=c.r,
                     memory_size=c.memory_size)

    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(),
                              lr=c.lr,
                              weight_decay=0)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        except Exception:
            print(" > Partial model initialization.")
            partial_init_flag = True
            model_dict = model.state_dict()
            # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
            # 1. filter out unnecessary keys
            pretrained_dict = {
                k: v
                for k, v in checkpoint['model'].items() if k in model_dict
            }
            # 2. filter out different size layers
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items()
                if v.numel() == model_dict[k].numel()
            }
            # 3. overwrite entries in the existing state dict
            model_dict.update(pretrained_dict)
            # 4. load the new state dict
            model.load_state_dict(model_dict)
            print(" | > {} / {} layers are initialized".format(
                len(pretrained_dict), len(model_dict)))
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        start_epoch = checkpoint['epoch']
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    # DISTRIBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         optimizer, optimizer_st, scheduler,
                                         ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, current_step,
                            epoch)
        print(" | > Training Loss: {:.5f}   Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
Example #14
File: eval.py, Project: geneing/TTS
class Synthesizer(object):
    def load_model(self, model_path, model_config, wavernn_path, use_cuda):
        
        self.model_file = model_path
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)
        
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.config.text_cleaner], self.config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(sen, [self.config.text_cleaner])
        
        self.model = Tacotron(self.input_size, config.embedding_size,
                              self.ap.num_freq, self.ap.num_mels, config.r,
                              attn_windowing=True)
        self.model.decoder.max_decoder_steps = 8000
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()
        self.vocoder = WaveRNNVocoder.Vocoder()
        self.vocoder.loadWeights(wavernn_path)
        self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)


    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    # Split the text into chunks smaller than maxlen, preferably on punctuation.

    def ttmel(self, text):
        mel_ret = []
        text_list = split_text(text, maxlen)
        for t in text_list:
            if len(t) < 3:
                continue
            seq = np.array(self.input_adapter(t))
            
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, _, alignments, stop_tokens = self.model.forward(chars_var)
            mel_out = mel_out[0].data.cpu().numpy().T
            mel_ret.append(mel_out)
        return np.hstack(mel_ret)

    def tts(self, mel):
        wav = self.vocoder.melToWav(mel)
        return wav
Example #15
# NOTE: the source fragment begins mid-function. The decorator, signature and
# GradientTape context below are a hedged reconstruction inferred from the
# call site at the bottom of this example; the exact model call is an
# assumption. MAE here is tf.keras.losses.MAE; DataGenerator, batch_size and
# mel_dim come from the truncated preamble.
@tf.function
def train_step(enc_input, dec_input, dec_target, text_length):
    with tf.GradientTape() as tape:
        pred, alignment = model(enc_input, dec_input, text_length)
        loss = tf.reduce_mean(MAE(dec_target, pred))
    variables = model.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return loss, pred[0], alignment[0]


dataset = tf.data.Dataset.from_generator(generator=DataGenerator,
                                         output_types=(tf.float32, tf.float32, tf.float32, tf.int32),
                                         output_shapes=(tf.TensorShape([batch_size, None]),
                                                        tf.TensorShape([batch_size, None, mel_dim]),
                                                        tf.TensorShape([batch_size, None, mel_dim]),
                                                        tf.TensorShape([batch_size])))\
    .prefetch(tf.data.experimental.AUTOTUNE)

model = Tacotron(K=16, conv_dim=[128, 128])
optimizer = Adam()
step = tf.Variable(0)

checkpoint_dir = './checkpoint/1'
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model, step=step)
manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=5)

checkpoint.restore(manager.latest_checkpoint)
if manager.latest_checkpoint:
    print('Restore checkpoint from {}'.format(manager.latest_checkpoint))

try:
    for text, dec, mel, text_length in dataset:
        loss, pred, alignment = train_step(text, dec, mel, text_length)
# The source is truncated here; a minimal handler (an assumption) closes the
# try block so the example parses:
except KeyboardInterrupt:
    manager.save()  # tf.train.CheckpointManager.save() writes a checkpoint
Example #16
                      res_blocks=hp.voc_res_blocks,
                      hop_length=hp.hop_length,
                      sample_rate=hp.sample_rate).cuda()

    voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(r=hp.tts_r,
                         embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout).cuda()

    tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]
Example #17
                      res_out_dims=hp.voc_res_out_dims,
                      res_blocks=hp.voc_res_blocks,
                      hop_length=hp.hop_length,
                      sample_rate=hp.sample_rate).cuda()

    voc_model.restore(paths.voc_latest_weights)

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout).cuda()

    tts_restore_path = weights_path if weights_path else paths.tts_latest_weights
    tts_model.restore(tts_restore_path)

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]
Example #18
File: train.py, Project: wurde/TTS
def main(args):
    model = Tacotron(c.embedding_size, ap.num_freq, ap.num_mels, c.r)
    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer_st = optim.Adam(
        model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint['model'])
        except Exception:
            model_dict = model.state_dict()
            # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
            # 1. filter out unnecessary keys
            pretrained_dict = {
                k: v
                for k, v in checkpoint['model'].items() if k in model_dict
            }
            # 2. overwrite entries in the existing state dict
            model_dict.update(pretrained_dict)
            # 3. load the new state dict
            model.load_state_dict(model_dict)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        optimizer.load_state_dict(checkpoint['optimizer'])
        print(
            " > Model restored from step %d" % checkpoint['step'], flush=True)
        start_epoch = checkpoint['epoch']
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training", flush=True)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    if c.lr_decay:
        scheduler = NoamLR(
            optimizer,
            warmup_steps=c.warmup_steps,
            last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params), flush=True)

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         optimizer, optimizer_st,
                                         scheduler, ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap,
                            current_step)
        print(
            " | > Train Loss: {:.5f}   Validation Loss: {:.5f}".format(
                train_loss, val_loss),
            flush=True)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
Example #19
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode='MOL').to(device)

    voc_model.load('quick_start/voc_weights/latest_weights.pyt')

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_model.load('quick_start/tts_weights/latest_weights.pyt')

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]
Example #20
def main():
    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train Tacotron TTS')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--force_gta',
                        '-g',
                        action='store_true',
                        help='Force the model to create GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # Load hparams from file
    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    force_train = args.force_train
    force_gta = args.force_gta

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        for session in hp.tts_schedule:
            _, _, _, batch_size = session
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts', paths, model, optimizer, create_if_missing=True)

    if not force_gta:
        for i, session in enumerate(hp.tts_schedule):
            current_step = model.get_step()

            r, lr, max_step, batch_size = session

            training_steps = max_step - current_step

            # Do we need to change to the next session?
            if current_step >= max_step:
                # Are there no further sessions than the current one?
                if i == len(hp.tts_schedule) - 1:
                    # There are no more sessions. Check if we force training.
                    if force_train:
                        # Don't finish the loop - train forever
                        training_steps = 999_999_999
                    else:
                        # Training is complete; breaking here is equivalent to continuing.
                        break
                else:
                    # There is a following session, go to it
                    continue

            model.r = r

            simple_table([('Steps with r=%s' % (repr1(r)),
                           str(training_steps // 1000) + 'k Steps'),
                          ('Batch Size', batch_size), ('Learning Rate', lr),
                          ('Outputs/Step (r)', model.r)])

            train_set, attn_example = get_tts_datasets(paths.data, batch_size,
                                                       r)
            tts_train_loop(paths, model, optimizer, train_set, lr,
                           training_steps, attn_example)

        print('Training Complete.')
        print(
            'To continue training increase tts_total_steps in hparams.py or use --force_train\n'
        )

    print('Creating Ground Truth Aligned Dataset...\n')

    train_set, attn_example = get_tts_datasets(paths.data, 8, model.r)
    create_gta_features(model, train_set, paths.gta)

    print(
        '\n\nYou can now train WaveRNN on GTA features - use python train_wavernn.py --gta\n'
    )
Example #21
    def train_session(self, model: Tacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        model.r = session.r
        simple_table([(f'Steps with r={session.r}',
                       str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr),
                      ('Outputs/Step (r)', model.r)])
        for g in optimizer.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, (x, m, ids, x_lens,
                    mel_lens) in enumerate(session.train_set, 1):
                start = time.time()
                model.train()
                x, m = x.to(device), m.to(device)

                m1_hat, m2_hat, attention = model(x, m)

                m1_loss = F.l1_loss(m1_hat, m)
                m2_loss = F.l1_loss(m2_hat, m)
                loss = m1_loss + m2_loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               hp.tts_clip_grad_norm)
                optimizer.step()
                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.tts_checkpoint_every == 0:
                    ckpt_name = f'taco_step{k}K'
                    save_checkpoint('tts',
                                    self.paths,
                                    model,
                                    optimizer,
                                    name=ckpt_name,
                                    is_silent=True)

                if step % hp.tts_plot_every == 0:
                    self.generate_plots(model, session)

                _, att_score = attention_score(attention, mel_lens)
                att_score = torch.mean(att_score)
                self.writer.add_scalar('Attention_Score/train', att_score,
                                       model.get_step())
                self.writer.add_scalar('Loss/train', loss, model.get_step())
                self.writer.add_scalar('Params/reduction_factor', session.r,
                                       model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_loss, val_att_score = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            self.writer.add_scalar('Attention_Score/val', val_att_score,
                                   model.get_step())
            save_checkpoint('tts',
                            self.paths,
                            model,
                            optimizer,
                            is_silent=True)

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
Example #22
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(phonemes),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)

    if input_text:
        text = clean_text(input_text.strip())
        inputs = [text_to_sequence(text)]
    else:
        with open('sentences.txt') as f:
Example #23
def tts_train_loop(paths: Paths, model: Tacotron, optimizer, train_set, lr,
                   train_steps, attn_example):
    device = next(
        model.parameters()).device  # use same device as model parameters

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = train_steps // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0

        # Perform 1 epoch
        for i, (x, m, ids, _) in enumerate(train_set, 1):

            x, m = x.to(device), m.to(device)

            # Parallelize the model onto GPUs, using a workaround for a Python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                m1_hat, m2_hat, attention = data_parallel_workaround(
                    model, x, m)
            else:
                m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss

            optimizer.zero_grad()
            loss.backward()
            if hp.tts_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.tts_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')

            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.tts_checkpoint_every == 0:
                ckpt_name = 'taco_step%sK' % (repr1(k))
                save_checkpoint('tts',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            if attn_example in ids:
                idx = ids.index(attn_example)
                save_attention(np_now(attention[idx][:, :160]),
                               paths.tts_attention / '%s' % (repr1(step)))
                save_spectrogram(np_now(m2_hat[idx]),
                                 paths.tts_mel_plot / '%s' % (repr1(step)),
                                 600)

            msg = '| Epoch: %s/%s (%s/%s) | Loss: %.4f | %.2f steps/s | Step: %sk | ' % (
                repr1(e), repr1(epochs), repr1(i), repr1(total_iters),
                avg_loss, speed, repr1(k))
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('tts', paths, model, optimizer, is_silent=True)
        model.log(paths.tts_log, msg)
        print(' ')
Example #24
def thak():
    class Tshamsoo():
        force_cpu = os.getenv('FORCE_CPU', False)
        hp_file = 'hparams.py'
        vocoder = os.getenv('VOCODER', 'wavernn')
        batched = os.getenv('BATCHED', True)
        target = os.getenv('TARGET', None)
        overlap = os.getenv('OVERLAP', None)
        tts_weights = None
        save_attn = os.getenv('SAVE_ATTN', False)
        voc_weights = None
        iters = os.getenv('GL_ITERS', 32)

    args = Tshamsoo()
    if args.vocoder in ['griffinlim', 'gl']:
        args.vocoder = 'griffinlim'
    elif args.vocoder in ['wavernn', 'wr']:
        args.vocoder = 'wavernn'
    else:
        raise ValueError('Must provide a valid vocoder type!')

    hp.configure(args.hp_file)  # Load hparams from file

    tts_weights = args.tts_weights
    save_attn = args.save_attn

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    if args.vocoder == 'wavernn':
        # set defaults for any arguments that depend on hparams
        if args.target is None:
            args.target = hp.voc_target
        if args.overlap is None:
            args.overlap = hp.voc_overlap
        if args.batched is None:
            args.batched = hp.voc_gen_batched

        batched = args.batched
        target = int(args.target)
        overlap = int(args.overlap)

        print('\nInitialising WaveRNN Model...\n')
        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode=hp.voc_mode).to(device)

        voc_load_path = args.voc_weights if args.voc_weights else paths.voc_latest_weights
        voc_model.load(voc_load_path)
    else:
        voc_model = None
        batched = None
        target = None
        overlap = None

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                         num_chars=len(symbols),
                         encoder_dims=hp.tts_encoder_dims,
                         decoder_dims=hp.tts_decoder_dims,
                         n_mels=hp.num_mels,
                         fft_bins=hp.num_mels,
                         postnet_dims=hp.tts_postnet_dims,
                         encoder_K=hp.tts_encoder_K,
                         lstm_dims=hp.tts_lstm_dims,
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_load_path = tts_weights if tts_weights else paths.tts_latest_weights
    tts_model.load(tts_load_path)
    return args, voc_model, tts_model, batched, target, overlap, save_attn
Example #25
def main(args):
    # Setup the dataset
    train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'train_metadata.csv'),
                                    os.path.join(c.data_path, 'wavs'),
                                    c.r,
                                    c.sample_rate,
                                    c.text_cleaner,
                                    c.num_mels,
                                    c.min_level_db,
                                    c.frame_shift_ms,
                                    c.frame_length_ms,
                                    c.preemphasis,
                                    c.ref_level_db,
                                    c.num_freq,
                                    c.power,
                                    min_seq_len=c.min_seq_len
                                    )

    train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
                              shuffle=False, collate_fn=train_dataset.collate_fn,
                              drop_last=False, num_workers=c.num_loader_workers,
                              pin_memory=True)
    '''   
    val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'valid_metadata.csv'),
                                  os.path.join(c.data_path, 'wavs'),
                                  c.r,
                                  c.sample_rate,
                                  c.text_cleaner,
                                  c.num_mels,
                                  c.min_level_db,
                                  c.frame_shift_ms,
                                  c.frame_length_ms,
                                  c.preemphasis,
                                  c.ref_level_db,
                                  c.num_freq,
                                  c.power
                                  )

    val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size,
                            shuffle=False, collate_fn=val_dataset.collate_fn,
                            drop_last=False, num_workers=4,
                            pin_memory=True)
    
    '''
    model = Tacotron(c.embedding_size,
                     c.num_freq,
                     c.num_mels,
                     c.r)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        optimizer.load_state_dict(checkpoint['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
        print(" > Model restored from step %d" % checkpoint['step'])
        start_epoch = checkpoint['step'] // len(train_loader)
        best_loss = checkpoint['linear_loss']
        start_epoch = 0
        args.restore_step = checkpoint['step']
        optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)
    else:
        args.restore_step = 0
        print("\n > Starting a new training")

    if use_cuda:
        model = nn.DataParallel(model.cuda())
        criterion.cuda()
        criterion_st.cuda()

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(
            model, criterion, criterion_st, train_loader, optimizer, optimizer_st, epoch)
        #val_loss = evaluate(model, criterion, criterion_st, val_loader, current_step)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss, OUT_PATH, current_step, epoch)
Example #26
class TaiwaneseTacotron():
    def __init__(self):
        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.force_cpu = False
        self.args.iters = 32  # default Griffin-Lim iteration count referenced by the griffinlim branches
        #================ vocoder ================#
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            raise ValueError('Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        #================ others ================#
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        print("hello")
        print(paths.base)
        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        # === Wavernn === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                     fc_dims=hp.voc_fc_dims,
                                     bits=hp.bits,
                                     pad=hp.voc_pad,
                                     upsample_factors=hp.voc_upsample_factors,
                                     feat_dims=hp.num_mels,
                                     compute_dims=hp.voc_compute_dims,
                                     res_out_dims=hp.voc_res_out_dims,
                                     res_blocks=hp.voc_res_blocks,
                                     hop_length=hp.hop_length,
                                     sample_rate=hp.sample_rate,
                                     mode=hp.voc_mode).to(device)

            voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
            #print(paths.voc_latest_weights)
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(
                embed_dims=hp.tts_embed_dims,
                num_chars=len(symbols),
                encoder_dims=hp.tts_encoder_dims,
                decoder_dims=hp.tts_decoder_dims,
                n_mels=hp.num_mels,
                fft_bins=hp.num_mels,
                postnet_dims=hp.tts_postnet_dims,
                encoder_K=hp.tts_encoder_K,
                lstm_dims=hp.tts_lstm_dims,
                postnet_K=hp.tts_postnet_K,
                num_highways=hp.tts_num_highways,
                dropout=hp.tts_dropout,
                stop_threshold=hp.tts_stop_threshold).to(device)

            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitializing Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Infomation === #
        if hp.tts_model == 'tacotron':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron', str(tts_k) + 'k'), ('r', self.tts_model.r),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron', str(tts_k) + 'k'),
                              ('r', self.tts_model.r),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

        elif hp.tts_model == 'tacotron2':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron2', str(tts_k) + 'k'),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron2', str(tts_k) + 'k'),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

    def generate(self, 華, input_text):
        inputs = [text_to_sequence(input_text.strip(), ['basic_cleaners'])]
        if hp.tts_model == 'tacotron2':
            self.gen_tacotron2(華, inputs, input_text)

        elif hp.tts_model == 'tacotron':
            self.gen_tacotron(華, inputs, input_text)

        else:
            print(f"Wrong tts model type: {hp.tts_model}")

        print('\n\nDone.\n')

    # custom function
    def gen_tacotron2(self, 華, inputs, input_text):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            print(x)

            x = np.array(x)[None, :]
            x = torch.from_numpy(x).cuda().long()

            self.tts_model.eval()
            mel_outputs, mel_outputs_postnet, _, alignments = self.tts_model.inference(
                x)
            # v_type is informational only; it is not used below
            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'

            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'[,.!? ]', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            elif 9 < len(華):
                output_name = 華[:8]
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##

            if self.args.vocoder == 'wavernn':
                m = mel_outputs_postnet
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)

            elif self.args.vocoder == 'griffinlim':
                m = torch.squeeze(mel_outputs_postnet).detach().cpu().numpy()
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)

    # custom function
    def gen_tacotron(self, 華, inputs, input_text):
        for i, x in enumerate(inputs, 1):
            print(f'\n| Generating {i}/{len(inputs)}')
            _, m, attention = self.tts_model.generate(x)
            # Fix mel spectrogram scaling to be from 0 to 1
            m = (m + 4) / 8
            np.clip(m, 0, 1, out=m)
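            # (m + 4) / 8 maps mels assumed to lie in [-4, 4] onto [0, 1];
            # the clip guards against occasional out-of-range frames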

            if self.args.vocoder == 'griffinlim':
                v_type = self.args.vocoder
            elif self.args.vocoder == 'wavernn' and self.args.batched:
                v_type = 'wavernn_batched'
            else:
                v_type = 'wavernn_unbatched'
            # == define output name == #
            if len(華) == 0:
                output_name = re.split(r'[,.!? ]', input_text)[0]
            elif 1 <= len(華) <= 9:
                output_name = 華[:-1]
            elif 9 < len(華):
                output_name = 華[:8]
            print(output_name)
            save_path = "output/{}.wav".format(output_name)
            ##
            if self.args.vocoder == 'wavernn':
                m = torch.tensor(m).unsqueeze(0)
                self.voc_model.generate(m, save_path, self.args.batched,
                                        hp.voc_target, hp.voc_overlap,
                                        hp.mu_law)

            elif self.args.vocoder == 'griffinlim':
                wav = reconstruct_waveform(m, n_iter=self.args.iters)
                save_wav(wav, save_path)
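
For reference, a minimal driver for the wrapper class above might look like the sketch below. The class name 'Synthesizer' is an assumption (the enclosing class's definition is not shown in this snippet), as are the example strings.

# Hypothetical usage sketch; 'Synthesizer' is an assumed name for the class
# whose methods appear above. 華 holds an optional Chinese string used only
# to name the output wav; input_text is the text that gets synthesised.
if __name__ == '__main__':
    synth = Synthesizer()                # builds the vocoder and TTS models
    synth.generate('你好。', 'ni hao.')   # writes output/<name>.wav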
Example #27
    args = parser.parse_args()

    batch_size = args.batch_size
    force_train = args.force_train
    force_gta = args.force_gta
    lr = args.lr

    print('\nInitialising Tacotron Model...\n')

    # Instantiate Tacotron Model
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout).cuda()

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    model.restore(paths.tts_latest_weights)

    # model.reset_step()

    # model.set_r(hp.tts_r)

    optimiser = optim.Adam(model.parameters())
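
Note: the optimiser above is created with Adam's default learning rate; the lr parsed from the arguments is never passed in. A minimal way to apply it, mirroring the per-step assignment used in the training loop of Example #29 below, would be:

    # Sketch only: apply the parsed lr to the optimiser (an assumption about
    # the snippet's intent; the parsed lr is otherwise unused here).
    for param_group in optimiser.param_groups:
        param_group['lr'] = lr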
Example #28
    def __init__(self):
        # Parse arguments (every flag is hard-coded below rather than declared on the parser)
        parser = argparse.ArgumentParser(description='TTS')
        self.args = parser.parse_args()
        self.args.vocoder = 'wavernn'
        self.args.hp_file = 'hparams.py'
        self.args.voc_weights = False
        self.args.tts_weights = False
        self.args.save_attn = False
        self.args.batched = True
        self.args.target = None
        self.args.overlap = None
        self.args.force_cpu = False
        #================ vocoder ================#
        if self.args.vocoder in ['griffinlim', 'gl']:
            self.args.vocoder = 'griffinlim'
        elif self.args.vocoder in ['wavernn', 'wr']:
            self.args.vocoder = 'wavernn'
        else:
            # argparse.ArgumentError requires an argument object as its first
            # parameter, so a plain ValueError is the correct exception here
            raise ValueError('Must provide a valid vocoder type!')

        hp.configure(self.args.hp_file)  # Load hparams from file

        # set defaults for any arguments that depend on hparams
        if self.args.vocoder == 'wavernn':
            if self.args.target is None:
                self.args.target = hp.voc_target
            if self.args.overlap is None:
                self.args.overlap = hp.voc_overlap
            if self.args.batched is None:
                self.args.batched = hp.voc_gen_batched

        #================ others ================#
        paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)
        print("hello")
        print(paths.base)
        if not self.args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        # === Wavernn === #
        if self.args.vocoder == 'wavernn':
            print('\nInitialising WaveRNN Model...\n')
            self.voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                                     fc_dims=hp.voc_fc_dims,
                                     bits=hp.bits,
                                     pad=hp.voc_pad,
                                     upsample_factors=hp.voc_upsample_factors,
                                     feat_dims=hp.num_mels,
                                     compute_dims=hp.voc_compute_dims,
                                     res_out_dims=hp.voc_res_out_dims,
                                     res_blocks=hp.voc_res_blocks,
                                     hop_length=hp.hop_length,
                                     sample_rate=hp.sample_rate,
                                     mode=hp.voc_mode).to(device)

            voc_load_path = self.args.voc_weights if self.args.voc_weights else paths.voc_latest_weights
            #print(paths.voc_latest_weights)
            self.voc_model.load(voc_load_path)

        # === Tacotron === #
        if hp.tts_model == 'tacotron':
            print('\nInitialising Tacotron Model...\n')
            self.tts_model = Tacotron(
                embed_dims=hp.tts_embed_dims,
                num_chars=len(symbols),
                encoder_dims=hp.tts_encoder_dims,
                decoder_dims=hp.tts_decoder_dims,
                n_mels=hp.num_mels,
                fft_bins=hp.num_mels,
                postnet_dims=hp.tts_postnet_dims,
                encoder_K=hp.tts_encoder_K,
                lstm_dims=hp.tts_lstm_dims,
                postnet_K=hp.tts_postnet_K,
                num_highways=hp.tts_num_highways,
                dropout=hp.tts_dropout,
                stop_threshold=hp.tts_stop_threshold).to(device)

            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Tacotron2 === #
        elif hp.tts_model == 'tacotron2':
            print('\nInitialising Tacotron2 Model...\n')
            self.tts_model = Tacotron2().to(device)
            tts_load_path = self.args.tts_weights if self.args.tts_weights else paths.tts_latest_weights
            self.tts_model.load(tts_load_path)

        # === Information === #
        if hp.tts_model == 'tacotron':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron', str(tts_k) + 'k'), ('r', self.tts_model.r),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron', str(tts_k) + 'k'),
                              ('r', self.tts_model.r),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])

        elif hp.tts_model == 'tacotron2':
            if self.args.vocoder == 'wavernn':
                voc_k = self.voc_model.get_step() // 1000
                tts_k = self.tts_model.get_step() // 1000

                simple_table([
                    ('Tacotron2', str(tts_k) + 'k'),
                    ('Vocoder Type', 'WaveRNN'), ('WaveRNN', str(voc_k) + 'k'),
                    ('Generation Mode',
                     'Batched' if self.args.batched else 'Unbatched'),
                    ('Target Samples',
                     self.args.target if self.args.batched else 'N/A'),
                    ('Overlap Samples',
                     self.args.overlap if self.args.batched else 'N/A')
                ])

            elif self.args.vocoder == 'griffinlim':
                tts_k = self.tts_model.get_step() // 1000
                simple_table([('Tacotron2', str(tts_k) + 'k'),
                              ('Vocoder Type', 'Griffin-Lim'),
                              ('GL Iters', self.args.iters)])
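
The __init__ above calls parse_args() on an empty parser and then hard-codes every attribute. A sketch of the conventional argparse equivalent follows; the flag names simply mirror the attributes set manually and are assumptions, not the project's actual CLI.

parser = argparse.ArgumentParser(description='TTS')
parser.add_argument('--vocoder', default='wavernn',
                    choices=['wavernn', 'wr', 'griffinlim', 'gl'])
parser.add_argument('--hp_file', default='hparams.py')
parser.add_argument('--voc_weights', default=False)
parser.add_argument('--tts_weights', default=False)
parser.add_argument('--save_attn', action='store_true')
parser.add_argument('--batched', action='store_true', default=True)
parser.add_argument('--target', type=int, default=None)
parser.add_argument('--overlap', type=int, default=None)
parser.add_argument('--force_cpu', action='store_true')
args = parser.parse_args()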
Example #29
File: train.py  Project: Vproject/TTS
def main(args):

    # setup output paths and read configs
    c = load_config(args.config_path)
    this_dir = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(this_dir, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name+'_tts')
    pickle.dump(c, open(tmp_path, "wb"))

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(sig, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                             )

    dataloader = DataLoader(dataset, batch_size=c.batch_size,
                            shuffle=True, collate_fn=dataset.collate_fn,
                            drop_last=True, num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(
            args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        start_epoch = checkpoint['step'] // len(dataloader)
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)
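    # e.g. with sample_rate=22050 and num_freq=1025 (typical LJSpeech-style
    # settings; an assumption): int(3000 / 11025 * 1025) = 278, i.e. the
    # linear-spectrogram bins below roughly 3 kHz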

    #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                               patience=c.lr_patience, verbose=True)
    epoch_time = 0
    if not args.restore_step:
        best_loss = float('inf')  # keep the best loss restored from a checkpoint
    for epoch in range(start_epoch, c.epochs):

        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)

        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1
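            # global step = steps from the restored checkpoint + steps from
            # earlier epochs in this run + steps within the current epoch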

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            #try:
            #    mel_input = np.concatenate((np.zeros(
            #        [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #        mel_input[:, 1:, :]), axis=1)
            #except:
            #    raise TypeError("not same dimension")

            # convert inputs to variables (no volatile=True on the linear
            # target: volatility would propagate into the loss and break
            # backward())
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input)

            # sort sequence by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(
                     text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            input_lengths = torch.LongTensor(sorted_lengths)
            if use_cuda:
                input_lengths = input_lengths.cuda()
            mel_output, linear_output, alignments = \
                model.forward(text_input_var, mel_spec_var,
                              input_lengths=Variable(input_lengths))

            mel_loss = criterion(mel_output, mel_spec_var)
            #linear_loss = torch.abs(linear_output - linear_spec_var)
            #linear_loss = 0.5 * \
                #torch.mean(linear_loss) + 0.5 * \
                #torch.mean(linear_loss[:, :n_priority_freq, :])
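            # 50/50 blend of full-band L1 and L1 restricted to the lowest
            # n_priority_freq bins (below ~3 kHz), emphasising the band that
            # matters most for intelligibility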
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                      linear_spec_var[:, :, :n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter + 1,
                           values=[('total_loss', loss.data[0]),
                                   ('linear_loss', linear_loss.data[0]),
                                   ('mel_loss', mel_loss.data[0]),
                                   ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
                          current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                          current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:

                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()

                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except Exception:
                    print("\n > Error adding audio signal to TensorBoard!!")
                    print(audio_signal.max())
                    print(audio_signal.min())


        # average linear loss over the epoch (progbar keeps running sum and count)
        avg_epoch_loss = (progbar.sum_values['linear_loss'][0] /
                          max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss,
                                    best_loss, OUT_PATH,
                                    current_step, epoch)

        #lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
Example #30
            if batch_size % torch.cuda.device_count() != 0:
                raise ValueError(
                    '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    # Instantiate Tacotron Model
    print('\nInitialising Tacotron Model...\n')
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(phonemes),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold).to(device)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(f'Num Params: {params}')
    optimizer = optim.Adam(model.parameters())
    restore_checkpoint('tts',
                       paths,
                       model,
                       optimizer,