class Synthesizer(object):
    def load_model(self, model_path, model_name, model_config, use_cuda):
        model_config = os.path.join(model_path, model_config)
        self.model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.ap = AudioProcessor(**config.audio)
        self.model = Tacotron(61, config.embedding_size, self.ap.num_freq,
                              self.ap.num_mels, config.r)
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file,
                            map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    def tts(self, text):
        text_cleaner = [self.config.text_cleaner]
        wavs = []
        for sen in text.split('.'):
            if len(sen) < 3:
                continue
            sen = sen.strip()
            sen += '.'
            print(sen)
            sen = sen.strip()
            seq = np.array(
                phoneme_to_sequence(sen, text_cleaner,
                                    self.config.phoneme_language))
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)
            linear_out = linear_out[0].data.cpu().numpy()
            wav = self.ap.inv_spectrogram(linear_out.T)
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)

        return out
Exemplo n.º 2
0
def tacotron(pretrained=True, **kwargs):
    model = Tacotron(embed_dims=hp.tts_embed_dims,
                     num_chars=len(symbols),
                     encoder_dims=hp.tts_encoder_dims,
                     decoder_dims=hp.tts_decoder_dims,
                     n_mels=hp.num_mels,
                     fft_bins=hp.num_mels,
                     postnet_dims=hp.tts_postnet_dims,
                     encoder_K=hp.tts_encoder_K,
                     lstm_dims=hp.tts_lstm_dims,
                     postnet_K=hp.tts_postnet_K,
                     num_highways=hp.tts_num_highways,
                     dropout=hp.tts_dropout,
                     stop_threshold=hp.tts_stop_threshold)
    if pretrained:
        state_dict = fetch_and_load_state_dict("tacotron")
        state_dict["decoder.r"] = state_dict.pop("r")
        state_dict["stop_threshold"] = torch.tensor(hp.tts_stop_threshold,
                                                    dtype=torch.float32)
        model.load_state_dict(state_dict)
    return model
Exemplo n.º 3
0
def main(args):

    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # save config to tmp place to be loaded by subsequent modules.
    file_name = str(os.getpid())
    tmp_path = os.path.join("/tmp/", file_name+'_tts')
    pickle.dump(c, open(tmp_path, "wb"))

    # setup tensorboard
    LOG_DIR = OUT_PATH
    tb = SummaryWriter(LOG_DIR)

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(1)
    signal.signal(signal.SIGINT, signal_handler)

    # Setup the dataset
    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                             )

    dataloader = DataLoader(dataset, batch_size=c.batch_size,
                            shuffle=True, collate_fn=dataset.collate_fn,
                            drop_last=True, num_workers=c.num_loader_workers)

    # setup the model
    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r)

    # plot model on tensorboard
    dummy_input = dataset.get_dummy_data()

    ## TODO: onnx does not support RNN fully yet
    # model_proto_path = os.path.join(OUT_PATH, "model.proto")
    # onnx.export(model, dummy_input, model_proto_path, verbose=True)
    # tb.add_graph_onnx(model_proto_path)

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    if args.restore_step:
        checkpoint = torch.load(os.path.join(
            args.restore_path, 'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)
        start_epoch = checkpoint['step'] // len(dataloader)
        best_loss = checkpoint['linear_loss']
    else:
        start_epoch = 0
        print("\n > Starting a new training")

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    #lr_scheduler = ReduceLROnPlateau(optimizer, factor=c.lr_decay,
    #                               patience=c.lr_patience, verbose=True)
    epoch_time = 0
    best_loss = float('inf')
    for epoch in range(0, c.epochs):

        print("\n | > Epoch {}/{}".format(epoch, c.epochs))
        progbar = Progbar(len(dataset) / c.batch_size)

        for num_iter, data in enumerate(dataloader):
            start_time = time.time()

            text_input = data[0]
            text_lengths = data[1]
            linear_input = data[2]
            mel_input = data[3]

            current_step = num_iter + args.restore_step + epoch * len(dataloader) + 1

            # setup lr
            current_lr = lr_decay(c.lr, current_step)
            for params_group in optimizer.param_groups:
                params_group['lr'] = current_lr

            optimizer.zero_grad()

            # Add a single frame of zeros to Mel Specs for better end detection
            #try:
            #    mel_input = np.concatenate((np.zeros(
            #        [c.batch_size, 1, c.num_mels], dtype=np.float32),
            #        mel_input[:, 1:, :]), axis=1)
            #except:
            #    raise TypeError("not same dimension")

            # convert inputs to variables
            text_input_var = Variable(text_input)
            mel_spec_var = Variable(mel_input)
            linear_spec_var = Variable(linear_input, volatile=True)

            # sort sequence by length.
            # TODO: might be unnecessary
            sorted_lengths, indices = torch.sort(
                     text_lengths.view(-1), dim=0, descending=True)
            sorted_lengths = sorted_lengths.long().numpy()

            text_input_var = text_input_var[indices]
            mel_spec_var = mel_spec_var[indices]
            linear_spec_var = linear_spec_var[indices]

            if use_cuda:
                text_input_var = text_input_var.cuda()
                mel_spec_var = mel_spec_var.cuda()
                linear_spec_var = linear_spec_var.cuda()

            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_spec_var,
                              input_lengths= torch.autograd.Variable(torch.cuda.LongTensor(sorted_lengths)))

            mel_loss = criterion(mel_output, mel_spec_var)
            #linear_loss = torch.abs(linear_output - linear_spec_var)
            #linear_loss = 0.5 * \
                #torch.mean(linear_loss) + 0.5 * \
                #torch.mean(linear_loss[:, :n_priority_freq, :])
            linear_loss = 0.5 * criterion(linear_output, linear_spec_var) \
                    + 0.5 * criterion(linear_output[:, :, :n_priority_freq],
                                      linear_spec_var[: ,: ,:n_priority_freq])
            loss = mel_loss + linear_loss
            # loss = loss.cuda()

            loss.backward()
            grad_norm = nn.utils.clip_grad_norm(model.parameters(), 1.)  ## TODO: maybe no need
            optimizer.step()

            step_time = time.time() - start_time
            epoch_time += step_time

            progbar.update(num_iter+1, values=[('total_loss', loss.data[0]),
                                       ('linear_loss', linear_loss.data[0]),
                                       ('mel_loss', mel_loss.data[0]),
                                       ('grad_norm', grad_norm)])

            # Plot Learning Stats
            tb.add_scalar('Loss/TotalLoss', loss.data[0], current_step)
            tb.add_scalar('Loss/LinearLoss', linear_loss.data[0],
                          current_step)
            tb.add_scalar('Loss/MelLoss', mel_loss.data[0], current_step)
            tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
                          current_step)
            tb.add_scalar('Params/GradNorm', grad_norm, current_step)
            tb.add_scalar('Time/StepTime', step_time, current_step)

            align_img = alignments[0].data.cpu().numpy()
            align_img = plot_alignment(align_img)
            tb.add_image('Attn/Alignment', align_img, current_step)

            if current_step % c.save_step == 0:

                if c.checkpoint:
                    # save model
                    save_checkpoint(model, optimizer, linear_loss.data[0],
                                    OUT_PATH, current_step, epoch)

                # Diagnostic visualizations
                const_spec = linear_output[0].data.cpu().numpy()
                gt_spec = linear_spec_var[0].data.cpu().numpy()

                const_spec = plot_spectrogram(const_spec, dataset.ap)
                gt_spec = plot_spectrogram(gt_spec, dataset.ap)
                tb.add_image('Spec/Reconstruction', const_spec, current_step)
                tb.add_image('Spec/GroundTruth', gt_spec, current_step)

                align_img = alignments[0].data.cpu().numpy()
                align_img = plot_alignment(align_img)
                tb.add_image('Attn/Alignment', align_img, current_step)

                # Sample audio
                audio_signal = linear_output[0].data.cpu().numpy()
                dataset.ap.griffin_lim_iters = 60
                audio_signal = dataset.ap.inv_spectrogram(audio_signal.T)
                try:
                    tb.add_audio('SampleAudio', audio_signal, current_step,
                                 sample_rate=c.sample_rate)
                except:
                    print("\n > Error at audio signal on TB!!")
                    print(audio_signal.max())
                    print(audio_signal.min())


        # average loss after the epoch
        avg_epoch_loss = np.mean(
            progbar.sum_values['linear_loss'][0] / max(1, progbar.sum_values['linear_loss'][1]))
        best_loss = save_best_model(model, optimizer, avg_epoch_loss,
                                    best_loss, OUT_PATH,
                                    current_step, epoch)

        #lr_scheduler.step(loss.data[0])
        tb.add_scalar('Time/EpochTime', epoch_time, epoch)
        epoch_time = 0
Exemplo n.º 4
0
def main(args):
    # Setup the dataset
    train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'train_metadata.csv'),
                                    os.path.join(c.data_path, 'wavs'),
                                    c.r,
                                    c.sample_rate,
                                    c.text_cleaner,
                                    c.num_mels,
                                    c.min_level_db,
                                    c.frame_shift_ms,
                                    c.frame_length_ms,
                                    c.preemphasis,
                                    c.ref_level_db,
                                    c.num_freq,
                                    c.power,
                                    min_seq_len=c.min_seq_len
                                    )

    train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
                              shuffle=False, collate_fn=train_dataset.collate_fn,
                              drop_last=False, num_workers=c.num_loader_workers,
                              pin_memory=True)
    '''   
    val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'valid_metadata.csv'),
                                  os.path.join(c.data_path, 'wavs'),
                                  c.r,
                                  c.sample_rate,
                                  c.text_cleaner,
                                  c.num_mels,
                                  c.min_level_db,
                                  c.frame_shift_ms,
                                  c.frame_length_ms,
                                  c.preemphasis,
                                  c.ref_level_db,
                                  c.num_freq,
                                  c.power
                                  )

    val_loader = DataLoader(val_dataset, batch_size=c.eval_batch_size,
                            shuffle=False, collate_fn=val_dataset.collate_fn,
                            drop_last=False, num_workers=4,
                            pin_memory=True)
    
    '''
    model = Tacotron(c.embedding_size,
                     c.num_freq,
                     c.num_mels,
                     c.r)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        optimizer = optim.Adam(model.parameters(), lr=c.lr)
        optimizer.load_state_dict(checkpoint['optimizer'])
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
        print(" > Model restored from step %d" % checkpoint['step'])
        start_epoch = checkpoint['step'] // len(train_loader)
        best_loss = checkpoint['linear_loss']
        start_epoch = 0
        args.restore_step = checkpoint['step']
        optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)
    else:
        args.restore_step = 0
        print("\n > Starting a new training")

    if use_cuda:
        model = nn.DataParallel(model.cuda())
        criterion.cuda()
        criterion_st.cuda()

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(
            model, criterion, criterion_st, train_loader, optimizer, optimizer_st, epoch)
        #val_loss = evaluate(model, criterion, criterion_st, val_loader, current_step)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss, OUT_PATH, current_step, epoch)
Exemplo n.º 5
0
Arquivo: eval.py Projeto: geneing/TTS
class Synthesizer(object):
    def load_model(self, model_path, model_config, wavernn_path, use_cuda):
        
        self.model_file = model_path
        print(" > Loading model ...")
        print(" | > model config: ", model_config)
        print(" | > model file: ", self.model_file)
        config = load_config(model_config)
        self.config = config
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)
        
        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(sen, [self.config.text_cleaner], self.config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(sen, [self.config.text_cleaner])
        
        self.model = Tacotron(self.input_size, config.embedding_size, self.ap.num_freq, self.ap.num_mels, config.r, attn_windowing=True)
        self.model.decoder.max_decoder_steps = 8000
        # load model state
        if use_cuda:
            cp = torch.load(self.model_file)
        else:
            cp = torch.load(self.model_file, map_location=lambda storage, loc: storage)
        # load the model
        self.model.load_state_dict(cp['model'])
        if use_cuda:
            self.model.cuda()
        self.model.eval()
        self.vocoder=WaveRNNVocoder.Vocoder()
        self.vocoder.loadWeights(wavernn_path)
        self.firwin = signal.firwin(1025, [65, 7600], pass_zero=False, fs=16000)


    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    #split text into chunks that are smaller than maxlen. Preferably, split on punctuation.

    def ttmel(self, text):
        mel_ret = []
        text_list = split_text(text, maxlen)
        for t in text_list:
            if len(t) < 3:
                continue
            seq = np.array(self.input_adapter(t))
            
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()
            if self.use_cuda:
                chars_var = chars_var.cuda()
            mel_out, _, alignments, stop_tokens = self.model.forward(chars_var)
            mel_out = mel_out[0].data.cpu().numpy().T
            mel_ret.append(mel_out)
        return np.hstack(mel_ret)

    def tts(self, mel):
        wav = self.vocoder.melToWav(mel)
        return wav
Exemplo n.º 6
0
Arquivo: train.py Projeto: wurde/TTS
def main(args):
    model = Tacotron(c.embedding_size, ap.num_freq, ap.num_mels, c.r)
    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer_st = optim.Adam(
        model.decoder.stopnet.parameters(), lr=c.lr, weight_decay=0)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint['model'])
        except:
            model_dict = model.state_dict()
            # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
            # 1. filter out unnecessary keys
            pretrained_dict = {
                k: v
                for k, v in checkpoint['model'].items() if k in model_dict
            }
            # 2. overwrite entries in the existing state dict
            model_dict.update(pretrained_dict)
            # 3. load the new state dict
            model.load_state_dict(model_dict)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        optimizer.load_state_dict(checkpoint['optimizer'])
        print(
            " > Model restored from step %d" % checkpoint['step'], flush=True)
        start_epoch = checkpoint['epoch']
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training", flush=True)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    if c.lr_decay:
        scheduler = NoamLR(
            optimizer,
            warmup_steps=c.warmup_steps,
            last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params), flush=True)

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         optimizer, optimizer_st,
                                         scheduler, ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap,
                            current_step)
        print(
            " | > Train Loss: {:.5f}   Validation Loss: {:.5f}".format(
                train_loss, val_loss),
            flush=True)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
Exemplo n.º 7
0
def main(args):
    # DISTRUBUTED
    if num_gpus > 1:
        init_distributed(args.rank, num_gpus, args.group_id,
                         c.distributed["backend"], c.distributed["url"])
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)
    model = Tacotron(num_chars=num_chars,
                     embedding_dim=c.embedding_size,
                     linear_dim=ap.num_freq,
                     mel_dim=ap.num_mels,
                     r=c.r,
                     memory_size=c.memory_size)

    optimizer = optim.Adam(model.parameters(), lr=c.lr, weight_decay=0)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(),
                              lr=c.lr,
                              weight_decay=0)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        try:
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        except:
            print(" > Partial model initialization.")
            partial_init_flag = True
            model_dict = model.state_dict()
            # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
            # 1. filter out unnecessary keys
            pretrained_dict = {
                k: v
                for k, v in checkpoint['model'].items() if k in model_dict
            }
            # 2. filter out different size layers
            pretrained_dict = {
                k: v
                for k, v in pretrained_dict.items()
                if v.numel() == model_dict[k].numel()
            }
            # 3. overwrite entries in the existing state dict
            model_dict.update(pretrained_dict)
            # 4. load the new state dict
            model.load_state_dict(model_dict)
            print(" | > {} / {} layers are initialized".format(
                len(pretrained_dict), len(model_dict)))
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        for group in optimizer.param_groups:
            group['lr'] = c.lr
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        start_epoch = checkpoint['epoch']
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    # DISTRUBUTED
    if num_gpus > 1:
        model = apply_gradient_allreduce(model)

    if c.lr_decay:
        scheduler = NoamLR(optimizer,
                           warmup_steps=c.warmup_steps,
                           last_epoch=args.restore_step - 1)
    else:
        scheduler = None

    num_params = count_parameters(model)
    print("\n > Model has {} parameters".format(num_params), flush=True)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         optimizer, optimizer_st, scheduler,
                                         ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, ap, current_step,
                            epoch)
        print(" | > Training Loss: {:.5f}   Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        target_loss = train_loss
        if c.run_eval:
            target_loss = val_loss
        best_loss = save_best_model(model, optimizer, target_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
Exemplo n.º 8
0
def main(args):
    dataset = importlib.import_module('datasets.' + c.dataset)
    Dataset = getattr(dataset, 'MyDataset')
    audio = importlib.import_module('utils.' + c.audio_processor)
    AudioProcessor = getattr(audio, 'AudioProcessor')

    ap = AudioProcessor(sample_rate=c.sample_rate,
                        num_mels=c.num_mels,
                        min_level_db=c.min_level_db,
                        frame_shift_ms=c.frame_shift_ms,
                        frame_length_ms=c.frame_length_ms,
                        ref_level_db=c.ref_level_db,
                        num_freq=c.num_freq,
                        power=c.power,
                        preemphasis=c.preemphasis)

    # Setup the dataset
    train_dataset = Dataset(c.data_path,
                            c.meta_file_train,
                            c.r,
                            c.text_cleaner,
                            ap=ap,
                            min_seq_len=c.min_seq_len)

    train_loader = DataLoader(train_dataset,
                              batch_size=c.batch_size,
                              shuffle=False,
                              collate_fn=train_dataset.collate_fn,
                              drop_last=False,
                              num_workers=c.num_loader_workers,
                              pin_memory=True)

    if c.run_eval:
        val_dataset = Dataset(c.data_path,
                              c.meta_file_val,
                              c.r,
                              c.text_cleaner,
                              ap=ap)

        val_loader = DataLoader(val_dataset,
                                batch_size=c.eval_batch_size,
                                shuffle=False,
                                collate_fn=val_dataset.collate_fn,
                                drop_last=False,
                                num_workers=4,
                                pin_memory=True)
    else:
        val_loader = None

    model = Tacotron(c.embedding_size, ap.num_freq, c.num_mels, c.r)
    print(" | > Num output units : {}".format(ap.num_freq), flush=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    optimizer_st = optim.Adam(model.decoder.stopnet.parameters(), lr=c.lr)

    criterion = L1LossMasked()
    criterion_st = nn.BCELoss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()
        optimizer.load_state_dict(checkpoint['optimizer'])
        # optimizer_st.load_state_dict(checkpoint['optimizer_st'])
        for state in optimizer.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.cuda()
        print(" > Model restored from step %d" % checkpoint['step'],
              flush=True)
        start_epoch = checkpoint['step'] // len(train_loader)
        best_loss = checkpoint['linear_loss']
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training", flush=True)
        if use_cuda:
            model = model.cuda()
            criterion.cuda()
            criterion_st.cuda()

    scheduler = AnnealLR(optimizer, warmup_steps=c.warmup_steps)
    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params), flush=True)

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if 'best_loss' not in locals():
        best_loss = float('inf')

    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, criterion_st,
                                         train_loader, optimizer, optimizer_st,
                                         scheduler, ap, epoch)
        val_loss = evaluate(model, criterion, criterion_st, val_loader, ap,
                            current_step)
        print(" | > Train Loss: {:.5f}   Validation Loss: {:.5f}".format(
            train_loss, val_loss),
              flush=True)
        best_loss = save_best_model(model, optimizer, train_loss, best_loss,
                                    OUT_PATH, current_step, epoch)
Exemplo n.º 9
0
def main(args):

    # setup output paths and read configs
    c = load_config(args.config_path)
    _ = os.path.dirname(os.path.realpath(__file__))
    OUT_PATH = os.path.join(_, c.output_path)
    OUT_PATH = create_experiment_folder(OUT_PATH)
    CHECKPOINT_PATH = os.path.join(OUT_PATH, 'checkpoints')
    shutil.copyfile(args.config_path, os.path.join(OUT_PATH, 'config.json'))

    # Ctrl+C handler to remove empty experiment folder
    def signal_handler(signal, frame):
        print(" !! Pressed Ctrl+C !!")
        remove_experiment_folder(OUT_PATH)
        sys.exit(0)

    signal.signal(signal.SIGINT, signal_handler)

    dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata.csv'),
                              os.path.join(c.data_path, 'wavs'), c.r,
                              c.sample_rate, c.text_cleaner)

    model = Tacotron(c.embedding_size, c.hidden_size, c.num_mels, c.num_freq,
                     c.r)
    if use_cuda:
        model = nn.DataParallel(model.cuda())

    optimizer = optim.Adam(model.parameters(), lr=c.lr)

    try:
        checkpoint = torch.load(
            os.path.join(CHECKPOINT_PATH,
                         'checkpoint_%d.pth.tar' % args.restore_step))
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % args.restore_step)

    except:
        print("\n > Starting a new training\n")

    model = model.train()

    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)

    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    n_priority_freq = int(3000 / (c.sample_rate * 0.5) * c.num_freq)

    for epoch in range(c.epochs):

        dataloader = DataLoader(dataset,
                                batch_size=c.batch_size,
                                shuffle=True,
                                collate_fn=dataset.collate_fn,
                                drop_last=True,
                                num_workers=32)
        progbar = Progbar(len(dataset) / c.batch_size)

        for i, data in enumerate(dataloader):
            text_input = data[0]
            magnitude_input = data[1]
            mel_input = data[2]

            current_step = i + args.restore_step + epoch * len(dataloader) + 1

            optimizer.zero_grad()

            try:
                mel_input = np.concatenate(
                    (np.zeros([c.batch_size, 1, c.num_mels],
                              dtype=np.float32), mel_input[:, 1:, :]),
                    axis=1)
            except:
                raise TypeError("not same dimension")

            if use_cuda:
                text_input_var = Variable(torch.from_numpy(text_input).type(
                    torch.cuda.LongTensor),
                                          requires_grad=False).cuda()
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
                    torch.cuda.FloatTensor),
                                         requires_grad=False).cuda()
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
                    torch.cuda.FloatTensor),
                                        requires_grad=False).cuda()
                linear_spec_var = Variable(
                    torch.from_numpy(magnitude_input).type(
                        torch.cuda.FloatTensor),
                    requires_grad=False).cuda()

            else:
                text_input_var = Variable(torch.from_numpy(text_input).type(
                    torch.LongTensor),
                                          requires_grad=False)
                mel_input_var = Variable(torch.from_numpy(mel_input).type(
                    torch.FloatTensor),
                                         requires_grad=False)
                mel_spec_var = Variable(torch.from_numpy(mel_input).type(
                    torch.FloatTensor),
                                        requires_grad=False)
                linear_spec_var = Variable(
                    torch.from_numpy(magnitude_input).type(torch.FloatTensor),
                    requires_grad=False)

            mel_output, linear_output, alignments =\
                model.forward(text_input_var, mel_input_var)

            mel_loss = criterion(mel_output, mel_spec_var)
            linear_loss = torch.abs(linear_output - linear_spec_var)
            linear_loss = 0.5 * \
                torch.mean(linear_loss) + 0.5 * \
                torch.mean(linear_loss[:, :n_priority_freq, :])
            loss = mel_loss + linear_loss
            loss = loss.cuda()

            start_time = time.time()

            loss.backward()

            nn.utils.clip_grad_norm(model.parameters(), 1.)

            optimizer.step()

            time_per_step = time.time() - start_time
            progbar.update(i,
                           values=[('total_loss', loss.data[0]),
                                   ('linear_loss', linear_loss.data[0]),
                                   ('mel_loss', mel_loss.data[0])])

            if current_step % c.save_step == 0:
                checkpoint_path = 'checkpoint_{}.pth.tar'.format(current_step)
                checkpoint_path = os.path.join(OUT_PATH, checkpoint_path)
                save_checkpoint(
                    {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'step': current_step,
                        'total_loss': loss.data[0],
                        'linear_loss': linear_loss.data[0],
                        'mel_loss': mel_loss.data[0],
                        'date': datetime.date.today().strftime("%B %d, %Y")
                    }, checkpoint_path)
                print(" > Checkpoint is saved : {}".format(checkpoint_path))

            if current_step in c.decay_step:
                optimizer = adjust_learning_rate(optimizer, current_step)
Exemplo n.º 10
0
def main(args):

    # Setup the dataset
    train_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_train.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                             )

    train_loader = DataLoader(train_dataset, batch_size=c.batch_size,
                            shuffle=False, collate_fn=train_dataset.collate_fn,
                            drop_last=False, num_workers=c.num_loader_workers,
                            pin_memory=True)
    
    val_dataset = LJSpeechDataset(os.path.join(c.data_path, 'metadata_val.csv'),
                              os.path.join(c.data_path, 'wavs'),
                              c.r,
                              c.sample_rate,
                              c.text_cleaner,
                              c.num_mels,
                              c.min_level_db,
                              c.frame_shift_ms,
                              c.frame_length_ms,
                              c.preemphasis,
                              c.ref_level_db,
                              c.num_freq,
                              c.power
                             )

    val_loader = DataLoader(val_dataset, batch_size=c.batch_size,
                            shuffle=False, collate_fn=val_dataset.collate_fn,
                            drop_last=False, num_workers= 4,
                            pin_memory=True)

    model = Tacotron(c.embedding_size,
                     c.hidden_size,
                     c.num_mels,
                     c.num_freq,
                     c.r,
                     use_atten_mask=True)

    optimizer = optim.Adam(model.parameters(), lr=c.lr)
    
    if use_cuda:
        criterion = nn.L1Loss().cuda()
    else:
        criterion = nn.L1Loss()

    if args.restore_path:
        checkpoint = torch.load(args.restore_path)
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("\n > Model restored from step %d\n" % checkpoint['step'])
        start_epoch = checkpoint['step'] // len(train_loader)
        best_loss = checkpoint['linear_loss']
        start_epoch = 0
        args.restore_step = checkpoint['step']
    else:
        args.restore_step = 0
        print("\n > Starting a new training")

    if use_cuda:
        model = nn.DataParallel(model.cuda())

    num_params = count_parameters(model)
    print(" | > Model has {} parameters".format(num_params))
    
    if not os.path.exists(CHECKPOINT_PATH):
        os.mkdir(CHECKPOINT_PATH)
    
    if 'best_loss' not in locals():
        best_loss = float('inf')
    
    for epoch in range(0, c.epochs):
        train_loss, current_step = train(model, criterion, train_loader, optimizer, epoch)
        val_loss = evaluate(model, criterion, val_loader, current_step)
        best_loss = save_best_model(model, optimizer, val_loss,
                                    best_loss, OUT_PATH,
                                    current_step, epoch)
Exemplo n.º 11
0
    ap = AudioProcessor(**c.audio)
    use_cuda = True
    
    data_loader = setup_loader(c, is_val=False)
    num_chars = len(phonemes) if c.use_phonemes else len(symbols)
    
    model = Tacotron(
        num_chars=num_chars,
        embedding_dim=c.embedding_size,
        linear_dim=ap.num_freq,
        mel_dim=ap.num_mels,
        r=c.r,
        memory_size=c.memory_size)

    checkpoint = torch.load(MODEL_PATH)
    model.load_state_dict(checkpoint['model'])
    if use_cuda:
        model.cuda()

    with torch.no_grad():
        if data_loader is not None:
            for num_iter, data in enumerate(data_loader):
                # print(num_iter * batch_size)
                start_time = time.time()

                # setup input data
                text_input = data[0]
                text_lengths = data[1]
                linear_input = data[2]
                mel_input = data[3]
                mel_lengths = data[4]
Exemplo n.º 12
0
num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
model = Tacotron(num_chars, CONFIG.embedding_size, ap.num_freq, ap.num_mels, CONFIG.r)


texts = []

with open(args.text) as f:
    for line in f:
        texts.append(line.strip())

if use_cuda:
    cp = torch.load(MODEL_PATH)
else:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
model.decoder.max_decoder_steps = 800
batch_size = 32

for n in range(math.ceil(len(texts) / batch_size)):
    batch_texts = texts[n: max(n + batch_size, len(texts))]
    wavs, alignments = text2audio(texts, model, CONFIG, use_cuda, ap)
    for i, wav in enumerate(wavs):
        ap.save_wav(wav, os.path.join(OUT_FOLDER, 'CommonVoice_{}_{}.wav'.format(args.step, n * batch_size + i)))

        if save_alignment:
        # alignments can be used to train FastSpeech
            alignment = alignments[i]
Exemplo n.º 13
0
class Synthesizer(object):
    """
    Summary:
        Config is loaded and the model from the given path is loaded and prepared for inference.

    Parameters:
        @model_path = model's file directory path
        @model_name = model's file name
        @model_config = config's file name
        @use_cuda = GPU flag
    """
    def load_model(self, model_path, model_name, model_config, use_cuda):

        #build the config's path
        model_config = os.path.join(model_path, model_config)

        #build the model's path
        model_file = os.path.join(model_path, model_name)
        print(" > Loading model ...")
        print(" | > Model config path: ", model_config)
        print(" | > Model file path: ", model_file)

        config = load_config(model_config)
        self.use_cuda = use_cuda
        self.use_phonemes = config.use_phonemes
        self.ap = AudioProcessor(**config.audio)

        if self.use_phonemes:
            self.input_size = len(phonemes)
            self.input_adapter = lambda sen: phoneme_to_sequence(
                sen, [config.text_cleaner], config.phoneme_language)
        else:
            self.input_size = len(symbols)
            self.input_adapter = lambda sen: text_to_sequence(
                sen, [config.text_cleaner])

        self.model = Tacotron(num_chars=config['num_chars'],
                              embedding_dim=config['embedding_size'],
                              linear_dim=self.ap.num_freq,
                              mel_dim=self.ap.num_mels,
                              r=config['r'])

        #load model state
        if use_cuda:
            cp = torch.load(model_file)
        else:
            cp = torch.load(model_file,
                            map_location=lambda storage, loc: storage)

        #load the model
        self.model.load_state_dict(cp['model'])

        #if cuda is enabled & available move tensors to GPU
        if use_cuda:
            self.model.cuda()

        #disables normalization techniques present in code
        self.model.eval()

    """
    Summary:
        Saves the wav at the given path

    Parameters:
        @wav = wav array
        @path = destination path
    """

    def save_wav(self, wav, path):
        # wav *= 32767 / max(1e-8, np.max(np.abs(wav)))
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

    """
    Summary:
        Gets an input, prepares it for the model and returns the predicted output.

    Parameters:
        @text = input sentence 
    """

    def tts(self, text, gl_mode=None):

        wavs = []

        #split the input in sentences
        for sen in text.split('.'):

            if len(sen) < 3:
                continue

            sen = sen.strip()
            sen += '.'
            #print('Input : {}'.format(sen))

            #character => phonem => index
            seq = np.array(self.input_adapter(sen))

            #numpy to pytorch array
            chars_var = torch.from_numpy(seq).unsqueeze(0).long()

            if self.use_cuda:
                chars_var = chars_var.cuda()

            #begin the inference
            mel_out, linear_out, alignments, stop_tokens = self.model.forward(
                chars_var)

            #move output tensor to cpu
            linear_out = linear_out[0].data.cpu().numpy()
            t = time.time()
            wav = self.ap.inv_spectrogram(linear_out.T, gl_mode)
            t = time.time() - t
            wavs += list(wav)
            wavs += [0] * 10000

        out = io.BytesIO()
        self.save_wav(wavs, out)
        self.save_wav(wavs, 'gla.wav')
        return out