Example #1
    def _initialize(self):
        """
        Initialize with the necessary elements.
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-162000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")

        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)
        with fluid.dygraph.guard(fluid.CPUPlace()):
            self.tts_model = FastSpeechModel(
                self.tts_config['network'],
                num_mels=self.tts_config['audio']['num_mels'])
            io.load_parameters(model=self.tts_model,
                               checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)
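A minimal sketch of what io.load_parameters does in the model-only case above, assuming the checkpoint was written with fluid.save_dygraph so that "step-162000.pdparams" exists on disk; the real helper also handles optimizers, checkpoint directories, and iteration bookkeeping.

import paddle.fluid as fluid

def load_parameters_sketch(model, checkpoint_path):
    # load_dygraph reads "<checkpoint_path>.pdparams" from disk
    state_dict, _ = fluid.dygraph.load_dygraph(checkpoint_path)
    model.set_dict(state_dict)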
Example #2
    def build(self, training=True):
        """Initialize the model.

        Args:
            training (bool, optional): Whether the model is built for training or inference.
                Defaults to True.

        Returns:
            int: the iteration of the loaded checkpoint.
        """
        config = self.config
        dataset = LJSpeech(config, self.nranks, self.rank)
        self.trainloader = dataset.trainloader
        self.validloader = dataset.validloader

        waveflow = WaveFlowModule(config)

        # Dry run once to create and initialize all necessary parameters.
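        # (In dygraph mode some parameters are only created on the first
        # forward pass, so load_parameters below can match checkpoint
        # tensors only after this call has materialized them.)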
        audio = dg.to_variable(np.random.randn(1, 16000).astype(self.dtype))
        mel = dg.to_variable(
            np.random.randn(1, config.mel_bands, 63).astype(self.dtype))
        waveflow(audio, mel)

        if training:
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=config.learning_rate,
                parameter_list=waveflow.parameters())

            # Load parameters.
            iteration = io.load_parameters(model=waveflow,
                                           optimizer=optimizer,
                                           checkpoint_dir=self.checkpoint_dir,
                                           iteration=config.iteration,
                                           checkpoint_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            # Data parallelism.
            if self.parallel:
                strategy = dg.parallel.prepare_context()
                waveflow = dg.parallel.DataParallel(waveflow, strategy)

            self.waveflow = waveflow
            self.optimizer = optimizer
            self.criterion = WaveFlowLoss(config.sigma)

        else:
            # Load parameters.
            iteration = io.load_parameters(model=waveflow,
                                           checkpoint_dir=self.checkpoint_dir,
                                           iteration=config.iteration,
                                           checkpoint_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            for layer in waveflow.sublayers():
                if isinstance(layer, weight_norm.WeightNormWrapper):
                    layer.remove_weight_norm()

            self.waveflow = waveflow

        return iteration
Example #3
    def __init__(self, config_path, checkpoint_path):
        with open(config_path, 'rt') as f:
            config = ruamel.yaml.safe_load(f)
        ns = argparse.Namespace()
        for k, v in config.items():
            setattr(ns, k, v)
        ns.use_fp16 = False

        self.model = WaveFlowModule(ns)
        io.load_parameters(self.model, checkpoint_path=checkpoint_path)
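A hedged usage sketch for the wrapper above. The class name WaveflowVocoder is borrowed from Example #6 and is an assumption here, as is the (batch, mel_bands, frames) mel layout; synthesize() with a sigma argument is the call used in Example #8.

import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

fluid.enable_dygraph(fluid.CPUPlace())
vocoder = WaveflowVocoder("waveflow_ljspeech.yaml", "step-2000000")  # hypothetical name
mel = dg.to_variable(np.random.randn(1, 80, 63).astype("float32"))   # (batch, mel_bands, frames)
wav = vocoder.model.synthesize(mel, sigma=1.0)                       # as called in Example #8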
Example #4
    def __init__(self):
        config_path = "waveflow_res128_ljspeech_ckpt_1.0/waveflow_ljspeech.yaml"
        with open(config_path, 'rt') as f:
            config = yaml.safe_load(f)
        ns = argparse.Namespace()
        for k, v in config.items():
            setattr(ns, k, v)
        ns.use_fp16 = False

        self.model = WaveFlowModule(ns)
        checkpoint_path = "waveflow_res128_ljspeech_ckpt_1.0/step-2000000"
        load_parameters(self.model, checkpoint_path=checkpoint_path)
Example #5
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text)
    pos_text = dg.to_variable(pos_text)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    result = np.exp(mel_output_postnet.numpy())
    mel_output_postnet = fluid.layers.transpose(
        fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
    mel_output_postnet = np.exp(mel_output_postnet.numpy())
    basis = librosa.filters.mel(cfg['audio']['sr'], cfg['audio']['n_fft'],
                                cfg['audio']['num_mels'])
    inv_basis = np.linalg.pinv(basis)
    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output_postnet))

    # synthesize with ClariNet
    wav_clarinet = synthesis_with_clarinet(args.config_clarinet,
                                           args.checkpoint_clarinet, result,
                                           place)
    writer.add_audio(text_input + '(clarinet)', wav_clarinet, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(os.path.join(args.output, 'samples'), 'clarinet.wav'),
          cfg['audio']['sr'], wav_clarinet)

    # synthesize with Griffin-Lim
    wav = librosa.core.griffinlim(spec**cfg['audio']['power'],
                                  hop_length=cfg['audio']['hop_length'],
                                  win_length=cfg['audio']['win_length'])
    writer.add_audio(text_input + '(griffin-lim)', wav, 0, cfg['audio']['sr'])
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'griffin-lim.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
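The Griffin-Lim branch above first back-projects the mel spectrogram to a linear spectrogram via the pseudo-inverse of the mel filter bank. Here is that step isolated as a helper, a sketch that assumes the positional librosa.filters.mel signature used above (librosa < 0.10):

import numpy as np
import librosa

def mel_to_linear(mel_spec, sr, n_fft, num_mels):
    basis = librosa.filters.mel(sr, n_fft, num_mels)       # (num_mels, 1 + n_fft // 2)
    inv_basis = np.linalg.pinv(basis)                      # least-squares back-projection
    return np.maximum(1e-10, np.dot(inv_basis, mel_spec))  # floor keeps later log/power steps stable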
Example #6
def main(args, config):
    model = create_model(config)
    loaded_step = load_parameters(model, checkpoint_path=args.checkpoint)
    model.eval()
    if args.vocoder == "waveflow":
        vocoder = WaveflowVocoder()
        vocoder.model.eval()
    elif args.vocoder == "griffin-lim":
        vocoder = GriffinLimVocoder(
            sharpening_factor=config["sharpening_factor"],
            sample_rate=config["sample_rate"],
            n_fft=config["n_fft"],
            win_length=config["win_length"],
            hop_length=config["hop_length"])
    else:
        raise ValueError("Other vocoders are not supported.")

    if not os.path.exists(args.output):
        os.makedirs(args.output)
    monotonic_layers = [
        int(item.strip()) - 1 for item in args.monotonic_layers.split(',')
    ]
    with open(args.input, 'rt') as f:
        sentences = [line.strip() for line in f.readlines()]
    for i, sentence in enumerate(sentences):
        wav = synthesize(args, config, model, vocoder, sentence,
                         monotonic_layers)
        sf.write(os.path.join(args.output, "sentence{}.wav".format(i)),
                 wav,
                 samplerate=config["sample_rate"])
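A hedged sketch of the argparse wiring this main() expects; the flag names are inferred from the attributes accessed above, while the help strings and defaults are assumptions:

import argparse

parser = argparse.ArgumentParser(description="synthesize sentences from a text file")
parser.add_argument("--input", help="text file with one sentence per line")
parser.add_argument("--output", help="directory to save generated wav files")
parser.add_argument("--checkpoint", help="path of the TTS model checkpoint")
parser.add_argument("--vocoder", default="waveflow",
                    choices=["waveflow", "griffin-lim"], help="vocoder to use")
parser.add_argument("--monotonic_layers", default="4",
                    help="comma-separated, 1-based indices of monotonic attention layers")
args = parser.parse_args()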
Example #7
    def _initialize(self):
        """
        Initialize with the necessary elements.
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-120000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")

        tts_config_path = os.path.join(self.directory, "assets", "tts",
                                       "ljspeech.yaml")
        with open(tts_config_path) as f:
            self.tts_config = yaml.load(f, Loader=yaml.Loader)

        # The maximum length (in frames) of the spectrogram generated during synthesis.
        self.max_len = 1000
        # The stop-token threshold that decides, at each time step, whether spectrum generation should stop.
        self.stop_threshold = 0.5

        with fluid.dygraph.guard(fluid.CPUPlace()):
            # Build TTS.
            with fluid.unique_name.guard():
                network_cfg = self.tts_config['network']
                self.tts_model = TransformerTTSModel(
                    network_cfg['embedding_size'], network_cfg['hidden_size'],
                    network_cfg['encoder_num_head'],
                    network_cfg['encoder_n_layers'],
                    self.tts_config['audio']['num_mels'],
                    network_cfg['outputs_per_step'],
                    network_cfg['decoder_num_head'],
                    network_cfg['decoder_n_layers'])
                io.load_parameters(model=self.tts_model,
                                   checkpoint_path=self.tts_checkpoint_path)

            # Build vocoder.
            args = AttrDict()
            args.config = self.waveflow_config_path
            args.use_fp16 = False
            self.waveflow_config = io.add_yaml_config_to_args(args)
            self.waveflow = WaveFlowModule(self.waveflow_config)
            io.load_parameters(model=self.waveflow,
                               checkpoint_path=self.waveflow_checkpoint_path)
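max_len and stop_threshold bound the autoregressive decoding loop. A condensed sketch of how they are consumed, adapted from the loop in Example #19 below (model, text, pos_text and mel_input are assumed to be set up as in that example):

for i in range(max_len):
    pos_mel = np.arange(1, mel_input.shape[1] + 1)
    pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel).astype(np.int64), [0])
    mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
        text, mel_input, pos_text, pos_mel)
    if stop_preds.numpy()[0, -1] > stop_threshold:
        break  # stop token fired: no more spectrum frames are generated
    mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]], axis=1)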
Example #8
def synthesis_with_waveflow(mel_output, args, checkpoint, place):

    fluid.enable_dygraph(place)
    args.config = args.config_vocoder
    args.use_fp16 = False
    config = io.add_yaml_config_to_args(args)

    mel_spectrogram = fluid.layers.transpose(mel_output, [0, 2, 1])

    # Build model.
    waveflow = WaveFlowModule(config)
    io.load_parameters(model=waveflow, checkpoint_path=checkpoint)
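    # Remove weight-norm wrappers before inference: per the note in
    # Example #11, this re-materializes each wrapped weight once and also
    # speeds up computation.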
    for layer in waveflow.sublayers():
        if isinstance(layer, weight_norm.WeightNormWrapper):
            layer.remove_weight_norm()

    # Run model inference.
    wav = waveflow.synthesize(mel_spectrogram, sigma=config.sigma)
    return wav.numpy()[0]
Example #9
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(
        model=model, checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text).astype(np.int64)
    pos_text = dg.to_variable(pos_text).astype(np.int64)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    if args.vocoder == 'griffin-lim':
        # synthesize with Griffin-Lim
        wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
    elif args.vocoder == 'waveflow':
        wav = synthesis_with_waveflow(mel_output_postnet, args,
                                      args.checkpoint_vocoder, place)
    else:
        raise ValueError(
            'vocoder error: only griffin-lim and waveflow are supported, but received %s.'
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(
            os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #10
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(csv_path,
                            sep="|",
                            header=None,
                            quoting=csv.QUOTE_NONE,
                            names=["fname", "raw_text", "normalized_text"])
        ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
            wav = ljspeech_processor.load_wav(
                os.path.join(args.data, 'wavs', fname + ".wav"))
            mel_input = ljspeech_processor.melspectrogram(wav).astype(
                np.float32)
            mel_input = np.transpose(mel_input, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            dec_slf_mask = get_triu_tensor(mel_input,
                                           mel_input).astype(np.float32)
            dec_slf_mask = np.expand_dims(dec_slf_mask, axis=0)
            dec_slf_mask = fluid.layers.cast(dg.to_variable(dec_slf_mask != 0),
                                             np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.txt', "wb") as f:
            pickle.dump(alignments, f)
Example #11
        n_loop = model_config["n_loop"]
        n_layer = model_config["n_layer"]
        residual_channels = model_config["residual_channels"]
        output_dim = model_config["output_dim"]
        loss_type = model_config["loss_type"]
        log_scale_min = model_config["log_scale_min"]
        decoder = WaveNet(n_loop, n_layer, residual_channels, output_dim,
                          n_mels, filter_size, loss_type, log_scale_min)

        model = ConditionalWavenet(encoder, decoder)
        summary(model)

        # load model parameters
        checkpoint_dir = os.path.join(args.output, "checkpoints")
        if args.checkpoint:
            iteration = io.load_parameters(model,
                                           checkpoint_path=args.checkpoint)
        else:
            iteration = io.load_parameters(model,
                                           checkpoint_dir=checkpoint_dir,
                                           iteration=args.iteration)
        assert iteration > 0, "A trained model is needed."

        # WARNING: remove weight norm so each wrapped layer's weight is
        # re-computed once up front; removing weight norm also speeds up
        # computation.
        for layer in model.sublayers():
            if isinstance(layer, WeightNormWrapper):
                layer.remove_weight_norm()

        train_loader = fluid.io.DataLoader.from_generator(capacity=10,
                                                          return_list=True)
        train_loader.set_batch_generator(train_cargo, place)
Example #12
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    model = Vocoder(cfg['train']['batch_size'], cfg['vocoder']['hidden_size'],
                    cfg['audio']['num_mels'], cfg['audio']['n_fft'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            is_vocoder=True).reader()

    for epoch in range(cfg['train']['max_iteration']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            mel, mag = data
            mag = dg.to_variable(mag.numpy())
            mel = dg.to_variable(mel.numpy())
            global_step += 1

            mag_pred = model(mel)
            loss = layers.mean(
                layers.abs(layers.elementwise_sub(mag_pred, mag)))

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            if local_rank == 0:
                writer.add_scalar('training_loss/loss', loss.numpy(),
                                  global_step)

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
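A hedged note on the NoamDecay wiring above: assuming dg.NoamDecay implements the standard Noam schedule lr = d_model**-0.5 * min(step**-0.5, step * warmup**-1.5), passing d_model = 1 / (warm_up_step * learning_rate**2) makes the schedule peak at exactly the configured learning_rate when step == warm_up_step. A quick numpy check of that identity:

import numpy as np

def noam_lr(step, warmup, peak_lr):
    d_model = 1.0 / (warmup * peak_lr ** 2)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

assert np.isclose(noam_lr(4000, 4000, 1e-3), 1e-3)  # peak equals the configured rate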
Example #13
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(dg.parallel.Env()
                            .dev_id) if args.use_gpu else fluid.CPUPlace()
    fluid.enable_dygraph(place)

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))
    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        args.alignments_path,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader
    iterator = iter(tqdm(reader))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        (character, mel, pos_text, pos_mel, alignment) = batch

        global_step += 1

        #Forward
        result = model(
            character, pos_text, mel_pos=pos_mel, length_target=alignment)
        mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
        mel_loss = layers.mse_loss(mel_output, mel)
        mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
        duration_loss = layers.mean(
            layers.abs(
                layers.elementwise_sub(duration_predictor_output, alignment)))
        total_loss = mel_loss + mel_postnet_loss + duration_loss

        if local_rank == 0:
            writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
            writer.add_scalar('post_mel_loss',
                              mel_postnet_loss.numpy(), global_step)
            writer.add_scalar('duration_loss',
                              duration_loss.numpy(), global_step)
            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

        if parallel:
            total_loss = model.scale_loss(total_loss)
            total_loss.backward()
            model.apply_collective_grads()
        else:
            total_loss.backward()
        optimizer.minimize(total_loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)

    if local_rank == 0:
        writer.close()
Example #14
    def _initialize(self):
        """
        Initialize with the necessary elements.
        """
        self.tts_checkpoint_path = os.path.join(self.directory, "assets",
                                                "tts", "step-1780000")
        self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
                                                     "vocoder", "step-2000000")
        self.waveflow_config_path = os.path.join(self.directory, "assets",
                                                 "vocoder",
                                                 "waveflow_ljspeech.yaml")
        tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
                                           "ljspeech.yaml")
        with open(tts_checkpoint_path) as f:
            self.tts_config = ruamel.yaml.safe_load(f)

        with fluid.dygraph.guard(fluid.CPUPlace()):
            char_embedding = dg.Embedding(
                (en.n_vocab, self.tts_config["char_dim"]))
            multi_speaker = self.tts_config["n_speakers"] > 1
            speaker_embedding = dg.Embedding((self.tts_config["n_speakers"], self.tts_config["speaker_dim"])) \
                if multi_speaker else None
            encoder = Encoder(self.tts_config["encoder_layers"],
                              self.tts_config["char_dim"],
                              self.tts_config["encoder_dim"],
                              self.tts_config["kernel_size"],
                              has_bias=multi_speaker,
                              bias_dim=self.tts_config["speaker_dim"],
                              keep_prob=1.0 - self.tts_config["dropout"])
            decoder = Decoder(
                self.tts_config["n_mels"],
                self.tts_config["reduction_factor"],
                list(self.tts_config["prenet_sizes"]) +
                [self.tts_config["char_dim"]],
                self.tts_config["decoder_layers"],
                self.tts_config["kernel_size"],
                self.tts_config["attention_dim"],
                position_encoding_weight=self.tts_config["position_weight"],
                omega=self.tts_config["position_rate"],
                has_bias=multi_speaker,
                bias_dim=self.tts_config["speaker_dim"],
                keep_prob=1.0 - self.tts_config["dropout"])
            postnet = PostNet(self.tts_config["postnet_layers"],
                              self.tts_config["char_dim"],
                              self.tts_config["postnet_dim"],
                              self.tts_config["kernel_size"],
                              self.tts_config["n_mels"],
                              self.tts_config["reduction_factor"],
                              has_bias=multi_speaker,
                              bias_dim=self.tts_config["speaker_dim"],
                              keep_prob=1.0 - self.tts_config["dropout"])
            self.tts_model = SpectraNet(char_embedding, speaker_embedding,
                                        encoder, decoder, postnet)
            io.load_parameters(model=self.tts_model,
                               checkpoint_path=self.tts_checkpoint_path)
            for name, layer in self.tts_model.named_sublayers():
                try:
                    remove_weight_norm(layer)
                except ValueError:
                    # this layer has no weight norm hook
                    pass

            self.waveflow = WaveflowVocoder(
                config_path=self.waveflow_config_path,
                checkpoint_path=self.waveflow_checkpoint_path)
            self.griffin = GriffinLimVocoder(
                sharpening_factor=self.tts_config["sharpening_factor"],
                sample_rate=self.tts_config["sample_rate"],
                n_fft=self.tts_config["n_fft"],
                win_length=self.tts_config["win_length"],
                hop_length=self.tts_config["hop_length"])
Example #15
        writer = None
    sentences = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "Generative adversarial network or variational auto-encoder.",
        "Please call Stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]
    evaluator = make_evaluator(config, sentences, eval_dir, writer)
    state_saver = make_state_saver(config, state_dir, writer)

    # load parameters and optimizer, and update the number of iterations done so far
    if args.checkpoint is not None:
        iteration = load_parameters(model,
                                    optim,
                                    checkpoint_path=args.checkpoint)
    else:
        iteration = load_parameters(model,
                                    optim,
                                    checkpoint_dir=ckpt_dir,
                                    iteration=args.iteration)

    # =========================train=========================
    train_config = config["train"]
    max_iter = train_config["max_iteration"]
    snap_interval = train_config["snap_interval"]
    save_interval = train_config["save_interval"]
    eval_interval = train_config["eval_interval"]

    global_step = iteration + 1
Example #16
        # =========================link(dataloader, paddle)=========================
        loader = fluid.io.DataLoader.from_generator(
            capacity=10, return_list=True)
        loader.set_batch_generator(ljspeech_loader, places=place)

        # tensorboard & checkpoint preparation
        output_dir = args.output
        ckpt_dir = os.path.join(output_dir, "checkpoints")
        log_dir = os.path.join(output_dir, "log")
        state_dir = os.path.join(output_dir, "states")
        make_output_tree(output_dir)
        writer = SummaryWriter(logdir=log_dir)

        # load parameters and optimizer, and update the number of iterations done so far
        if args.checkpoint is not None:
            iteration = io.load_parameters(
                dv3, optim, checkpoint_path=args.checkpoint)
        else:
            iteration = io.load_parameters(
                dv3, optim, checkpoint_dir=ckpt_dir, iteration=args.iteration)

        # =========================train=========================
        max_iter = train_config["max_iteration"]
        snap_interval = train_config["snap_interval"]
        save_interval = train_config["save_interval"]
        eval_interval = train_config["eval_interval"]

        global_step = iteration + 1
        iterator = iter(tqdm.tqdm(loader))
        while global_step <= max_iter:
            try:
                batch = next(iterator)
Example #17
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    with fluid.unique_name.guard():
        model_vocoder = Vocoder(cfg['train']['batch_size'],
                                cfg['vocoder']['hidden_size'],
                                cfg['audio']['num_mels'],
                                cfg['audio']['n_fft'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
        model_vocoder.eval()
    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

    pbar = tqdm(range(args.max_len))
    for i in pbar:
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]],
                                        axis=1)

    mag_pred = model_vocoder(postnet_pred)

    _ljspeech_processor = audio.AudioProcessor(
        sample_rate=cfg['audio']['sr'],
        num_mels=cfg['audio']['num_mels'],
        min_level_db=cfg['audio']['min_level_db'],
        ref_level_db=cfg['audio']['ref_level_db'],
        n_fft=cfg['audio']['n_fft'],
        win_length=cfg['audio']['win_length'],
        hop_length=cfg['audio']['hop_length'],
        power=cfg['audio']['power'],
        preemphasis=cfg['audio']['preemphasis'],
        signal_norm=True,
        symmetric_norm=False,
        max_norm=1.,
        mel_fmin=0,
        mel_fmax=None,
        clip_norm=True,
        griffin_lim_iters=60,
        do_trim_silence=False,
        sound_norm=False)

    # synthesize with CBHG
    wav = _ljspeech_processor.inv_spectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]),
                               [1, 0]).numpy())
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step,
                             x,
                             i * 4 + j,
                             dataformats="HWC")

    writer.add_audio(text_input + '(cbhg)', wav, 0, cfg['audio']['sr'])

    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(os.path.join(args.output, 'samples'), 'cbhg.wav'),
          cfg['audio']['sr'], wav)

    # synthesize with Griffin-Lim
    wav = _ljspeech_processor.inv_melspectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(postnet_pred, [0]),
                               [1, 0]).numpy())
    writer.add_audio(text_input + '(griffin)', wav, 0, cfg['audio']['sr'])

    write(os.path.join(os.path.join(args.output, 'samples'), 'griffin.wav'),
          cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #18
def synthesis_with_clarinet(config_path, checkpoint, mel_spectrogram, place):
    with open(config_path, 'rt') as f:
        config = yaml.safe_load(f)

    data_config = config["data"]
    n_mels = data_config["n_mels"]

    teacher_config = config["teacher"]
    n_loop = teacher_config["n_loop"]
    n_layer = teacher_config["n_layer"]
    filter_size = teacher_config["filter_size"]

    # only batch size 1 is supported for validation

    with dg.guard(place):
        # conditioner(upsampling net)
        conditioner_config = config["conditioner"]
        upsampling_factors = conditioner_config["upsampling_factors"]
        upsample_net = UpsampleNet(upscale_factors=upsampling_factors)
        freeze(upsample_net)

        residual_channels = teacher_config["residual_channels"]
        loss_type = teacher_config["loss_type"]
        output_dim = teacher_config["output_dim"]
        log_scale_min = teacher_config["log_scale_min"]
        assert loss_type == "mog" and output_dim == 3, \
            "the teacher wavenet should be a wavenet with single gaussian output"

        teacher = WaveNet(n_loop, n_layer, residual_channels, output_dim,
                          n_mels, filter_size, loss_type, log_scale_min)
        # load & freeze upsample_net & teacher
        freeze(teacher)

        student_config = config["student"]
        n_loops = student_config["n_loops"]
        n_layers = student_config["n_layers"]
        student_residual_channels = student_config["residual_channels"]
        student_filter_size = student_config["filter_size"]
        student_log_scale_min = student_config["log_scale_min"]
        student = ParallelWaveNet(n_loops, n_layers, student_residual_channels,
                                  n_mels, student_filter_size)

        stft_config = config["stft"]
        stft = STFT(n_fft=stft_config["n_fft"],
                    hop_length=stft_config["hop_length"],
                    win_length=stft_config["win_length"])

        lmd = config["loss"]["lmd"]
        model = Clarinet(upsample_net, teacher, student, stft,
                         student_log_scale_min, lmd)
        io.load_parameters(model=model, checkpoint_path=checkpoint)

        if not os.path.exists(args.output):
            os.makedirs(args.output)
        model.eval()

        # Rescale mel_spectrogram.
        min_level, ref_level = 1e-5, 20  # hard code it
        mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        mel_spectrogram = dg.to_variable(mel_spectrogram)
        mel_spectrogram = fluid.layers.transpose(mel_spectrogram, [0, 2, 1])

        wav_var = model.synthesis(mel_spectrogram)
        wav_np = wav_var.numpy()[0]

        return wav_np
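The mel rescaling block above, factored into a standalone helper with the same hard-coded levels: amplitudes are converted to dB, shifted by the reference level, then squashed into [0, 1].

import numpy as np

def normalize_mel(mel, min_level=1e-5, ref_level=20):
    mel_db = 20 * np.log10(np.maximum(min_level, mel)) - ref_level
    return np.clip((mel_db + 100) / 100, 0, 1)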
Example #19
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(
        dg.to_variable(pos_text).astype(np.int64), [0])

    for i in range(args.max_len):
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(
            dg.to_variable(pos_mel).astype(np.int64), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        if stop_preds.numpy()[0, -1] > args.stop_threshold:
            break
        mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]],
                                        axis=1)
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step,
                             x,
                             i * 4 + j,
                             dataformats="HWC")

    if args.vocoder == 'griffin-lim':
        # synthesize with Griffin-Lim
        wav = synthesis_with_griffinlim(postnet_pred, cfg['audio'])
    elif args.vocoder == 'waveflow':
        # synthesize with WaveFlow
        wav = synthesis_with_waveflow(postnet_pred, args,
                                      args.checkpoint_vocoder, place)
    else:
        raise ValueError(
            'vocoder error: only griffin-lim and waveflow are supported, but received %s.'
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'),
                     args.vocoder + '.wav'), cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #20
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            # load
            wav, _ = librosa.load(
                str(os.path.join(args.data, 'wavs', fname + ".wav")))

            spec = librosa.stft(
                y=wav,
                n_fft=cfg['audio']['n_fft'],
                win_length=cfg['audio']['win_length'],
                hop_length=cfg['audio']['hop_length'])
            mag = np.abs(spec)
            mel = librosa.filters.mel(sr=cfg['audio']['sr'],
                                      n_fft=cfg['audio']['n_fft'],
                                      n_mels=cfg['audio']['num_mels'],
                                      fmin=cfg['audio']['fmin'],
                                      fmax=cfg['audio']['fmax'])
            mel = np.matmul(mel, mag)
            mel = np.log(np.maximum(mel, 1e-5))

            mel_input = np.transpose(mel, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.pkl', "wb") as f:
            pickle.dump(alignments, f)
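The feature-extraction steps inside the loop above, factored into one helper; it uses the same librosa calls (keyword-style librosa.filters.mel, as in the snippet) and the same log floor:

import numpy as np
import librosa

def extract_mel(wav, sr, n_fft, win_length, hop_length, num_mels, fmin, fmax):
    mag = np.abs(librosa.stft(y=wav, n_fft=n_fft,
                              win_length=win_length, hop_length=hop_length))
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=num_mels,
                                    fmin=fmin, fmax=fmax)
    return np.log(np.maximum(np.matmul(mel_basis, mag), 1e-5))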
Example #21
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = LogWriter(os.path.join(args.output,
                                    'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(1 / (cfg['train']['warm_up_step'] *
                                        (cfg['train']['learning_rate']**2)),
                                   cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(cfg['train'][
            'grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(
        model=model,
        optimizer=optimizer,
        checkpoint_dir=os.path.join(args.output, 'checkpoints'),
        iteration=args.iteration,
        checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(
        cfg['audio'],
        place,
        args.data,
        cfg['train']['batch_size'],
        nranks,
        local_rank,
        shuffle=True).reader

    iterator = iter(tqdm(reader))

    global_step += 1

    while global_step <= cfg['train']['max_iteration']:
        try:
            batch = next(iterator)
        except StopIteration as e:
            iterator = iter(tqdm(reader))
            batch = next(iterator)

        character, mel, mel_input, pos_text, pos_mel, stop_tokens = batch

        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            character, mel_input, pos_text, pos_mel)

        mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(mel_pred, mel)))
        post_mel_loss = layers.mean(
            layers.abs(layers.elementwise_sub(postnet_pred, mel)))
        loss = mel_loss + post_mel_loss

        stop_loss = cross_entropy(
            stop_preds, stop_tokens, weight=cfg['network']['stop_loss_weight'])
        loss = loss + stop_loss

        if local_rank == 0:
            writer.add_scalar('training_loss/mel_loss',
                              mel_loss.numpy(),
                              global_step)
            writer.add_scalar('training_loss/post_mel_loss',
                              post_mel_loss.numpy(),
                              global_step)
            writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)

            if parallel:
                writer.add_scalar('alphas/encoder_alpha',
                                   model._layers.encoder.alpha.numpy(),
                                   global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                   model._layers.decoder.alpha.numpy(),
                                   global_step)
            else:
                writer.add_scalar('alphas/encoder_alpha',
                                   model.encoder.alpha.numpy(),
                                   global_step)
                writer.add_scalar('alphas/decoder_alpha',
                                   model.decoder.alpha.numpy(),
                                   global_step)

            writer.add_scalar('learning_rate',
                              optimizer._learning_rate.step().numpy(),
                              global_step)

            if global_step % cfg['train']['image_interval'] == 1:
                for i, prob in enumerate(attn_probs):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_enc):
                    for j in range(cfg['network']['encoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_enc_%d_0' % global_step,
                            x,
                            i * 4 + j)

                for i, prob in enumerate(attn_dec):
                    for j in range(cfg['network']['decoder_num_head']):
                        x = np.uint8(
                            cm.viridis(prob.numpy()[j * cfg['train'][
                                'batch_size'] // nranks]) * 255)
                        writer.add_image(
                            'Attention_dec_%d_0' % global_step,
                            x,
                            i * 4 + j)

        if parallel:
            loss = model.scale_loss(loss)
            loss.backward()
            model.apply_collective_grads()
        else:
            loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()

        # save checkpoint
        if local_rank == 0 and global_step % cfg['train'][
                'checkpoint_interval'] == 0:
            io.save_parameters(
                os.path.join(args.output, 'checkpoints'), global_step, model,
                optimizer)
        global_step += 1

    if local_rank == 0:
        writer.close()
Example #22
def main(args):
    local_rank = dg.parallel.Env().local_rank
    nranks = dg.parallel.Env().nranks
    parallel = nranks > 1

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    global_step = 0
    place = fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace()

    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output,
                                        'log')) if local_rank == 0 else None

    fluid.enable_dygraph(place)
    network_cfg = cfg['network']
    model = TransformerTTS(
        network_cfg['embedding_size'], network_cfg['hidden_size'],
        network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
        cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
        network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])

    model.train()
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=dg.NoamDecay(
            1 / (cfg['train']['warm_up_step'] *
                 (cfg['train']['learning_rate']**2)),
            cfg['train']['warm_up_step']),
        parameter_list=model.parameters(),
        grad_clip=fluid.clip.GradientClipByGlobalNorm(
            cfg['train']['grad_clip_thresh']))

    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     optimizer=optimizer,
                                     checkpoint_dir=os.path.join(
                                         args.output, 'checkpoints'),
                                     iteration=args.iteration,
                                     checkpoint_path=args.checkpoint)
    print("Rank {}: checkpoint loaded.".format(local_rank))

    if parallel:
        strategy = dg.parallel.prepare_context()
        model = fluid.dygraph.parallel.DataParallel(model, strategy)

    reader = LJSpeechLoader(cfg['audio'],
                            place,
                            args.data,
                            cfg['train']['batch_size'],
                            nranks,
                            local_rank,
                            shuffle=True).reader()

    for epoch in range(cfg['train']['max_epochs']):
        pbar = tqdm(reader)
        for i, data in enumerate(pbar):
            pbar.set_description('Processing at epoch %d' % epoch)
            character, mel, mel_input, pos_text, pos_mel = data

            global_step += 1

            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                character, mel_input, pos_text, pos_mel)

            mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(mel_pred, mel)))
            post_mel_loss = layers.mean(
                layers.abs(layers.elementwise_sub(postnet_pred, mel)))
            loss = mel_loss + post_mel_loss

            # Note: when the stop-token loss was used, training did not converge.
            if cfg['network']['stop_token']:
                label = (pos_mel == 0).astype(np.float32)
                stop_loss = cross_entropy(stop_preds, label)
                loss = loss + stop_loss

            if local_rank == 0:
                writer.add_scalars(
                    'training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)

                if cfg['network']['stop_token']:
                    writer.add_scalar('stop_loss', stop_loss.numpy(),
                                      global_step)

                if parallel:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha':
                            model._layers.encoder.alpha.numpy(),
                            'decoder_alpha':
                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                else:
                    writer.add_scalars(
                        'alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                writer.add_scalar('learning_rate',
                                  optimizer._learning_rate.step().numpy(),
                                  global_step)

                if global_step % cfg['train']['image_interval'] == 1:
                    for i, prob in enumerate(attn_probs):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_%d_0' % global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_enc):
                        for j in range(cfg['network']['encoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_enc_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

                    for i, prob in enumerate(attn_dec):
                        for j in range(cfg['network']['decoder_num_head']):
                            x = np.uint8(
                                cm.viridis(prob.numpy()[
                                    j * cfg['train']['batch_size'] // 2]) *
                                255)
                            writer.add_image('Attention_dec_%d_0' %
                                             global_step,
                                             x,
                                             i * 4 + j,
                                             dataformats="HWC")

            if parallel:
                loss = model.scale_loss(loss)
                loss.backward()
                model.apply_collective_grads()
            else:
                loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            # save checkpoint
            if local_rank == 0 and global_step % cfg['train'][
                    'checkpoint_interval'] == 0:
                io.save_parameters(os.path.join(args.output, 'checkpoints'),
                                   global_step, model, optimizer)

    if local_rank == 0:
        writer.close()
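A hedged note on the stop-token labels above: assuming pos_mel uses 0 as the padding id (the real positional ids above start from 1), (pos_mel == 0) marks exactly the frames past the end of each utterance. A numpy sketch of the labeling:

import numpy as np

pos_mel = np.array([[1, 2, 3, 4, 0, 0]])   # zeros mark padded frames
label = (pos_mel == 0).astype(np.float32)  # stop target is 1.0 on padding only
print(label)                               # [[0. 0. 0. 0. 1. 1.]]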