Example #1
    def process_a_sentence(self, model, text):
        text = np.array(
            en.text_to_sequence(
                text, p=self.p_replace), dtype=np.int64)
        length = len(text)
        text_positions = np.arange(1, 1 + length)
        text = np.expand_dims(text, 0)
        text_positions = np.expand_dims(text_positions, 0)

        model.eval()
        if isinstance(model, dg.DataParallel):
            _model = model._layers
        else:
            _model = model
        mel_outputs, linear_outputs, alignments, done = _model.transduce(
            dg.to_variable(text), dg.to_variable(text_positions))

        linear_outputs_np = linear_outputs.numpy()[0].T  # (C, T)

        wav = spec_to_waveform(linear_outputs_np, self.min_level_db,
                               self.ref_level_db, self.power, self.n_iter,
                               self.win_length, self.hop_length,
                               self.preemphasis)
        alignments_np = alignments.numpy()[0]  # batch_size = 1
        return wav, alignments_np
Example #2
    def __call__(self, examples):
        """
        output shape and dtype
        (B, T_text) int64
        (B,) int64
        (B, T_frame, C_spec) float32
        (B, T_frame, C_mel) float32
        (B,) int64
        """
        text_seqs = []
        specs = []
        mels = []
        num_frames = np.array([example[3] for example in examples],
                              dtype=np.int64)
        max_frames = np.max(num_frames)

        for example in examples:
            text, spec, mel, _ = example
            text_seqs.append(en.text_to_sequence(text, self.p_pronunciation))
            specs.append(
                np.pad(spec, [(0, max_frames - spec.shape[0]), (0, 0)]))
            mels.append(np.pad(mel, [(0, max_frames - mel.shape[0]), (0, 0)]))

        specs = np.stack(specs)
        mels = np.stack(mels)

        text_lengths = np.array([len(seq) for seq in text_seqs],
                                dtype=np.int64)
        max_length = np.max(text_lengths)
        text_seqs = np.array(
            [seq + [0] * (max_length - len(seq)) for seq in text_seqs],
            dtype=np.int64)
        return text_seqs, text_lengths, specs, mels, num_frames
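
A minimal, self-contained sketch of the same padding logic on a toy batch (plain numpy only; the real pipeline builds its tuples with en.text_to_sequence, which is assumed here) helps verify the documented output shapes:

import numpy as np

# toy batch of (text_ids, spec, mel, num_frames) tuples, mirroring the examples above
examples = [
    (list(range(5)), np.zeros((7, 513), "float32"), np.zeros((7, 80), "float32"), 7),
    (list(range(3)), np.zeros((4, 513), "float32"), np.zeros((4, 80), "float32"), 4),
]
num_frames = np.array([e[3] for e in examples], dtype=np.int64)
max_frames = num_frames.max()
specs = np.stack([np.pad(e[1], [(0, max_frames - e[1].shape[0]), (0, 0)]) for e in examples])
mels = np.stack([np.pad(e[2], [(0, max_frames - e[2].shape[0]), (0, 0)]) for e in examples])
text_lengths = np.array([len(e[0]) for e in examples], dtype=np.int64)
max_length = text_lengths.max()
text_seqs = np.array([e[0] + [0] * (max_length - len(e[0])) for e in examples], dtype=np.int64)
print(text_seqs.shape, text_lengths, specs.shape, mels.shape, num_frames)
# -> (2, 5) [5 3] (2, 7, 513) (2, 7, 80) [7 4]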
Example #3
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(model=model,
                                     checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text)
    pos_text = dg.to_variable(pos_text)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    result = np.exp(mel_output_postnet.numpy())
    mel_output_postnet = fluid.layers.transpose(
        fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
    mel_output_postnet = np.exp(mel_output_postnet.numpy())
    basis = librosa.filters.mel(cfg['audio']['sr'], cfg['audio']['n_fft'],
                                cfg['audio']['num_mels'])
    inv_basis = np.linalg.pinv(basis)
    spec = np.maximum(1e-10, np.dot(inv_basis, mel_output_postnet))

    # synthesize with ClariNet
    wav_clarinet = synthesis_with_clarinet(args.config_clarinet,
                                           args.checkpoint_clarinet, result,
                                           place)
    writer.add_audio(text_input + '(clarinet)', wav_clarinet, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(os.path.join(args.output, 'samples'), 'clarinet.wav'),
          cfg['audio']['sr'], wav_clarinet)

    # synthesize with Griffin-Lim
    wav = librosa.core.griffinlim(spec**cfg['audio']['power'],
                                  hop_length=cfg['audio']['hop_length'],
                                  win_length=cfg['audio']['win_length'])
    writer.add_audio(text_input + '(griffin-lim)', wav, 0, cfg['audio']['sr'])
    write(
        os.path.join(os.path.join(args.output, 'samples'), 'griffin-lim.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #4
def synthesize(args, config, model, vocoder, sentence, monotonic_layers):
    print("[synthesize] {}".format(sentence))
    text = en.text_to_sequence(sentence, p=1.0)
    text = np.expand_dims(np.array(text, dtype="int64"), 0)
    lengths = np.array([text.size], dtype=np.int64)
    text_seqs = dg.to_variable(text)
    text_lengths = dg.to_variable(lengths)

    decoder_layers = config["decoder_layers"]
    force_monotonic_attention = [False] * decoder_layers
    for i in monotonic_layers:
        force_monotonic_attention[i] = True

    with dg.no_grad():
        outputs = model(text_seqs,
                        text_lengths,
                        speakers=None,
                        force_monotonic_attention=force_monotonic_attention,
                        window=(config["backward_step"],
                                config["forward_step"]))
        decoded, refined, attentions = outputs
        if args.vocoder == "griffin-lim":
            wav_np = vocoder(refined.numpy()[0].T)
        else:
            wav = vocoder(F.transpose(refined, (0, 2, 1)))
            wav_np = wav.numpy()[0]
    return wav_np
Example #5
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())
    fluid.enable_dygraph(place)

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    model = FastSpeech(cfg['network'], num_mels=cfg['audio']['num_mels'])
    # Load parameters.
    global_step = io.load_parameters(
        model=model, checkpoint_path=args.checkpoint)
    model.eval()

    text = np.asarray(text_to_sequence(text_input))
    text = np.expand_dims(text, axis=0)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = np.expand_dims(pos_text, axis=0)

    text = dg.to_variable(text).astype(np.int64)
    pos_text = dg.to_variable(pos_text).astype(np.int64)

    _, mel_output_postnet = model(text, pos_text, alpha=args.alpha)

    if args.vocoder == 'griffin-lim':
        # synthesize with Griffin-Lim
        wav = synthesis_with_griffinlim(mel_output_postnet, cfg['audio'])
    elif args.vocoder == 'waveflow':
        wav = synthesis_with_waveflow(mel_output_postnet, args,
                                      args.checkpoint_vocoder, place)
    else:
        raise ValueError(
            'vocoder error, we only support griffin-lim and waveflow, but received %s.'
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(
            os.path.join(args.output, 'samples'), args.vocoder + '.wav'),
        cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #6
    def __call__(self, in_data):
        fname, _, normalized_text = in_data

        # text processing
        mix_grapheme_phonemes = text_to_sequence(
            normalized_text, self.replace_pronounciation_prob)
        text_length = len(mix_grapheme_phonemes)
        # CAUTION: positions start from 1
        speaker_id = None

        # wave processing
        wav, _ = librosa.load(fname, sr=self.sample_rate)
        # preemphasis
        y = signal.lfilter([1., -self.preemphasis], [1.], wav)

        # STFT
        D = librosa.stft(y=y,
                         n_fft=self.n_fft,
                         win_length=self.win_length,
                         hop_length=self.hop_length)
        S = np.abs(D)

        # to db and normalize to 0-1
        amplitude_min = np.exp(self.min_level_db / 20 * np.log(10))  # 1e-5
        S_norm = 20 * np.log10(np.maximum(amplitude_min,
                                          S)) - self.ref_level_db
        S_norm = (S_norm - self.min_level_db) / (-self.min_level_db)
        S_norm = self.max_norm * S_norm
        if self.clip_norm:
            S_norm = np.clip(S_norm, 0, self.max_norm)

        # mel scale and to db and normalize to 0-1,
        # CAUTION: pass linear scale S, not dbscaled S
        S_mel = librosa.feature.melspectrogram(S=S,
                                               n_mels=self.n_mels,
                                               fmin=self.fmin,
                                               fmax=self.fmax,
                                               power=1.)
        S_mel = 20 * np.log10(np.maximum(amplitude_min,
                                         S_mel)) - self.ref_level_db
        S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
        S_mel_norm = self.max_norm * S_mel_norm
        if self.clip_norm:
            S_mel_norm = np.clip(S_mel_norm, 0, self.max_norm)

        # num_frames
        n_frames = S_mel_norm.shape[-1]  # CAUTION: original number of frames
        return (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
                S_mel_norm, n_frames)
Example #7
def eval_model(model, text, replace_pronounciation_prob, min_level_db,
               ref_level_db, power, n_iter, win_length, hop_length,
               preemphasis):
    """generate waveform from text using a deepvoice 3 model"""
    text = np.array(en.text_to_sequence(text, p=replace_pronounciation_prob),
                    dtype=np.int64)
    length = len(text)
    print("text sequence's length: {}".format(length))
    text_positions = np.arange(1, 1 + length)

    text = np.expand_dims(text, 0)
    text_positions = np.expand_dims(text_positions, 0)
    model.eval()
    mel_outputs, linear_outputs, alignments, done = model.transduce(
        dg.to_variable(text), dg.to_variable(text_positions))

    linear_outputs_np = linear_outputs.numpy()[0].T  # (C, T)
    wav = spec_to_waveform(linear_outputs_np, min_level_db, ref_level_db,
                           power, n_iter, win_length, hop_length, preemphasis)
    alignments_np = alignments.numpy()[0]  # batch_size = 1
    print("linear_outputs's shape: ", linear_outputs_np.shape)
    print("alignmnets' shape:", alignments.shape)
    return wav, alignments_np
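
spec_to_waveform itself is not shown in these examples. Judging from the arguments it receives here and the forward normalization in Example #6, it presumably denormalizes the predicted linear spectrogram, converts dB back to amplitude, runs Griffin-Lim, and undoes the pre-emphasis. A rough sketch under those assumptions (not the actual Parakeet implementation):

import numpy as np
import librosa
from scipy import signal

def spec_to_waveform_sketch(spec, min_level_db, ref_level_db, power,
                            n_iter, win_length, hop_length, preemphasis):
    """Hypothetical inverse of the normalization from Example #6, then Griffin-Lim."""
    # undo the 0-1 normalization (assumes max_norm == 1.0 and no clipping loss)
    denorm_db = spec * (-min_level_db) + min_level_db
    # dB back to linear amplitude
    amplitude = np.power(10.0, (denorm_db + ref_level_db) / 20.0)
    # Griffin-Lim phase reconstruction; `power` sharpens the magnitudes first
    wav = librosa.griffinlim(amplitude**power, n_iter=n_iter,
                             hop_length=hop_length, win_length=win_length)
    # undo the pre-emphasis filter applied during feature extraction
    return signal.lfilter([1.0], [1.0, -preemphasis], wav)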
Example #8
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(csv_path,
                            sep="|",
                            header=None,
                            quoting=csv.QUOTE_NONE,
                            names=["fname", "raw_text", "normalized_text"])
        ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
            wav = ljspeech_processor.load_wav(
                os.path.join(args.data, 'wavs', fname + ".wav"))
            mel_input = ljspeech_processor.melspectrogram(wav).astype(
                np.float32)
            mel_input = np.transpose(mel_input, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            dec_slf_mask = get_triu_tensor(mel_input,
                                           mel_input).astype(np.float32)
            dec_slf_mask = np.expand_dims(dec_slf_mask, axis=0)
            dec_slf_mask = fluid.layers.cast(dg.to_variable(dec_slf_mask != 0),
                                             np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.txt', "wb") as f:
            pickle.dump(alignments, f)
Example #9
    def _get_example(self, metadatum):
        wave_file, speaker, text = metadatum
        wav_path = self.wav_root.joinpath(speaker, wave_file)
        wav, sr = librosa.load(str(wav_path), sr=None)
        phoneme_seq = np.array(text_to_sequence(text))
        return wav, self.speaker_indices[speaker], phoneme_seq
Example #10
def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    writer = SummaryWriter(path)

    with dg.guard(place):
        with fluid.unique_name.guard():
            model = TransformerTTS(cfg)
            model.set_dict(
                load_checkpoint(
                    str(args.transformer_step),
                    os.path.join(args.checkpoint_path, "transformer")))
            model.eval()

        with fluid.unique_name.guard():
            model_vocoder = Vocoder(cfg, args.batch_size)
            model_vocoder.set_dict(
                load_checkpoint(
                    str(args.vocoder_step),
                    os.path.join(args.checkpoint_path, "vocoder")))
            model_vocoder.eval()
        # init input
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
        mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

        pbar = tqdm(range(args.max_len))
        for i in pbar:
            dec_slf_mask = get_triu_tensor(
                mel_input.numpy(), mel_input.numpy()).astype(np.float32)
            dec_slf_mask = fluid.layers.cast(
                dg.to_variable(dec_slf_mask != 0), np.float32) * (-2**32 + 1)
            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel, dec_slf_mask)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

        mag_pred = model_vocoder(postnet_pred)

        _ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        wav = _ljspeech_processor.inv_spectrogram(
            fluid.layers.transpose(
                fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
        global_step = 0
        for i, prob in enumerate(attn_probs):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        for i, prob in enumerate(attn_enc):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_enc_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")

        for i, prob in enumerate(attn_dec):
            for j in range(4):
                x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
                writer.add_image(
                    'Attention_dec_%d_0' % global_step,
                    x,
                    i * 4 + j,
                    dataformats="HWC")
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        if not os.path.exists(args.sample_path):
            os.mkdir(args.sample_path)
        write(
            os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
            wav)
    writer.close()
Example #11
    def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
        """
        Get the synthetic wavs from the texts.

        Args:
             texts(list): the input texts to be predicted.
             use_gpu(bool): whether to use GPU for prediction. Default False.
             vocoder(str): the vocoder name, "griffin-lim" or "waveflow".

        Returns:
             wavs(list): the synthesized waveforms. You can use soundfile.write to save them.
             sample_rate(int): the audio sample rate.
        """
        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set to False because the environment variable CUDA_VISIBLE_DEVICES is not set, although use_gpu=True was requested."
            )

        place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()

        if texts and isinstance(texts, list):
            predicted_data = texts
        else:
            raise ValueError(
                "The input data is inconsistent with expectations.")

        wavs = []
        with fluid.dygraph.guard(place):
            self.tts_model.eval()
            self.waveflow.model.eval()
            monotonic_layers = [4]
            for text in predicted_data:
                # init input
                logger.info("Processing sentence: %s" % text)
                text = en.text_to_sequence(text, p=1.0)
                text = np.expand_dims(np.array(text, dtype="int64"), 0)
                lengths = np.array([text.size], dtype=np.int64)
                text_seqs = dg.to_variable(text)
                text_lengths = dg.to_variable(lengths)

                decoder_layers = self.tts_config["decoder_layers"]
                force_monotonic_attention = [False] * decoder_layers
                for i in monotonic_layers:
                    force_monotonic_attention[i] = True

                outputs = self.tts_model(
                    text_seqs,
                    text_lengths,
                    speakers=None,
                    force_monotonic_attention=force_monotonic_attention,
                    window=(self.tts_config["backward_step"],
                            self.tts_config["forward_step"]))
                decoded, refined, attentions = outputs
                if vocoder == 'griffin-lim':
                    # synthesize with Griffin-Lim
                    wav = self.griffin(refined.numpy()[0].T)
                elif vocoder == 'waveflow':
                    # synthesize with WaveFlow
                    wav = self.waveflow(
                        fluid.layers.transpose(refined, [0, 2, 1])).numpy()[0]
                else:
                    raise ValueError(
                        'vocoder error, we only support griffin-lim and waveflow, but received %s.'
                        % vocoder)
                wavs.append(wav)
        return wavs, self.tts_config["sample_rate"]
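
Assuming this method lives on a loaded TTS module object (the name `module` below is only a placeholder), the returned list of waveforms can be written to disk with soundfile, as the docstring suggests:

import soundfile as sf

# `module` is a placeholder for whatever object exposes the synthesize() method above
wavs, sample_rate = module.synthesize(
    ["Life was like a box of chocolates."], use_gpu=False, vocoder="griffin-lim")
for i, wav in enumerate(wavs):
    sf.write("sentence_{}.wav".format(i), wav, sample_rate)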
Example #12
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    with fluid.unique_name.guard():
        model_vocoder = Vocoder(cfg['train']['batch_size'],
                                cfg['vocoder']['hidden_size'],
                                cfg['audio']['num_mels'],
                                cfg['audio']['n_fft'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model_vocoder, checkpoint_path=args.checkpoint_vocoder)
        model_vocoder.eval()
    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

    pbar = tqdm(range(args.max_len))
    for i in pbar:
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]],
                                        axis=1)

    mag_pred = model_vocoder(postnet_pred)

    _ljspeech_processor = audio.AudioProcessor(
        sample_rate=cfg['audio']['sr'],
        num_mels=cfg['audio']['num_mels'],
        min_level_db=cfg['audio']['min_level_db'],
        ref_level_db=cfg['audio']['ref_level_db'],
        n_fft=cfg['audio']['n_fft'],
        win_length=cfg['audio']['win_length'],
        hop_length=cfg['audio']['hop_length'],
        power=cfg['audio']['power'],
        preemphasis=cfg['audio']['preemphasis'],
        signal_norm=True,
        symmetric_norm=False,
        max_norm=1.,
        mel_fmin=0,
        mel_fmax=None,
        clip_norm=True,
        griffin_lim_iters=60,
        do_trim_silence=False,
        sound_norm=False)

    # synthesize with the CBHG vocoder
    wav = _ljspeech_processor.inv_spectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(mag_pred, [0]),
                               [1, 0]).numpy())
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step,
                             x,
                             i * 4 + j,
                             dataformats="HWC")

    writer.add_audio(text_input + '(cbhg)', wav, 0, cfg['audio']['sr'])

    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(os.path.join(os.path.join(args.output, 'samples'), 'cbhg.wav'),
          cfg['audio']['sr'], wav)

    # synthesize with Griffin-Lim
    wav = _ljspeech_processor.inv_melspectrogram(
        fluid.layers.transpose(fluid.layers.squeeze(postnet_pred, [0]),
                               [1, 0]).numpy())
    writer.add_audio(text_input + '(griffin)', wav, 0, cfg['audio']['sr'])

    write(os.path.join(os.path.join(args.output, 'samples'), 'griffin.wav'),
          cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #13
def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    writer = SummaryWriter(path)

    with dg.guard(place):
        model = FastSpeech(cfg)
        model.set_dict(
            load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech")))
        model.eval()

        text = np.asarray(text_to_sequence(text_input))
        text = np.expand_dims(text, axis=0)
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = np.expand_dims(pos_text, axis=0)
        enc_non_pad_mask = get_non_pad_mask(pos_text).astype(np.float32)
        enc_slf_attn_mask = get_attn_key_pad_mask(pos_text,
                                                  text).astype(np.float32)

        text = dg.to_variable(text)
        pos_text = dg.to_variable(pos_text)
        enc_non_pad_mask = dg.to_variable(enc_non_pad_mask)
        enc_slf_attn_mask = dg.to_variable(enc_slf_attn_mask)

        mel_output, mel_output_postnet = model(
            text,
            pos_text,
            alpha=args.alpha,
            enc_non_pad_mask=enc_non_pad_mask,
            enc_slf_attn_mask=enc_slf_attn_mask,
            dec_non_pad_mask=None,
            dec_slf_attn_mask=None)

        _ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
            symmetric_norm=False,
            max_norm=1.,
            mel_fmin=0,
            mel_fmax=None,
            clip_norm=True,
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

        mel_output_postnet = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
        wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        print("Synthesis completed !!!")
    writer.close()
Example #14
def synthesis(text_input, args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    # tensorboard
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    writer = SummaryWriter(os.path.join(args.output, 'log'))

    fluid.enable_dygraph(place)
    with fluid.unique_name.guard():
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

    # init input
    text = np.asarray(text_to_sequence(text_input))
    text = fluid.layers.unsqueeze(dg.to_variable(text).astype(np.int64), [0])
    mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
    pos_text = np.arange(1, text.shape[1] + 1)
    pos_text = fluid.layers.unsqueeze(
        dg.to_variable(pos_text).astype(np.int64), [0])

    for i in range(args.max_len):
        pos_mel = np.arange(1, mel_input.shape[1] + 1)
        pos_mel = fluid.layers.unsqueeze(
            dg.to_variable(pos_mel).astype(np.int64), [0])
        mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
            text, mel_input, pos_text, pos_mel)
        if stop_preds.numpy()[0, -1] > args.stop_threshold:
            break
        mel_input = fluid.layers.concat([mel_input, postnet_pred[:, -1:, :]],
                                        axis=1)
    global_step = 0
    for i, prob in enumerate(attn_probs):
        for j in range(4):
            x = np.uint8(cm.viridis(prob.numpy()[j]) * 255)
            writer.add_image('Attention_%d_0' % global_step,
                             x,
                             i * 4 + j,
                             dataformats="HWC")

    if args.vocoder == 'griffin-lim':
        # synthesize with Griffin-Lim
        wav = synthesis_with_griffinlim(postnet_pred, cfg['audio'])
    elif args.vocoder == 'waveflow':
        # synthesize with WaveFlow
        wav = synthesis_with_waveflow(postnet_pred, args,
                                      args.checkpoint_vocoder, place)
    else:
        raise ValueError(
            'vocoder error, we only support griffin-lim and waveflow, but received %s.'
            % args.vocoder)

    writer.add_audio(text_input + '(' + args.vocoder + ')', wav, 0,
                     cfg['audio']['sr'])
    if not os.path.exists(os.path.join(args.output, 'samples')):
        os.mkdir(os.path.join(args.output, 'samples'))
    write(
        os.path.join(os.path.join(args.output, 'samples'),
                     args.vocoder + '.wav'), cfg['audio']['sr'], wav)
    print("Synthesis completed !!!")
    writer.close()
Example #15
def alignments(args):
    local_rank = dg.parallel.Env().local_rank
    place = (fluid.CUDAPlace(local_rank) if args.use_gpu else fluid.CPUPlace())

    with open(args.config) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)

    with dg.guard(place):
        network_cfg = cfg['network']
        model = TransformerTTS(
            network_cfg['embedding_size'], network_cfg['hidden_size'],
            network_cfg['encoder_num_head'], network_cfg['encoder_n_layers'],
            cfg['audio']['num_mels'], network_cfg['outputs_per_step'],
            network_cfg['decoder_num_head'], network_cfg['decoder_n_layers'])
        # Load parameters.
        global_step = io.load_parameters(
            model=model, checkpoint_path=args.checkpoint_transformer)
        model.eval()

        # get text data
        root = Path(args.data)
        csv_path = root.joinpath("metadata.csv")
        table = pd.read_csv(
            csv_path,
            sep="|",
            header=None,
            quoting=csv.QUOTE_NONE,
            names=["fname", "raw_text", "normalized_text"])

        pbar = tqdm(range(len(table)))
        alignments = OrderedDict()
        for i in pbar:
            fname, raw_text, normalized_text = table.iloc[i]
            # init input
            text = np.asarray(text_to_sequence(normalized_text))
            text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
            pos_text = np.arange(1, text.shape[1] + 1)
            pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

            # load
            wav, _ = librosa.load(
                str(os.path.join(args.data, 'wavs', fname + ".wav")))

            spec = librosa.stft(
                y=wav,
                n_fft=cfg['audio']['n_fft'],
                win_length=cfg['audio']['win_length'],
                hop_length=cfg['audio']['hop_length'])
            mag = np.abs(spec)
            mel = librosa.filters.mel(sr=cfg['audio']['sr'],
                                      n_fft=cfg['audio']['n_fft'],
                                      n_mels=cfg['audio']['num_mels'],
                                      fmin=cfg['audio']['fmin'],
                                      fmax=cfg['audio']['fmax'])
            mel = np.matmul(mel, mag)
            mel = np.log(np.maximum(mel, 1e-5))

            mel_input = np.transpose(mel, axes=(1, 0))
            mel_input = fluid.layers.unsqueeze(dg.to_variable(mel_input), [0])
            mel_lens = mel_input.shape[1]

            pos_mel = np.arange(1, mel_input.shape[1] + 1)
            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                text, mel_input, pos_text, pos_mel)
            mel_input = fluid.layers.concat(
                [mel_input, postnet_pred[:, -1:, :]], axis=1)

            alignment, _ = get_alignment(attn_probs, mel_lens,
                                         network_cfg['decoder_num_head'])
            alignments[fname] = alignment
        with open(args.output + '.pkl', "wb") as f:
            pickle.dump(alignments, f)
Example #16
    def synthesize(self,
                   texts,
                   use_gpu=False,
                   speed=1.0,
                   vocoder="griffin-lim"):
        """
        Get the synthetic wavs from the texts.

        Args:
             texts(list): the input texts to be predicted.
             use_gpu(bool): whether to use GPU for prediction. Default False.
             speed(float): controls the voice speed. Default 1.0.
             vocoder(str): the vocoder name, "griffin-lim" or "waveflow".

        Returns:
             wavs(list): the synthesized waveforms. You can use soundfile.write to save them.
             sample_rate(int): the audio sample rate.
        """
        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set to False because the environment variable CUDA_VISIBLE_DEVICES is not set, although use_gpu=True was requested."
            )
        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        if texts and isinstance(texts, list):
            predicted_data = texts
        else:
            raise ValueError(
                "The input data is inconsistent with expectations.")

        wavs = []
        with fluid.dygraph.guard(place):
            self.tts_model.eval()
            self.waveflow.eval()
            for text in predicted_data:
                # init input
                logger.info("Processing sentence: %s" % text)
                text = np.asarray(text_to_sequence(text))
                text = np.expand_dims(text, axis=0)
                pos_text = np.arange(1, text.shape[1] + 1)
                pos_text = np.expand_dims(pos_text, axis=0)

                text = dg.to_variable(text).astype(np.int64)
                pos_text = dg.to_variable(pos_text).astype(np.int64)

                _, mel_output_postnet = self.tts_model(text,
                                                       pos_text,
                                                       alpha=1 / speed)

                if vocoder == 'griffin-lim':
                    # synthesize with Griffin-Lim
                    wav = self.synthesis_with_griffinlim(
                        mel_output_postnet, self.tts_config['audio'])
                elif vocoder == 'waveflow':
                    wav = self.synthesis_with_waveflow(
                        mel_output_postnet, self.waveflow_config.sigma)
                else:
                    raise ValueError(
                        'vocoder error, we only support griffin-lim and waveflow, but received %s.'
                        % vocoder)
                wavs.append(wav)
        return wavs, self.tts_config['audio']['sr']
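
As with Example #11, a hedged usage sketch (the `module` name is again a placeholder); this variant additionally exposes a speed factor:

import soundfile as sf

# `module` again stands in for the object exposing synthesize()
wavs, sample_rate = module.synthesize(
    ["Simple as this proposition is, it is necessary to be stated."],
    use_gpu=False, speed=1.2, vocoder="waveflow")
sf.write("fastspeech_waveflow.wav", wavs[0], sample_rate)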
Example #17
    def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
        """
        Get the synthetic wavs from the texts.

        Args:
             texts(list): the input texts to be predicted.
             use_gpu(bool): whether to use GPU for prediction. Default False.
             vocoder(str): the vocoder name, "griffin-lim" or "waveflow".

        Returns:
             wavs(list): the synthesized waveforms. You can use soundfile.write to save them.
             sample_rate(int): the audio sample rate.
        """
        if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
            use_gpu = False
            logger.warning(
                "use_gpu has been set to False because the environment variable CUDA_VISIBLE_DEVICES is not set, although use_gpu=True was requested."
            )
        if use_gpu:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        if texts and isinstance(texts, list):
            predicted_data = texts
        else:
            raise ValueError(
                "The input data is inconsistent with expectations.")

        wavs = []
        with fluid.dygraph.guard(place):
            self.tts_model.eval()
            self.waveflow.eval()
            for text in predicted_data:
                # init input
                logger.info("Processing sentence: %s" % text)
                text = np.asarray(text_to_sequence(text))
                text = fluid.layers.unsqueeze(
                    dg.to_variable(text).astype(np.int64), [0])
                mel_input = dg.to_variable(np.zeros([1, 1,
                                                     80])).astype(np.float32)
                pos_text = np.arange(1, text.shape[1] + 1)
                pos_text = fluid.layers.unsqueeze(
                    dg.to_variable(pos_text).astype(np.int64), [0])

                for i in range(self.max_len):
                    pos_mel = np.arange(1, mel_input.shape[1] + 1)
                    pos_mel = fluid.layers.unsqueeze(
                        dg.to_variable(pos_mel).astype(np.int64), [0])
                    mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = self.tts_model(
                        text, mel_input, pos_text, pos_mel)
                    if stop_preds.numpy()[0, -1] > self.stop_threshold:
                        break
                    mel_input = fluid.layers.concat(
                        [mel_input, postnet_pred[:, -1:, :]], axis=1)
                if vocoder == 'griffin-lim':
                    # synthesize with Griffin-Lim
                    wav = self.synthesis_with_griffinlim(
                        postnet_pred, self.tts_config['audio'])
                elif vocoder == 'waveflow':
                    # synthesize with WaveFlow
                    wav = self.synthesis_with_waveflow(
                        postnet_pred, self.waveflow_config.sigma)
                else:
                    raise ValueError(
                        'vocoder error, we only support griffin-lim and waveflow, but received %s.'
                        % vocoder)
                wavs.append(wav)
        return wavs, self.tts_config['audio']['sr']