Example #1
def __getitem__(self, index):
    id = self._metadata[index][4].split(".")[0]
    x_ = self._metadata[index][3].split()
    if self.use_phonemes:
        x = phonemes_to_sequence(x_)
    else:
        x = text_to_sequence(x_, self.tts_cleaner_names, self.eos)
    mel = np.load(f"{self.path}mels/{id}.npy")
    durations = str_to_int_list(self._metadata[index][2])
    e = remove_outlier(
        np.load(f"{self.path}energy/{id}.npy")
    )  # self._norm_mean_std(np.load(f'{self.path}energy/{id}.npy'), self.e_mean, self.e_std, True)
    p = remove_outlier(
        np.load(f"{self.path}pitch/{id}.npy")
    )  # self._norm_mean_std(np.load(f'{self.path}pitch/{id}.npy'), self.f0_mean, self.f0_std, True)
    mel_len = mel.shape[1]
    # Trim durations to the token sequence, then absorb any rounding
    # mismatch into the last duration so the sum matches the mel length.
    durations = durations[:len(x)]
    durations[-1] = durations[-1] + (mel.shape[1] - sum(durations))
    assert mel.shape[1] == sum(durations)
    return (
        np.array(x),
        mel.T,  # Mel [T, num_mel]
        id,
        mel_len,
        np.array(durations),
        e,
        p,
    )
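`__getitem__` here returns variable-length sequences, so batching through a PyTorch `DataLoader` needs a padding collate function. The following is a minimal sketch, not part of the original code: `collate_tts` and the `_pad_*` helpers are hypothetical names, and the tuple order is assumed to match the return statement above.

import numpy as np
from torch.utils.data import DataLoader

def _pad_1d(arrays, pad=0):
    # Pad each 1-D array to the length of the longest one in the batch.
    max_len = max(a.shape[0] for a in arrays)
    return np.stack(
        [np.pad(a, (0, max_len - a.shape[0]), constant_values=pad) for a in arrays]
    )

def _pad_2d(arrays, pad=0.0):
    # Pad each [T, num_mel] array along the time axis.
    max_len = max(a.shape[0] for a in arrays)
    return np.stack(
        [np.pad(a, ((0, max_len - a.shape[0]), (0, 0)), constant_values=pad) for a in arrays]
    )

def collate_tts(batch):
    # batch is a list of tuples in the order returned by __getitem__ above.
    xs, mels, ids, mel_lens, durations, es, ps = zip(*batch)
    return (
        _pad_1d(xs),
        _pad_2d(mels),
        list(ids),
        np.array(mel_lens),
        _pad_1d(durations),
        _pad_1d(es),
        _pad_1d(ps),
    )

# loader = DataLoader(dataset, batch_size=16, collate_fn=collate_tts)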
Example #2
def __getitem__(self, index):
    id = self._metadata[index][4].split(".")[0]
    x_ = self._metadata[index][3].split()
    if hp.use_phonemes:
        x = phonemes_to_sequence(x_)
    else:
        x = text_to_sequence(x_, hp.tts_cleaner_names)
    mel = np.load(f"{self.path}mels/{id}.npy")
    durations = str_to_int_list(self._metadata[index][2])
    e = np.load(f"{self.path}energy/{id}.npy")
    p = np.load(f"{self.path}pitch/{id}.npy")
    mel_len = mel.shape[1]
    # Absorb any rounding mismatch into the last duration so the
    # durations sum to the mel length.
    durations[-1] = durations[-1] + (mel.shape[1] - sum(durations))
    return np.array(x), mel.T, id, mel_len, np.array(durations), e, p  # Mel [T, num_mel]
Example #3
def synthesis(args, text, hp):
    """Decode with E2E-TTS model."""
    set_deterministic_pytorch(args)
    # read training config
    idim = hp.symbol_len
    odim = hp.num_mels
    model = FeedForwardTransformer(idim, odim, hp)
    print(model)

    if os.path.exists(args.path):
        print("\nSynthesis Session...\n")
        model.load_state_dict(torch.load(args.path), strict=False)
    else:
        print("Checkpoint not exixts")
        return None

    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    input = np.asarray(phonemes_to_sequence(text.split()))
    text = torch.LongTensor(input)
    text = text.to(device)  # [num_char]; unlike .cuda(), this also works on CPU

    with torch.no_grad():
        # decode and write
        idx = input[:5]
        start_time = time.time()
        print("text :", text.size())
        outs, probs, att_ws = model.inference(text, hp)
        print("Out size : ", outs.size())

        logging.info("inference speed = %s msec / frame." %
                     ((time.time() - start_time) / (int(outs.size(0)) * 1000)))
        if outs.size(0) == text.size(0) * args.maxlenratio:
            logging.warning("output length reaches maximum length .")

        print("mels", outs.size())
        mel = outs.cpu().numpy()  # [T_out, num_mel]
        print("numpy ", mel.shape)

        return mel
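Since `synthesis` returns the mel-spectrogram as a NumPy array, it can be saved and handed to a vocoder in a separate step. A hypothetical invocation, assuming `args` carries the `path`, `ngpu`, and `maxlenratio` fields the function reads; the checkpoint path and phoneme string are placeholders:

import argparse

args = argparse.Namespace(path="checkpoints/last.pth", ngpu=1, maxlenratio=10)
mel = synthesis(args, "HH AH0 L OW1", hp)  # hp configured as in training
if mel is not None:
    np.save("out_mel.npy", mel)  # hand off to a vocoder separately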
Example #4
def synth(text, model, hp):
    """Decode with E2E-TTS model."""

    print("TTS synthesis")

    model.eval()
    # set torch device
    device = torch.device("cuda" if hp.train.ngpu > 0 else "cpu")
    model = model.to(device)

    input = np.asarray(phonemes_to_sequence(text))

    text = torch.LongTensor(input)
    text = text.to(device)

    with torch.no_grad():
        print("predicting")
        outs = model.inference(text)  # model(text) for jit script
        mel = outs
    return mel
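`synth` expects an already-loaded model and a phoneme list, since it passes `text` to `phonemes_to_sequence` without splitting. A usage sketch under those assumptions; the checkpoint path and phoneme tokens are illustrative, not from the original code:

# Hypothetical usage; FeedForwardTransformer, hp, and the checkpoint
# path come from the surrounding project and are assumed here.
model = FeedForwardTransformer(hp.symbol_len, hp.num_mels, hp)
model.load_state_dict(torch.load("checkpoints/last.pth"), strict=False)
phonemes = ["HH", "AH0", "L", "OW1"]  # assumed phoneme tokens
mel = synth(phonemes, model, hp)  # [T_out, num_mel] mel tensor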
Example #5
def synthesis_tts(args, text, path):
    """Decode with E2E-TTS model."""
    set_deterministic_pytorch(args)
    print("TTS synthesis")
    # read training config
    idim = hp.symbol_len
    odim = hp.num_mels
    print("Text :", text)
    input = np.asarray(phonemes_to_sequence(text.split()))
    print("Input :", input)
    model = FeedForwardTransformer(idim, odim)

    if os.path.exists(path):
        logging.info('\nSynthesis Session...\n')
        model.load_state_dict(torch.load(path), strict=False)
    else:
        logging.info("Checkpoint not exixts")
        return None

    model.eval()

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    model = model.to(device)

    text = torch.LongTensor(input)
    text = text.to(device)  # [num_char]

    # define a helper to plot probabilities and attention weights
    def _plot_and_save(array, figname, figsize=(6, 4), dpi=150):
        import matplotlib.pyplot as plt
        shape = array.shape
        if len(shape) == 1:
            # for eos probability
            plt.figure(figsize=figsize, dpi=dpi)
            plt.plot(array)
            plt.xlabel("Frame")
            plt.ylabel("Probability")
            plt.ylim([0, 1])
        elif len(shape) == 2:
            # for tacotron 2 attention weights, whose shape is (out_length, in_length)
            plt.figure(figsize=figsize, dpi=dpi)
            plt.imshow(array, aspect="auto")
            plt.xlabel("Input")
            plt.ylabel("Output")
        elif len(shape) == 4:
            # for transformer attention weights, whose shape is (#layers, #heads, out_length, in_length)
            plt.figure(figsize=(figsize[0] * shape[0], figsize[1] * shape[1]), dpi=dpi)
            for idx1, xs in enumerate(array):
                for idx2, x in enumerate(xs, 1):
                    plt.subplot(shape[0], shape[1], idx1 * shape[1] + idx2)
                    plt.imshow(x, aspect="auto")
                    plt.xlabel("Input")
                    plt.ylabel("Output")
        else:
            raise NotImplementedError("Supports only 1D, 2D, or 4D arrays.")
        plt.tight_layout()
        if not os.path.exists(os.path.dirname(figname)):
            # NOTE: exist_ok = True is needed for parallel process decoding
            os.makedirs(os.path.dirname(figname), exist_ok=True)
        plt.savefig(figname)
        plt.close()

    with torch.no_grad():
        # decode and write
        idx = input[:5]
        start_time = time.time()
        print("predicting")
        outs, probs, att_ws = model.inference(text, args)

        logging.info("inference speed = %s msec / frame." % (
            (time.time() - start_time) / (int(outs.size(0)) * 1000)))
        if outs.size(0) == text.size(0) * 5:
            logging.warning("output length reaches maximum length .")

        mel = outs  # .cpu().numpy()  # [T_out, num_mel]

        return mel