Example #1
class TextMelLoader(torch.utils.data.Dataset):
    """
        1) loads audio,text pairs
        2) normalizes text and converts them to sequences of one-hot vectors
        3) computes mel-spectrograms from audio files.
    """
    def __init__(self, args, files):
        self.audiopaths_and_text = load_filepaths_and_text(
            args.dataset_path, files)
        self.text_cleaners = args.text_cleaners
        self.max_wav_value = args.max_wav_value
        self.sampling_rate = args.sampling_rate
        self.load_mel_from_disk = args.load_mel_from_disk
        self.stft = TacotronSTFT(args.filter_length, args.hop_length,
                                 args.win_length, args.n_mel_channels,
                                 args.sampling_rate, args.mel_fmin,
                                 args.mel_fmax)
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

    def get_mel_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text_len = len(text)
        text = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text, mel, text_len)

    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                raise ValueError("{} {} SR doesn't match target {} SR".format(
                    filename, sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            # mels on disk are stored as (frames, n_mels); transpose to (n_mels, frames)
            melspec = torch.from_numpy(np.load(filename).T)
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))

        return melspec

    def get_text(self, text):
        return text_to_sequence(text, self.text_cleaners)

    def __getitem__(self, index):
        return self.get_mel_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)
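A minimal wiring sketch for this dataset, assuming an args namespace carrying the fields read in __init__; the file list name and the padding collate function text_mel_collate are hypothetical, since items hold variable-length text and mel tensors that the default collation cannot batch:

from torch.utils.data import DataLoader

train_set = TextMelLoader(args, 'train_files.txt')   # hypothetical file list
loader = DataLoader(train_set,
                    batch_size=32,
                    shuffle=False,                   # __init__ already shuffles
                    collate_fn=text_mel_collate)     # hypothetical padding collate
for text, mel, text_len in loader:
    pass  # feed the padded batch to the model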
Example #2
def load_mel(path):
    stft = TacotronSTFT()
    audio, sampling_rate = load_wav_to_torch(path)
    if sampling_rate != stft.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio / 32768.0  # hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    melspec = stft.mel_spectrogram(audio_norm)
    #melspec = melspec.cuda()
    melspec = torch.squeeze(melspec, 0)
    return melspec
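A usage sketch, assuming a mono WAV recorded at the rate the default TacotronSTFT expects; the path is hypothetical:

mel = load_mel('samples/utterance.wav')  # hypothetical path
print(mel.shape)                         # (n_mel_channels, n_frames)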
Example #3
def get_mel_for_test(file_path):
    """
    Library:
        from common.layers import TacotronSTFT
    """
    audio, sr = load_wav_to_torch(file_path)
    audio_norm = audio / MAX_WAV_VALUE
    audio_norm = audio_norm.unsqueeze(0)
    stft = TacotronSTFT(filter_length=1024,
                        hop_length=160,
                        win_length=1024,
                        sampling_rate=sr,
                        mel_fmin=0.0,
                        mel_fmax=8000.0)
    melspec = stft.mel_spectrogram(audio_norm)
    return melspec
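Note that, unlike load_mel above, this helper never squeezes the batch dimension, so the result is shaped (1, n_mel_channels, n_frames). A hypothetical call with the squeeze a caller would typically add:

mel = get_mel_for_test('samples/utterance.wav')  # hypothetical path
mel = mel.squeeze(0)                             # (1, n_mels, T) -> (n_mels, T)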
Example #4
class Mel2Samp(torch.utils.data.Dataset):
    """
    This is the main class that calculates the spectrogram and returns the
    spectrogram, audio pair.
    """
    def __init__(self, segment_length, filter_length, hop_length, win_length,
                 sampling_rate, mel_fmin, mel_fmax, h5_melfile):
        self.stft = TacotronSTFT(filter_length=filter_length,
                                 hop_length=hop_length,
                                 win_length=win_length,
                                 sampling_rate=sampling_rate,
                                 mel_fmin=mel_fmin,
                                 mel_fmax=mel_fmax)
        self.segment_framelength = math.ceil(segment_length / hop_length)
        self.segment_length = self.segment_framelength * hop_length
        self.hop_length = hop_length
        self.sampling_rate = sampling_rate
        self.h5_melfile = h5_melfile
        self.h5_mel = None

    def get_mel(self, audio):
        audio_norm = audio / MAX_WAV_VALUE
        audio_norm = audio_norm.unsqueeze(0)
        melspec = self.stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)
        return melspec

    def __getitem__(self, index):
        if self.h5_mel is None:
            self.h5_mel = h5py.File(self.h5_melfile, "r")
        audio_gp = self.h5_mel[str(index)]["24k"]
        audio_start = random.randint(0,
                                     audio_gp.shape[0] - self.segment_length)
        audio = torch.FloatTensor(audio_gp[audio_start:audio_start +
                                           self.segment_length])
        mel = self.get_mel(audio)

        audio = audio / MAX_WAV_VALUE
        return (mel, audio)

    def __len__(self):
        # open the HDF5 file lazily, mirroring __getitem__; its top-level
        # groups are the dataset items, so its length is the dataset length
        if self.h5_mel is None:
            self.h5_mel = h5py.File(self.h5_melfile, "r")
        return len(self.h5_mel)
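The constructor rounds the requested segment length up to a whole number of hops, so every audio segment covers an integer number of mel frames. A quick check with hypothetical values:

import math

segment_length, hop_length = 16000, 256          # hypothetical values
frames = math.ceil(segment_length / hop_length)  # ceil(62.5) = 63 frames
segment_length = frames * hop_length             # rounded up to 63 * 256 = 16128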
Example #5
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_training_args(parser)
    args, _ = parser.parse_known_args()

    LOGGER.set_model_name("Tacotron2_PyT")
    LOGGER.set_backends([
        dllg.StdOutBackend(log_file=None,
                           logging_scope=dllg.TRAIN_ITER_SCOPE,
                           iteration_interval=1),
        dllg.JsonBackend(log_file=args.log_file,
                         logging_scope=dllg.TRAIN_ITER_SCOPE,
                         iteration_interval=1)
    ])
    LOGGER.register_metric("tacotron2_frames_per_sec",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("tacotron2_latency",
                           metric_scope=dllg.TRAIN_ITER_SCOPE)
    LOGGER.register_metric("latency", metric_scope=dllg.TRAIN_ITER_SCOPE)

    model, args = load_and_setup_model(parser, args)

    log_hardware()
    log_args(args)

    os.makedirs(args.output_dir, exist_ok=True)

    LOGGER.iteration_start()

    measurements = {}

    anchor_dirs = [
        os.path.join(args.dataset_path, anchor)
        for anchor in args.training_anchor_dirs
    ]
    metadatas = [load_metadata(anchor) for anchor in anchor_dirs]
    stft = TacotronSTFT(args.filter_length, args.hop_length, args.win_length,
                        args.n_mel_channels, args.sampling_rate, args.mel_fmin,
                        args.mel_fmax)
    with torch.no_grad(), MeasureTime(measurements, "tacotron2_time"):
        for speaker_id in range(len(anchor_dirs)):
            metadata = metadatas[speaker_id]
            for npy_path, text in tqdm(metadata):
                seq = text_to_sequence(text, speaker_id, ['basic_cleaners'])
                seqs = torch.from_numpy(np.stack(seq)).unsqueeze(0)
                seq_lens = torch.IntTensor([len(text)])
                wav = load_wav_to_torch(npy_path)
                mel = stft.mel_spectrogram(wav.unsqueeze(0))
                mel = mel.squeeze()
                max_target_len = mel.size(1) - 1
                max_target_len += args.n_frames_per_step - max_target_len % args.n_frames_per_step
                padded_mel = np.pad(mel, [(0, 0),
                                          (0, max_target_len - mel.size(1))],
                                    mode='constant',
                                    constant_values=args.mel_pad_val)
                target = padded_mel[:, ::args.n_frames_per_step]
                targets = torch.from_numpy(np.stack(target)).unsqueeze(0)
                target_lengths = torch.IntTensor([target.shape[1]])
                outputs = model.infer(
                    to_gpu(seqs).long(),
                    to_gpu(seq_lens).int(),
                    to_gpu(targets).half(),
                    to_gpu(target_lengths).int())
                _, mel_out, _, _ = [
                    output.cpu() for output in outputs if output is not None
                ]
                mel_out = mel_out.squeeze()[:, :mel.size(-1) - 1]
                assert (mel_out.shape[-1] == wav.shape[-1] // args.hop_length)
                fname = os.path.basename(npy_path)
                np.save(os.path.join(args.output_dir, fname),
                        mel_out,
                        allow_pickle=False)
                # GTA synthesis
                # magnitudes = stft.inv_mel_spectrogram(mel_out.squeeze())
                # wav = griffin_lim(magnitudes, stft.stft_fn, 60)
                # save_wav(wav, os.path.join(args.output_dir, 'eval.wav'))

    LOGGER.log(key="tacotron2_latency", value=measurements['tacotron2_time'])
    LOGGER.log(key="latency", value=(measurements['tacotron2_time']))
    LOGGER.iteration_stop()
    LOGGER.finish()
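The padding step inside the loop rounds the target length up to the next multiple of n_frames_per_step, and adds a whole extra step even when the length already divides evenly. With hypothetical numbers:

# mel has 100 frames, n_frames_per_step = 3 (hypothetical values)
max_target_len = 100 - 1                  # 99
max_target_len += 3 - max_target_len % 3  # 99 % 3 == 0, so 99 + 3 = 102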
Example #6
class TextMelDataset(torch.utils.data.Dataset):
    """
        1) loads audio,text pairs
        2) normalizes text and converts them to sequences of one-hot vectors
        3) computes mel-spectrograms from audio files.
    """
    def __init__(self, args, anchor_dirs):
        self.speaker_num = len(anchor_dirs)
        self.meta_dirs = [
            os.path.join(args.dataset_path, anchor_dirs[i])
            for i in range(self.speaker_num)
        ]
        self.metadatas = [
            load_metadata(meta_dir) for meta_dir in self.meta_dirs
        ]
        self.offsets = [0] * self.speaker_num
        self.text_cleaners = args.text_cleaners
        self.sampling_rate = args.sampling_rate
        self.load_mel_from_disk = args.load_mel_from_disk
        self.stft = TacotronSTFT(args.filter_length, args.hop_length,
                                 args.win_length, args.n_mel_channels,
                                 args.sampling_rate, args.mel_fmin,
                                 args.mel_fmax)
        random.seed(1234)
        for i in range(self.speaker_num):
            random.shuffle(self.metadatas[i])

    def get_mel_text_pair(self, speaker_id, metadata):
        mel_path, text = metadata
        seq_len = len(text)
        seq = self.get_sequence(text, speaker_id)
        mel = self.get_mel(mel_path)
        return (seq, mel, seq_len)

    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            audio = load_wav_to_torch(filename)
            melspec = self.stft.mel_spectrogram(audio.unsqueeze(0))
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))

        return melspec

    def get_sequence(self, text, speaker_id):
        return text_to_sequence(text, speaker_id, self.text_cleaners)

    def __getitem__(self, index):
        # `index` is ignored; each call draws the next sample per speaker round-robin
        group = [
            self.get_mel_text_pair(i, self.metadatas[i][self.offsets[i]])
            for i in range(self.speaker_num)
        ]
        self.offsets = [(self.offsets[i] + 1) % len(self.metadatas[i])
                        for i in range(self.speaker_num)]
        return group

    def __len__(self):
        return sum([len(m) for m in self.metadatas]) // self.speaker_num
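A usage sketch, assuming an args namespace with the fields read in __init__; the anchor directory names are hypothetical. Each item is a group holding one (seq, mel, seq_len) tuple per speaker:

dataset = TextMelDataset(args, ['speaker_a', 'speaker_b'])  # hypothetical dirs
group = dataset[0]                  # one (seq, mel, seq_len) tuple per speaker
assert len(group) == dataset.speaker_num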