示例#1
0
 def __init__(self, dataset_path, audiopaths_and_text, text_cleaners,
              n_mel_channels, symbol_set='english_basic', n_speakers=1,
              load_mel_from_disk=True, max_wav_value=None,
              sampling_rate=None, filter_length=None, hop_length=None,
              win_length=None, mel_fmin=None, mel_fmax=None, **ignored):
     """Load the file list and, when needed, prepare an STFT front end.

     Args:
         dataset_path: Forwarded to ``load_filepaths_and_text``.
         audiopaths_and_text: Forwarded to ``load_filepaths_and_text``.
         text_cleaners: Accepted; not used by the statements shown here.
         n_mel_channels: Passed to ``layers.TacotronSTFT``.
         symbol_set: Accepted; not used by the statements shown here.
         n_speakers: The file list is loaded with ``has_speakers=True``
             when this value is greater than 1.
         load_mel_from_disk: Stored on the instance. When falsy, the audio
             settings are stored and a ``layers.TacotronSTFT`` is built.
         max_wav_value: Stored only when ``load_mel_from_disk`` is falsy.
         sampling_rate: Stored and passed to ``layers.TacotronSTFT`` only
             when ``load_mel_from_disk`` is falsy.
         filter_length: Passed to ``layers.TacotronSTFT``.
         hop_length: Passed to ``layers.TacotronSTFT``.
         win_length: Passed to ``layers.TacotronSTFT``.
         mel_fmin: Passed to ``layers.TacotronSTFT``.
         mel_fmax: Passed to ``layers.TacotronSTFT``.
         **ignored: Any further keyword arguments are accepted and dropped.
     """
     multi_speaker = n_speakers > 1
     self.audiopaths_and_text = load_filepaths_and_text(
         dataset_path, audiopaths_and_text, has_speakers=multi_speaker)

     self.load_mel_from_disk = load_mel_from_disk
     if not load_mel_from_disk:
         # These attributes exist only on instances that do not read
         # precomputed mels from disk.
         self.max_wav_value = max_wav_value
         self.sampling_rate = sampling_rate
         stft_settings = (filter_length, hop_length, win_length,
                          n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
         self.stft = layers.TacotronSTFT(*stft_settings)
示例#2
0
 def __init__(self, dataset_path, audiopaths_and_text, args):
     """Load the file list, build the STFT object and shuffle the list.

     Args:
         dataset_path: Forwarded to ``load_filepaths_and_text``.
         audiopaths_and_text: Forwarded to ``load_filepaths_and_text``.
         args: Settings object. The attributes read here are
             ``max_wav_value``, ``sampling_rate``, ``filter_length``,
             ``hop_length``, ``win_length``, ``n_mel_channels``,
             ``mel_fmin``, ``mel_fmax`` and ``segment_length``.
     """
     self.audiopaths_and_text = load_filepaths_and_text(
         dataset_path, audiopaths_and_text)

     self.max_wav_value = args.max_wav_value
     self.sampling_rate = args.sampling_rate
     stft_settings = (
         args.filter_length, args.hop_length, args.win_length,
         args.n_mel_channels, args.sampling_rate, args.mel_fmin,
         args.mel_fmax,
     )
     self.stft = layers.TacotronSTFT(*stft_settings)
     self.segment_length = args.segment_length

     # Re-seeds the module-level generator with a fixed value, then
     # shuffles the loaded list in place.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
 def __init__(self, dataset_path, audiopaths_and_text, args, load_mel_from_disk=True):
     """Load the file list and, when needed, prepare an STFT front end.

     Args:
         dataset_path: Forwarded to ``load_filepaths_and_text``.
         audiopaths_and_text: Forwarded to ``load_filepaths_and_text``.
         args: Settings object. ``text_cleaners`` is always read; the
             audio/STFT attributes are read only when
             ``load_mel_from_disk`` is falsy.
         load_mel_from_disk: Stored on the instance. When falsy, the audio
             settings are stored and a ``layers.TacotronSTFT`` is built.
     """
     self.audiopaths_and_text = load_filepaths_and_text(
         dataset_path, audiopaths_and_text)
     self.text_cleaners = args.text_cleaners

     self.load_mel_from_disk = load_mel_from_disk
     if not load_mel_from_disk:
         # These attributes exist only on instances that do not read
         # precomputed mels from disk.
         self.max_wav_value = args.max_wav_value
         self.sampling_rate = args.sampling_rate
         stft_settings = (
             args.filter_length, args.hop_length, args.win_length,
             args.n_mel_channels, args.sampling_rate, args.mel_fmin,
             args.mel_fmax,
         )
         self.stft = layers.TacotronSTFT(*stft_settings)
示例#4
0
 def __init__(self, audiopaths_and_text, args):
     """Load the file list, build the STFT object and shuffle the list.

     Args:
         audiopaths_and_text: Forwarded to ``load_filepaths_and_text``.
         args: Settings object. The attributes read here are
             ``text_cleaners``, ``max_wav_value``, ``sampling_rate``,
             ``load_mel_from_disk``, ``filter_length``, ``hop_length``,
             ``win_length``, ``n_mel_channels``, ``mel_fmin`` and
             ``mel_fmax``.
     """
     self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)

     self.text_cleaners = args.text_cleaners
     self.max_wav_value = args.max_wav_value
     self.sampling_rate = args.sampling_rate
     self.load_mel_from_disk = args.load_mel_from_disk

     # The STFT object is built regardless of ``load_mel_from_disk``.
     stft_settings = (
         args.filter_length, args.hop_length, args.win_length,
         args.n_mel_channels, args.sampling_rate, args.mel_fmin,
         args.mel_fmax,
     )
     self.stft = layers.TacotronSTFT(*stft_settings)

     # Re-seeds the module-level generator with a fixed value, then
     # shuffles the loaded list in place.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
示例#5
0
 def __init__(self, dataset_path, audiopaths_and_text, segment_length,
              n_mel_channels, max_wav_value, sampling_rate, filter_length,
              hop_length, win_length, mel_fmin, mel_fmax, args):
     """Load the file list, build the STFT object and shuffle the list.

     Args:
         dataset_path: Forwarded to ``load_filepaths_and_text``.
         audiopaths_and_text: Forwarded to ``load_filepaths_and_text``.
         segment_length: Stored on the instance.
         n_mel_channels: Passed to ``layers.TacotronSTFT``.
         max_wav_value: Stored on the instance.
         sampling_rate: Stored and passed to ``layers.TacotronSTFT``.
         filter_length: Passed to ``layers.TacotronSTFT``.
         hop_length: Passed to ``layers.TacotronSTFT``.
         win_length: Passed to ``layers.TacotronSTFT``.
         mel_fmin: Passed to ``layers.TacotronSTFT``.
         mel_fmax: Passed to ``layers.TacotronSTFT``.
         args: Accepted; not used by this method.
     """
     self.audiopaths_and_text = load_filepaths_and_text(
         dataset_path, audiopaths_and_text)

     self.max_wav_value = max_wav_value
     self.sampling_rate = sampling_rate
     stft_settings = (filter_length, hop_length, win_length,
                      n_mel_channels, sampling_rate, mel_fmin, mel_fmax)
     self.stft = layers.TacotronSTFT(*stft_settings)
     self.segment_length = segment_length

     # Re-seeds the module-level generator with a fixed value, then
     # shuffles the loaded list in place.
     random.seed(1234)
     random.shuffle(self.audiopaths_and_text)
    def __init__(self, data_utterance_paths, hparams):
        """Data loader for the PPG->Mel task (with a third per-utterance item).

        Features are either unpickled from ``hparams.feats_cache_path`` or
        extracted utterance by utterance, and may then be pickled back to
        that same path.

        Args:
            data_utterance_paths: Passed to ``load_filepaths``; described
                upstream as a text file containing a list of file paths.
            hparams: The hyper-parameters. Attributes read here:
                ``max_wav_value``, ``sampling_rate``, ``is_full_ppg``,
                ``is_append_f0``, ``is_cache_feats``,
                ``load_feats_from_disk``, ``feats_cache_path``,
                ``ppg_subsampling_factor``, ``encoder_model_fpath``,
                ``filter_length``, ``hop_length``, ``win_length``,
                ``n_acoustic_feat_dims``, ``mel_fmin``, ``mel_fmax`` and
                ``seed``.

        Raises:
            ValueError: If ``is_cache_feats`` and ``load_feats_from_disk``
                are both truthy.
        """
        self.data_utterance_paths = load_filepaths(data_utterance_paths)

        # Mirror the relevant hyper-parameters on the instance.
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.is_full_ppg = hparams.is_full_ppg
        self.is_append_f0 = hparams.is_append_f0
        self.is_cache_feats = hparams.is_cache_feats
        self.load_feats_from_disk = hparams.load_feats_from_disk
        self.feats_cache_path = hparams.feats_cache_path
        self.ppg_subsampling_factor = hparams.ppg_subsampling_factor
        self.ppg_deps = DependenciesPPG()
        self.encoder_model_fpath = hparams.encoder_model_fpath

        # Loading from the cache and rewriting it in the same run is
        # rejected outright.
        if self.is_cache_feats and self.load_feats_from_disk:
            raise ValueError('If you are loading feats from the disk, do not '
                             'rewrite them back!')

        stft_settings = (
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_acoustic_feat_dims, hparams.sampling_rate,
            hparams.mel_fmin, hparams.mel_fmax,
        )
        self.stft = layers.TacotronSTFT(*stft_settings)

        # Seed the module-level generator, then shuffle the path list in
        # place.
        random.seed(hparams.seed)
        random.shuffle(self.data_utterance_paths)

        self.ppg_sequences = []
        self.acoustic_sequences = []
        self.dvec_sequences = []
        if not self.load_feats_from_disk:
            # Extract features for every utterance; only the first element
            # of each result is cast to float32.
            for utt_path in self.data_utterance_paths:
                extracted = self.extract_utterance_feats_spkr(
                    utt_path, self.is_full_ppg)
                self.ppg_sequences.append(extracted[0].astype(np.float32))
                self.acoustic_sequences.append(extracted[1])
                self.dvec_sequences.append(extracted[2])
        else:
            print('Loading data from %s.' % self.feats_cache_path)
            # NOTE(review): pickle.load can execute arbitrary code; only
            # point feats_cache_path at trusted files.
            with open(self.feats_cache_path, 'rb') as cache_file:
                cached = pickle.load(cache_file)
            self.ppg_sequences = cached[0]
            self.acoustic_sequences = cached[1]
            self.dvec_sequences = cached[2]

        if self.is_cache_feats:
            print('Caching data to %s.' % self.feats_cache_path)
            with open(self.feats_cache_path, 'wb') as cache_file:
                pickle.dump(
                    [self.ppg_sequences, self.acoustic_sequences,
                     self.dvec_sequences],
                    cache_file)
示例#7
0
    def __init__(self, data_utterance_paths, cache_path, hparams, bs, loop):
        """Data loader for the PPG->Mel task, filled incrementally via a cache.

        Each construction extracts features for one slice of ``self.b`` (5)
        utterances and writes the accumulated sequences to ``cache_path``.
        When ``int(bs) - 1`` is positive, the sequences already stored in
        ``cache_path`` are loaded first and extended.

        Args:
            data_utterance_paths: Passed to ``load_filepaths``; described
                upstream as a text file containing a list of file paths.
            cache_path: Pickle file that is read (when ``int(bs) - 1 > 0``)
                and always written at the end.
            hparams: The hyper-parameters. Attributes read here:
                ``sampling_rate``, ``filter_length``, ``hop_length``,
                ``win_length``, ``n_acoustic_feat_dims``, ``mel_fmin``,
                ``mel_fmax`` and ``seed``.
            bs: Converted with ``int``; ``int(bs) - 1`` is stored as
                ``self.n``. Presumably a 1-based batch index - confirm
                against the caller.
            loop: Converted with ``int``; ``int(loop) - 1`` is stored as
                ``self.l``. Presumably a 1-based loop index - confirm
                against the caller.
        """
        self.data_utterance_paths = load_filepaths(data_utterance_paths)

        # Fixed settings for this loader variant.
        self.max_wav_value = 32768.0
        self.sampling_rate = hparams.sampling_rate
        self.is_full_ppg = True
        self.is_append_f0 = False
        self.is_cache_feats = True
        self.feats_cache_path = cache_path
        self.ppg_subsampling_factor = 1
        self.ppg_deps = DependenciesPPG()

        # Original note: "20 data = n(4) * b(5)".
        self.n = int(bs) - 1
        self.b = 5
        self.l = int(loop) - 1

        stft_settings = (
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_acoustic_feat_dims, self.sampling_rate,
            hparams.mel_fmin, hparams.mel_fmax,
        )
        self.stft = layers.TacotronSTFT(*stft_settings)

        # Seed the module-level generator, then shuffle the path list in
        # place.
        random.seed(hparams.seed)
        random.shuffle(self.data_utterance_paths)

        if self.n <= 0:
            # Start with empty sequences instead of reading the cache.
            self.ppg_sequences = []
            self.acoustic_sequences = []
        else:
            # NOTE(review): pickle.load can execute arbitrary code; only
            # point cache_path at trusted files.
            with open(self.feats_cache_path, 'rb') as cache_file:
                cached = pickle.load(cache_file)
            self.ppg_sequences = cached[0]
            self.acoustic_sequences = cached[1]

        # Slice of self.b utterances; each loop step advances by the
        # hard-coded stride of 20.
        first = self.n * self.b + self.l * 20
        last = first + self.b
        for utt_path in self.data_utterance_paths[first:last]:
            extracted = self.extract_utterance_feats(utt_path,
                                                     self.is_full_ppg)
            self.ppg_sequences.append(extracted[0].astype(np.float32))
            self.acoustic_sequences.append(extracted[1])

        if self.is_cache_feats:
            print('Caching data to %s.' % self.feats_cache_path)
            with open(self.feats_cache_path, 'wb+') as cache_file:
                pickle.dump(
                    [self.ppg_sequences, self.acoustic_sequences],
                    cache_file)
示例#8
0
    def __init__(self, dataset_path, audiopaths_and_text, args, speaker_ids=None):
        """Load and shuffle the file list, build the STFT, set speaker ids.

        Args:
            dataset_path: Forwarded to ``load_filepaths_and_text``.
            audiopaths_and_text: Forwarded to ``load_filepaths_and_text``.
            args: Settings object. The attributes read here are
                ``text_cleaners``, ``max_wav_value``, ``sampling_rate``,
                ``load_mel_from_disk``, ``filter_length``, ``hop_length``,
                ``win_length``, ``n_mel_channels``, ``mel_fmin`` and
                ``mel_fmax``.
            speaker_ids: Stored as ``self.speaker_ids``. When ``None``, the
                value is replaced by the result of
                ``self.create_speaker_lookup_table`` applied to the
                shuffled file list.
        """
        self.audiopaths_and_text = load_filepaths_and_text(
            dataset_path, audiopaths_and_text)

        self.text_cleaners = args.text_cleaners
        self.max_wav_value = args.max_wav_value
        self.sampling_rate = args.sampling_rate
        self.load_mel_from_disk = args.load_mel_from_disk

        stft_settings = (
            args.filter_length, args.hop_length, args.win_length,
            args.n_mel_channels, args.sampling_rate, args.mel_fmin,
            args.mel_fmax,
        )
        self.stft = layers.TacotronSTFT(*stft_settings)

        # Re-seeds the module-level generator with a fixed value, then
        # shuffles the loaded list in place.
        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)

        build_lookup = speaker_ids is None
        self.speaker_ids = speaker_ids
        if build_lookup:
            # No table supplied by the caller: derive one from the list.
            self.speaker_ids = self.create_speaker_lookup_table(
                self.audiopaths_and_text)
示例#9
0
    def __init__(self, data_utterance_paths, hparams):
        """Data loader for the PPG->Mel task, reading precomputed features.

        Args:
            data_utterance_paths: Stored as-is. When
                ``hparams.load_feats_from_disk`` is truthy it is opened in
                binary mode and unpickled, so it must be the path of a
                pickle file whose first two items are the PPG sequences
                and the acoustic sequences.
            hparams: The hyper-parameters. Attributes read here:
                ``load_feats_from_disk``, ``max_wav_value``,
                ``sampling_rate``, ``is_full_ppg``, ``is_append_f0``,
                ``ppg_subsampling_factor``, ``filter_length``,
                ``hop_length``, ``win_length``, ``n_acoustic_feat_dims``,
                ``mel_fmin``, ``mel_fmax`` and ``seed``.
        """
        self.data_utterance_paths = data_utterance_paths
        self.load_feats_from_disk = hparams.load_feats_from_disk

        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate

        self.is_full_ppg = hparams.is_full_ppg
        self.is_append_f0 = hparams.is_append_f0
        self.ppg_subsampling_factor = hparams.ppg_subsampling_factor
        self.ppg_deps = DependenciesPPG()

        stft_settings = (
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_acoustic_feat_dims, hparams.sampling_rate,
            hparams.mel_fmin, hparams.mel_fmax,
        )
        self.stft = layers.TacotronSTFT(*stft_settings)
        random.seed(hparams.seed)

        # Load the data pairs from disk.
        self.ppg_sequences = []
        self.acoustic_sequences = []
        if not self.load_feats_from_disk:
            # Nothing is extracted here; both sequence lists stay empty.
            print("No data!")
        else:
            # NOTE(review): pickle.load can execute arbitrary code; only
            # pass trusted files as data_utterance_paths.
            with open(self.data_utterance_paths, 'rb') as feats_file:
                cached = pickle.load(feats_file)
            self.ppg_sequences = cached[0]
            self.acoustic_sequences = cached[1]
            print("Number of data:", len(self.ppg_sequences))
示例#10
0
    def __init__(
            self,
            dataset_path,
            audiopaths_and_text,
            text_cleaners,
            n_mel_channels,
            symbol_set='english_basic',
            p_arpabet=1.0,
            n_speakers=1,
            load_mel_from_disk=True,
            load_pitch_from_disk=True,
            pitch_mean=214.72203,  # LJSpeech defaults
            pitch_std=65.72038,
            max_wav_value=None,
            sampling_rate=None,
            filter_length=None,
            hop_length=None,
            win_length=None,
            mel_fmin=None,
            mel_fmax=None,
            prepend_space_to_text=False,
            append_space_to_text=False,
            pitch_online_dir=None,
            betabinomial_online_dir=None,
            use_betabinomial_interpolator=True,
            pitch_online_method='pyin',
            **ignored):
        """Load the file list and configure text, mel and pitch handling.

        Args:
            dataset_path: Stored and forwarded to
                ``load_filepaths_and_text``.
            audiopaths_and_text: A list of file names, or a single ``str``
                which is wrapped into a one-element list.
            text_cleaners: Passed to ``TextProcessing``.
            n_mel_channels: Passed to ``layers.TacotronSTFT``.
            symbol_set: Passed to ``TextProcessing``.
            p_arpabet: Must equal 0.0 or 1.0; passed to ``TextProcessing``.
            n_speakers: Stored. The file list is loaded with
                ``has_speakers=True`` when this is greater than 1, and one
                more column is then expected.
            load_mel_from_disk: Stored. When falsy, the audio settings are
                stored and a ``layers.TacotronSTFT`` is built.
            load_pitch_from_disk: Stored. When truthy, one more column is
                expected and ``pitch_online_dir`` must be ``None``.
            pitch_mean: Stored as ``self.pitch_mean``; an exact ``float``
                is first wrapped in a one-element ``torch.Tensor``.
            pitch_std: Stored as ``self.pitch_std``; handled like
                ``pitch_mean``.
            max_wav_value: Stored only when ``load_mel_from_disk`` is falsy.
            sampling_rate: Stored and passed to ``layers.TacotronSTFT``
                only when ``load_mel_from_disk`` is falsy.
            filter_length: Passed to ``layers.TacotronSTFT``.
            hop_length: Passed to ``layers.TacotronSTFT``.
            win_length: Passed to ``layers.TacotronSTFT``.
            mel_fmin: Passed to ``layers.TacotronSTFT``.
            mel_fmax: Passed to ``layers.TacotronSTFT``.
            prepend_space_to_text: Stored on the instance.
            append_space_to_text: Stored on the instance.
            pitch_online_dir: Stored as ``self.pitch_tmp_dir``.
            betabinomial_online_dir: Stored as
                ``self.betabinomial_tmp_dir``.
            use_betabinomial_interpolator: Stored. When truthy, a
                ``BetaBinomialInterpolator`` is created.
            pitch_online_method: Stored as ``self.f0_method``.
            **ignored: Any further keyword arguments are accepted and
                dropped.

        Raises:
            AssertionError: If ``p_arpabet`` is neither 0.0 nor 1.0, or if
                ``load_pitch_from_disk`` is truthy while
                ``pitch_online_dir`` is not ``None``.
            ValueError: If the first loaded row has fewer columns than
                expected.
        """
        # Expect a list of filenames; a bare string is a single file name.
        if type(audiopaths_and_text) is str:
            audiopaths_and_text = [audiopaths_and_text]

        self.dataset_path = dataset_path
        multi_speaker = n_speakers > 1
        self.audiopaths_and_text = load_filepaths_and_text(
            dataset_path, audiopaths_and_text, has_speakers=multi_speaker)

        self.load_mel_from_disk = load_mel_from_disk
        if not load_mel_from_disk:
            # These attributes exist only on instances that do not read
            # precomputed mels from disk.
            self.max_wav_value = max_wav_value
            self.sampling_rate = sampling_rate
            stft_settings = (filter_length, hop_length, win_length,
                             n_mel_channels, sampling_rate, mel_fmin,
                             mel_fmax)
            self.stft = layers.TacotronSTFT(*stft_settings)
        self.load_pitch_from_disk = load_pitch_from_disk

        self.prepend_space_to_text = prepend_space_to_text
        self.append_space_to_text = append_space_to_text

        assert p_arpabet == 0.0 or p_arpabet == 1.0, (
            'Only 0.0 and 1.0 p_arpabet is currently supported. '
            'Variable probability breaks caching of betabinomial matrices.')

        self.tp = TextProcessing(symbol_set, text_cleaners,
                                 p_arpabet=p_arpabet)
        self.n_speakers = n_speakers
        self.pitch_tmp_dir = pitch_online_dir
        self.f0_method = pitch_online_method
        self.betabinomial_tmp_dir = betabinomial_online_dir
        self.use_betabinomial_interpolator = use_betabinomial_interpolator

        if use_betabinomial_interpolator:
            self.betabinomial_interpolator = BetaBinomialInterpolator()

        # Two base columns, plus one for pitch read from disk and one for
        # the speaker id.
        expected_columns = (2 + int(load_pitch_from_disk) + (n_speakers > 1))

        assert not (load_pitch_from_disk and self.pitch_tmp_dir is not None)

        # Only the first row is inspected.
        found_columns = len(self.audiopaths_and_text[0])
        if found_columns < expected_columns:
            raise ValueError(
                f'Expected {expected_columns} columns in audiopaths file. '
                'The format is <mel_or_wav>|[<pitch>|]<text>[|<speaker_id>]')
        elif found_columns > expected_columns:
            print('WARNING: Audiopaths file has more columns than expected')

        def _wrap_float(value):
            # Exact ``float`` instances become one-element tensors; every
            # other type (float subclasses included) passes through.
            if type(value) is float:
                return torch.Tensor([value])
            return value

        self.pitch_mean = _wrap_float(pitch_mean)
        self.pitch_std = _wrap_float(pitch_std)