def __init__(self, audiopaths_and_text, hparams, speaker_ids=None, mode='train'):
    """Load the metadata entries and copy the needed hyper-parameters.

    Args:
        audiopaths_and_text: Metadata source handed to the loader helpers
            (called with a tab as the ``split`` argument). Outside of
            training, a value that is not a ``str``/``Path`` naming an
            existing file is replaced by a fixed placeholder list.
        hparams: Hyper-parameter object; kept on the instance and read
            for the attributes assigned below.
        speaker_ids: Optional ready-made speaker lookup table. When it is
            ``None`` the table is built with
            ``self.create_speaker_lookup_table`` from the loaded entries.
        mode: ``'train'`` or ``'train-<tag>'`` selects the training
            loader; any other value selects the plain loader.

    After construction ``self.mode`` is ``<tag>`` for ``'train-<tag>'``,
    ``True`` for every other mode starting with ``'train'``, and ``False``
    otherwise.
    """
    self.hparams = hparams

    mode_parts = mode.split('-')
    if mode_parts[0] == 'train':
        self.audiopaths_and_text = load_filepaths_and_text_train(
            audiopaths_and_text, split='\t')
        # Only a single '-' yields a tag; 'train' alone (or a mode with
        # several dashes) is treated as plain training.
        self.mode = mode_parts[1] if len(mode_parts) == 2 else True
    else:
        has_metadata_file = (
            isinstance(audiopaths_and_text, (str, Path))
            and os.path.isfile(audiopaths_and_text)
        )
        if has_metadata_file:
            self.audiopaths_and_text = load_filepaths_and_text(
                audiopaths_and_text, split='\t')
        else:
            # NOTE(review): this placeholder is a flat list of three
            # strings, so its length is 3 and self.ids below becomes
            # {0, 1, 2}. Confirm that a single nested entry was not
            # what was intended.
            self.audiopaths_and_text = ['audiopath', 'text', 'speaker']
        self.mode = False

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.stft = layers.TacotronSTFT(
        hparams.filter_length,
        hparams.hop_length,
        hparams.win_length,
        hparams.n_mel_channels,
        hparams.sampling_rate,
        hparams.mel_fmin,
        hparams.mel_fmax,
    )
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length
    self.f0_min = hparams.f0_min
    self.f0_max = hparams.f0_max
    self.harm_thresh = hparams.harm_thresh
    self.p_arpabet = hparams.p_arpabet
    self.max_decoder_steps = hparams.max_decoder_steps
    self.f0_dim = hparams.prenet_f0_dim  # f0 dimension setting
    self.encoder_model_fpath = hparams.encoder_model_fpath

    # The pronunciation dictionary is optional.
    self.cmudict = None
    if hparams.cmudict_path is not None:
        self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

    self.speaker_ids = speaker_ids
    if self.speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)

    # The entry list is not shuffled here; keep the set of entry indices.
    self.ids = set(range(len(self.audiopaths_and_text)))
def __init__(self, audiopaths_and_text, hparams, speaker_ids=None, mode='train'):
    """Load the metadata entries and copy the needed hyper-parameters.

    Args:
        audiopaths_and_text: Metadata source handed to the loader helpers
            (called with a tab as the ``split`` argument).
        hparams: Hyper-parameter object; kept on the instance and read
            for the attributes assigned below.
        speaker_ids: Optional ready-made speaker lookup table. When it is
            ``None`` the table is built with
            ``self.create_speaker_lookup_table`` from the loaded entries.
        mode: ``'train'`` or ``'train-<tag>'`` selects the training
            loader; any other value selects the plain loader.

    After construction ``self.mode`` is ``<tag>`` for ``'train-<tag>'``,
    ``True`` for every other mode starting with ``'train'``, and ``False``
    otherwise.
    """
    self.hparams = hparams

    mode_parts = mode.split('-')
    if mode_parts[0] == 'train':
        self.audiopaths_and_text = load_filepaths_and_text_train(
            audiopaths_and_text, split='\t')
        # Only a single '-' yields a tag; 'train' alone (or a mode with
        # several dashes) is treated as plain training.
        self.mode = mode_parts[1] if len(mode_parts) == 2 else True
    else:
        self.audiopaths_and_text = load_filepaths_and_text(
            audiopaths_and_text, split='\t')
        self.mode = False

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.stft = layers.TacotronSTFT(
        hparams.filter_length,
        hparams.hop_length,
        hparams.win_length,
        hparams.n_mel_channels,
        hparams.sampling_rate,
        hparams.mel_fmin,
        hparams.mel_fmax,
    )
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length
    self.f0_min = hparams.f0_min
    self.f0_max = hparams.f0_max
    self.harm_thresh = hparams.harm_thresh
    self.p_arpabet = hparams.p_arpabet
    self.f0_dim = hparams.prenet_f0_dim  # f0 dimension setting

    # The pronunciation dictionary is optional.
    self.cmudict = None
    if hparams.cmudict_path is not None:
        self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

    self.speaker_ids = speaker_ids
    if self.speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)