def create_mels(training_filelist, validation_filelist, threads):
    """Convert every audio file referenced by the filelists to a mel spectrogram.

    Collects the unique audio paths from the training and validation
    filelists and hands them to the multiprocessing helper.

    Args:
        training_filelist: path to the training filelist (audiopath|text...).
        validation_filelist: path to the validation filelist.
        threads: number of worker processes to use.
    """
    # Union across BOTH filelists so a path shared by train and validation
    # is only converted once (the original only deduplicated per-list, and
    # carried an unused `import glob`).
    audiopaths = {x[0] for x in load_filepaths_and_text(training_filelist)}
    audiopaths |= {x[0] for x in load_filepaths_and_text(validation_filelist)}
    audiopaths = list(audiopaths)
    print(str(len(audiopaths)) + " files being converted to mels")
    multiprocess_arr(multiprocess_gen_mels, audiopaths, threads=threads)
def create_mels(training_filelist, validation_filelist, threads):
    """Convert all audio files listed in the train/validation filelists to mels.

    Args:
        training_filelist: path to the training filelist (audiopath|text...).
        validation_filelist: path to the validation filelist.
        threads: number of worker processes to use.
    """
    # Removed an unused `import glob` and a commented-out hard-coded glob path.
    audiopaths = [x[0] for x in load_filepaths_and_text(training_filelist)]
    audiopaths += [x[0] for x in load_filepaths_and_text(validation_filelist)]
    print(str(len(audiopaths)) + " files being converted to mels")
    multiprocess_arr(multiprocess_gen_mels, audiopaths, threads=threads)
def __init__(self, audiopaths_and_text, polyphone_dict_file, mask_dict_file, hparams):
    """Dataset init: load (audiopath, text) pairs and Mandarin G2P resources.

    Args:
        audiopaths_and_text: filelist path of audiopath|text entries.
        polyphone_dict_file: unused here (kept for interface compatibility).
        mask_dict_file: unused here (kept for interface compatibility).
        hparams: hyper-parameter object (audio / text / seed settings).
    """
    # Removed an unused local (`num_classes`) and dead commented-out loaders
    # for the polyphone/mask dictionaries.
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    # STFT used to compute mel spectrograms on the fly.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    # Mapping from polyphone class label to index (JSON, UTF-8).
    with codecs.open(hparams.class2idx, 'r', 'utf-8') as usernames:
        self.class2idx = json.load(usernames)
    print("num classes: {}".format(len(self.class2idx)))
    # Merged CEDICT pronunciation dictionary (JSON, UTF-8).
    with codecs.open(hparams.merge_cedict, 'r', 'utf-8') as usernames:
        self.merge_cedict = json.load(usernames)
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams, speaker_ids=None, mode='train'):
    """Dataset init supporting tab-separated train or eval filelists.

    Args:
        audiopaths_and_text: filelist path (audiopath<TAB>text entries).
        hparams: hyper-parameter object.
        speaker_ids: optional precomputed speaker lookup table; built from
            the filelist when None.
        mode: 'train' uses the train-specific loader; anything else the
            plain loader. self.mode records this choice as a boolean.
    """
    if mode == 'train':
        self.audiopaths_and_text = load_filepaths_and_text_train(
            audiopaths_and_text, split='\t')
        self.mode = True
    else:
        self.audiopaths_and_text = load_filepaths_and_text(
            audiopaths_and_text, split='\t')
        self.mode = False
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    # NOTE: the original assigned sampling_rate a second time here; removed.
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length
    self.f0_min = hparams.f0_min
    self.f0_max = hparams.f0_max
    self.harm_thresh = hparams.harm_thresh
    self.p_arpabet = hparams.p_arpabet
    self.cmudict = None
    if hparams.cmudict_path is not None:
        self.cmudict = cmudict.CMUDict(hparams.cmudict_path)
    self.speaker_ids = speaker_ids
    if self.speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams, max_len=40):
    """Dataset init with hard-coded mel normalization statistics.

    Args:
        audiopaths_and_text: filelist path of audiopath|text entries.
        hparams: hyper-parameter object (audio / text settings).
        max_len: maximum sequence length kept as self._max_len.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    # STFT used to compute mel spectrograms on the fly.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
    self._max_len = max_len
    self._epoch = 0
    # self._normalize()
    # Per-channel mel mean/std constants, shaped (n_channels, 1).
    # Presumably precomputed over the training set by _normalize() —
    # TODO confirm with the author before regenerating.
    self._mean = torch.tensor([[-7.0222], [-6.1906], [-5.1736], [-4.2412],
        [-3.7652], [-3.6533], [-3.6642], [-3.7249], [-3.7714], [-3.7709],
        [-3.6496], [-3.5707], [-3.5742], [-3.6369], [-3.7370], [-3.9888],
        [-4.1180], [-4.1938], [-4.3030], [-4.4620], [-4.6258], [-4.7973],
        [-5.0267], [-5.0906], [-5.1643], [-5.1518], [-5.2571], [-5.2868],
        [-5.3991], [-5.4988], [-5.5740], [-5.7033], [-5.7849], [-5.8197],
        [-5.9224], [-5.8171], [-5.7680], [-5.6486], [-5.5940], [-5.5730],
        [-5.5224], [-5.4793], [-5.5243], [-5.6329], [-5.7697], [-5.8886],
        [-5.9992], [-6.0405], [-6.0295], [-5.9937], [-5.9651], [-5.8888],
        [-5.8137], [-5.7405], [-5.7429], [-5.8212], [-5.8967], [-5.9552],
        [-5.9658], [-5.9283], [-5.9219], [-5.9360], [-5.9943], [-6.0838],
        [-6.1482], [-6.2169], [-6.2732], [-6.3252], [-6.4438], [-6.6830],
        [-6.9697], [-7.1962], [-7.3519], [-7.3759], [-7.3302], [-7.1762],
        [-6.9551], [-6.7458], [-6.6292], [-6.5967]]).float()
    self._std = torch.tensor([[0.9304], [0.7729], [1.0068], [1.5478],
        [1.8270], [1.7940], [1.6933], [1.7043], [1.8344], [1.8844],
        [1.8506], [1.7672], [1.7807], [1.7977], [1.7882], [1.7599],
        [1.7680], [1.7909], [1.7831], [1.7588], [1.7445], [1.7822],
        [1.7940], [1.7761], [1.7961], [1.7989], [1.7818], [1.7519],
        [1.7466], [1.7335], [1.7068], [1.7336], [1.7537], [1.7538],
        [1.7427], [1.7253], [1.7055], [1.7193], [1.7359], [1.7460],
        [1.7527], [1.7514], [1.7380], [1.7031], [1.6757], [1.6612],
        [1.6603], [1.6675], [1.7022], [1.7513], [1.7748], [1.7932],
        [1.7957], [1.8250], [1.8481], [1.8137], [1.7564], [1.7130],
        [1.7024], [1.7243], [1.7348], [1.7485], [1.7810], [1.8169],
        [1.8318], [1.8312], [1.8427], [1.8756], [1.9143], [1.9503],
        [2.0072], [2.0761], [2.1519], [2.1848], [2.1574], [2.1386],
        [2.1442], [2.1601], [2.1547], [2.1208]]).float()
def save_checkpoint(model, optimizer, learning_rate, iteration, hparams,
                    best_validation_loss, average_loss, speaker_id_lookup,
                    filepath):
    """Serialize model/optimizer state plus training bookkeeping to *filepath*."""
    from utils import load_filepaths_and_text
    tqdm.write("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))
    # Build a speaker-name -> internal-ID map from the speaker list file,
    # keeping only speakers present in speaker_id_lookup.
    speakerlist = load_filepaths_and_text(hparams.speakerlist)
    speaker_name_lookup = {}
    for entry in speakerlist:
        if entry[2] in speaker_id_lookup.keys():
            speaker_name_lookup[entry[1]] = speaker_id_lookup[entry[2]]
    checkpoint = {
        'iteration': iteration,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'learning_rate': learning_rate,
        'hparams': hparams,
        'speaker_id_lookup': speaker_id_lookup,
        'speaker_name_lookup': speaker_name_lookup,
        'best_validation_loss': best_validation_loss,
        'average_loss': average_loss,
    }
    torch.save(checkpoint, filepath)
    tqdm.write("Saving Complete")
def __init__(self, audiopaths_and_text, hparams, speaker_ids=None):
    """Dataset init with F0 and speaker-ID support.

    Args:
        audiopaths_and_text: filelist path of audiopath|text entries.
        hparams: hyper-parameter object.
        speaker_ids: optional precomputed speaker lookup table; built from
            the filelist when None.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    # NOTE: the original assigned sampling_rate a second time here; removed.
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length
    self.f0_min = hparams.f0_min
    self.f0_max = hparams.f0_max
    self.harm_thresh = hparams.harm_thresh
    self.p_arpabet = hparams.p_arpabet
    self.cmudict = None
    if hparams.cmudict_path is not None:
        self.cmudict = cmudict.CMUDict(hparams.cmudict_path)
    self.speaker_ids = speaker_ids
    if speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init with an optional CMU pronouncing dictionary.

    Args:
        audiopaths_and_text: filelist path of audiopath|text entries.
        hparams: hyper-parameter object (audio, seed, cmudict settings).

    Raises:
        Exception: if hparams.use_cmudict is set but the dictionary file
            does not exist on disk.
    """
    self.hparams = hparams
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
    if hparams.use_cmudict:
        # BUGFIX: the original referenced the undefined bare name
        # `cmudict_path` (NameError at runtime); use hparams.cmudict_path.
        if not os.path.isfile(hparams.cmudict_path):
            raise Exception(
                'If use_cmudict=True, you must download ' +
                'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                % hparams.cmudict_path)
        # Ambiguous entries are only kept when CMUDict is always applied.
        self._cmudict = cmudict.CMUDict(
            str(hparams.cmudict_path),
            keep_ambiguous=(hparams.p_cmudict == 1.0))
        print('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
        self._cmudict = None
def __init__(self, dataset, experiment, hparams, load_durations):
    """Dataset init: loads the filelist, configures mel extraction, and
    pre-fills an in-memory phoneme cache before worker processes fork.

    Args:
        dataset: filelist source handed to load_filepaths_and_text.
        experiment: experiment object providing output paths.
        hparams: hyper-parameters (audio, text analysis, preprocessing).
        load_durations: whether duration files will be read from disk.
    """
    self.experiment = experiment
    self.audiopaths_and_text = load_filepaths_and_text(dataset, experiment, hparams)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.hparams = hparams
    self.load_durations = load_durations
    # Duration files live under the experiment's acoustic-feature dir.
    self.durations_dir = os.path.join(experiment.paths["acoustic_features"], "dur")
    if hparams.preprocessing_type == "vocalid":
        # vocalid preprocessing is never on the fly
        self.load_mel_from_disk = True
    else:
        # STFT for on-the-fly mel extraction (no self.stft in vocalid mode).
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
    #TODO: will go to preprocessing
    self.textanalyzer = TextAnalyzer(use_phones=hparams.use_phonemes,
                                     g2p_backend=hparams.g2p_backend,
                                     language=hparams.language)
    self._phone_cache_dir = os.path.join(experiment.paths["acoustic_features"], "utt")
    self._hparams = hparams
    print(f"Creating new in-memory phone cache")
    self._phoneme_cache = {}
    os.makedirs(self._phone_cache_dir, exist_ok=True)
    # fill phoneme cache first time before multiprocessing clones this data
    # (dummy_mel=True presumably skips the expensive mel computation —
    # TODO confirm against get_mel_text_pair).
    for paths in self.audiopaths_and_text:
        self.get_mel_text_pair(paths, dummy_mel=True)
    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, melpaths_and_text, hparams):
    """Load (melpath, text) pairs and an English BERT tokenizer, then shuffle."""
    self.melpaths_and_text = load_filepaths_and_text(melpaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    # Uncased English BERT tokenizer used for the text inputs.
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # Deterministic shuffle driven by the configured seed.
    random.seed(hparams.seed)
    random.shuffle(self.melpaths_and_text)
def __init__(self, audiopaths_and_text, hparams, check_files=True, TBPTT=True,
             shuffle=False, speaker_ids=None, audio_offset=0, verbose=False):
    """Truncated-BPTT dataset init: loads and optionally validates the
    filelist, builds speaker lookups, oversamples "SlicedDialogue" entries,
    and precomputes per-file lengths for TBPTT batching.

    Args:
        audiopaths_and_text: filelist path of audiopath|text entries.
        hparams: hyper-parameter object (audio, batching, TBPTT settings).
        check_files: run checkdataset() over the filelist.
        TBPTT: unused here; the TBPTT branch is driven by hparams.use_TBPTT.
        shuffle: stored flag (shuffling below is unconditional).
        speaker_ids: optional precomputed speaker lookup; also switches
            batch_size to hparams.val_batch_size when provided.
        audio_offset: stored sample offset applied when loading audio.
        verbose: forwarded to checkdataset().
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.truncated_length = hparams.truncated_length
    self.batch_size = hparams.batch_size
    self.speaker_ids = speaker_ids
    self.audio_offset = audio_offset
    self.shuffle = shuffle
    if speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text)
    # torchMoji features are only loaded when both flags are set.
    self.load_torchmoji = hparams.torchMoji_training and hparams.torchMoji_linear
    # ---------- CHECK FILES --------------
    self.start_token = hparams.start_token
    self.stop_token = hparams.stop_token
    if check_files:
        self.checkdataset(verbose)
    # -------------- CHECK FILES --------------
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    self.sampling_rate = hparams.sampling_rate
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length
    # Apply weighting to MLP Datasets: entries whose path contains
    # "SlicedDialogue" are duplicated 3 extra times (4x total weight).
    duplicated_audiopaths = [x for x in self.audiopaths_and_text if "SlicedDialogue" in x[0]]
    for i in range(3):
        self.audiopaths_and_text.extend(duplicated_audiopaths)
    # SHUFFLE audiopaths
    random.seed(hparams.seed)
    self.random_seed = hparams.seed
    random.shuffle(self.audiopaths_and_text)
    # speaker_ids is None implies training; otherwise use the eval batch size.
    self.batch_size = hparams.batch_size if speaker_ids is None else hparams.val_batch_size
    n_gpus = hparams.n_gpus
    self.rank = hparams.rank
    self.total_batch_size = self.batch_size * n_gpus  # number of audio files being processed together
    self.truncated_length = hparams.truncated_length  # frames
    # -------------- PREDICT LENGTH (TBPTT) --------------
    if hparams.use_TBPTT:
        self.audio_lengths = torch.tensor([self.get_mel(x[0]).shape[1] for x in self.audiopaths_and_text])  # get the length of every file (the long way)
    else:
        self.audio_lengths = torch.tensor([self.truncated_length-1 for x in self.audiopaths_and_text])  # use dummy lengths
    self.update_dataloader_indexes()
def __init__(self, melpaths_and_text, hparams):
    """Load mel/text pairs, the polyphone class map, and a Chinese BERT
    tokenizer, then shuffle deterministically."""
    self.melpaths_and_text = load_filepaths_and_text(melpaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    # Polyphone class label -> index mapping (JSON, UTF-8).
    with codecs.open(hparams.class2idx, 'r', 'utf-8') as fh:
        self.class2idx = json.load(fh)
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    random.seed(hparams.seed)
    random.shuffle(self.melpaths_and_text)
def __init__(self, hparams, is_train=True, combine_ratio=0.5):
    """Build a combined dataset from two filelists.

    Filelist lines are `file_path|text`. data_1 is used in full; data_2 is
    shuffled and truncated so that data_1 makes up *combine_ratio* of the
    final mix.

    Args:
        hparams: hyper-parameters (filelist paths, audio/text settings).
        is_train: select training vs validation filelists.
        combine_ratio: fraction of the final data drawn from filelist 1.
    """
    # Removed dead initial assignments (`data_1 = []`, `file_name_1 = None`,
    # etc.) that were unconditionally overwritten below.
    if is_train:
        file_name_1 = hparams.training_files_1
        file_name_2 = hparams.training_files_2
    else:
        file_name_1 = hparams.validate_files_1
        file_name_2 = hparams.validate_files_2
    data_1 = load_filepaths_and_text(file_name_1)
    data_2 = load_filepaths_and_text(file_name_2)
    shuffle(data_2)
    # Size data_2 so data_1 : data_2 == combine_ratio : (1 - combine_ratio).
    len_data_2 = int((1 - combine_ratio) * len(data_1) / combine_ratio)
    len_data_2 = min(len(data_2), len_data_2)
    data_2 = data_2[:len_data_2]
    self.audiopaths_and_text = data_1
    self.audiopaths_and_text.extend(data_2)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
    self.hparams = hparams
def update_tt(self, tacotron_name):
    """Switch the active Tacotron model to *tacotron_name* and refresh lookups."""
    model_path = self.conf['TTM']['models'][tacotron_name]['modelpath']
    (self.model, self.ttm_hparams,
     self.ttm_sp_name_lookup, self.ttm_sp_id_lookup) = self.load_tacotron2(model_path)
    self.ttm_current = tacotron_name
    # Optionally replace the name lookup with one read from a speaker-ids file.
    if self.conf['TTM']['use_speaker_ids_file_override']:
        override = {}
        for _, name, ext_id in load_filepaths_and_text(
                self.conf['TTM']['speaker_ids_file']):
            override[name] = self.ttm_sp_id_lookup[int(ext_id)]
        self.ttm_sp_name_lookup = override
def __init__(self, audiopaths_and_text, hparams):
    """Load (audiopath, text) entries, set up the STFT, and shuffle."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    # STFT for on-the-fly mel computation when mels are not read from disk.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init that validates the filelist before use.

    Drops entries whose file type mismatches load_mel_from_disk, whose file
    is missing, whose text is empty, or whose stored mel has zero length;
    prints informational warnings for very short text and missing ending
    punctuation.

    Args:
        audiopaths_and_text: filelist path of audiopath|text entries.
        hparams: hyper-parameter object (audio / text settings).
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    # ---- Dataset sanity checks ------------------------------------------
    # Walk the list with an explicit index so bad entries can be deleted in
    # place; the index only advances when the current entry is kept.
    # (The original used list.remove() plus manual offset bookkeeping,
    # which removes the FIRST equal element and misbehaves when the
    # filelist contains duplicate rows.)
    i = 0
    while i < len(self.audiopaths_and_text):
        file = self.audiopaths_and_text[i]
        if self.load_mel_from_disk and '.wav' in file[0]:
            print(".wav file", file[0], "\n[warning] in filelist while expecting '.npy' . Being Ignored.")
            del self.audiopaths_and_text[i]
            continue
        elif not self.load_mel_from_disk and '.npy' in file[0]:
            print(".npy file", file[0], "\n[warning] in filelist while expecting '.wav' . Being Ignored.")
            del self.audiopaths_and_text[i]
            continue
        if not os.path.exists(file[0]):
            print("|".join(file), "\n[warning] does not exist and has been ignored")
            del self.audiopaths_and_text[i]
            continue
        if not len(file[1]):
            print("|".join(file), "\n[warning] has no text and has been ignored.")
            del self.audiopaths_and_text[i]
            continue
        if len(file[1]) < 3:
            print("|".join(file), "\n[info] has no/very little text.")
        if not ((file[1].strip())[-1] in r"!?,.;:"):
            print("|".join(file), "\n[info] has no ending punctuation.")
        if self.load_mel_from_disk:
            melspec = torch.from_numpy(np.load(file[0], allow_pickle=True))
            if melspec.shape[1] == 0:
                print("|".join(file), "\n[warning] has 0 duration and has been ignored")
                del self.audiopaths_and_text[i]
                continue
        i += 1
    # init STFT (not used when load_mel_from_disk is True)
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init using a dict-style hparams object."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    # Copy the scalar settings straight out of the hparams dict.
    for key in ('text_cleaners', 'max_wav_value', 'sampling_rate',
                'load_mel_from_disk'):
        setattr(self, key, hparams[key])
    self.stft = TacotronSTFT(
        hparams['filter_length'], hparams['hop_length'],
        hparams['win_length'], hparams['n_mel_channels'],
        hparams['sampling_rate'], hparams['mel_fmin'], hparams['mel_fmax'])
    random.seed(hparams['seed'])
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams, shuffle=True):
    """Load the (optionally length-sorted) filelist, build the STFT, and
    shuffle the entries when requested."""
    self.audiopaths_and_text = load_filepaths_and_text(
        audiopaths_and_text, hparams.sort_by_length)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    # Fixed seed so the shuffle order is reproducible across runs.
    random.seed(1234)
    if shuffle:
        random.shuffle(self.audiopaths_and_text)
def __init__(self, split, hparams): audiopaths_and_text = hp.tran_file_format.format( split) # train, cv & test print(audiopaths_and_text) self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) self.sampling_rate = hparams.sampling_rate self.load_mel_from_disk = hparams.load_mel_from_disk self.stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax) random.seed(1234) random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths, conf, valid=False):
    """Dataset init driven by a nested config dict.

    Args:
        audiopaths: filelist path handed to load_filepaths_and_text.
        conf: config dict with a 'data' section and 'load_mel_from_disk'.
        valid: True for the validation split.
    """
    data_conf = conf['data']
    self.is_norm = data_conf['is_norm']
    self.is_valid = valid
    batch = data_conf['batch']
    self.data_name = data_conf['data_name']

    def opt(key):
        # Falsy config entries (0, '', None) collapse to None, matching
        # the original `X if X else None` pattern; a missing key still
        # raises KeyError as before.
        value = data_conf[key]
        return value if value else None

    self.sampling_rate = opt('sampling_rate')
    self.n_fft = opt('n_fft')
    self.hop_length = opt('hop_length')
    self.win_length = opt('win_length')
    self.n_mel = opt('n_mel')
    self.audio_refdB = opt('audio_refdB')
    self.audio_maxdB = opt('audio_maxdB')
    self.reduction_factor = opt('reduction_factor')
    self.segment_length = opt('segment_length')
    self.text_cleaners = opt('text_cleaners')
    # Feature toggles derived from the requested batch contents.
    # (The original assigned self.use_mel twice with the same value.)
    self.use_audio = 'audio' in batch
    self.use_audio_seg = 'audio_seg' in batch
    self.use_mel_seg = 'mel_seg' in batch
    self.use_coarse_mel = (self.reduction_factor is not None
                           and self.reduction_factor > 1)
    self.use_mel = 'mel' in batch
    self.use_text = 'text' in batch
    self.use_attn_guide = 'attn_guide' in batch
    self.use_attn_mask = 'attn_mask' in batch
    self.use_tvmt = 'tvmt' in batch
    self.use_attn_mask2 = 'attn_mask2' in batch
    self.load_mel_from_disk = conf['load_mel_from_disk']
    self.audiopaths = load_filepaths_and_text(audiopaths)
    random.seed(1234)
    random.shuffle(self.audiopaths)
def __init__(self, audiopaths_and_text, hparams):
    """Load the filelist (optionally converting text to IPA), build the
    STFT, and shuffle with the configured seed."""
    entries = load_filepaths_and_text(audiopaths_and_text)
    self.audiopaths_and_text = entries
    if hparams.ipa_preprocessing:
        # In-place IPA conversion of the loaded entries.
        convert_to_ipa(entries)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(hparams.seed)
    random.shuffle(entries)
def __init__(self, audiopaths_and_text, hparams):
    """Load the filelist, build the STFT, shuffle, then load speaker
    embeddings."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
    # Precomputed embeddings keyed by name; report which were loaded.
    self.embedding_map = self.load_embedding()
    print('Load embedding for:', list(self.embedding_map.keys()))
def __init__(self, audiopaths_and_text, hparams, return_file_name=None):
    """Load the filelist, build the STFT, and shuffle with hparams.seed.

    Args:
        return_file_name: stored flag, kept on the instance as-is.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.return_file_name = return_file_name
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def prepare_mel_meta(hparams, audiopath_and_text):
    """Write a `melpath|text|speaker_id` metadata CSV for the validation set.

    Reads the filelist at *audiopath_and_text* and rewrites each audio path
    as a .npy mel path, emitting one pipe-separated row per entry to
    ./filelists/metadata_mel10_val.csv.

    Args:
        hparams: unused (kept for interface compatibility).
        audiopath_and_text: path to the source filelist.
    """
    audiopath_and_texts = load_filepaths_and_text(audiopath_and_text)
    out_csv = os.path.join('./filelists', 'metadata_mel10_val.csv')
    with open(out_csv, 'w', encoding='utf-8') as csvfile:
        # Create the writer once instead of once per row (original bug),
        # and drop the useless trailing `pass`.
        wr = csv.writer(csvfile, delimiter='|')
        for entry in audiopath_and_texts:
            audiopath, text, speaker_id = entry[0], entry[1], entry[2]
            # NOTE(review): fixed-index slicing assumes paths shaped like
            # '<11-char dir>/<name>.wav' — confirm against the filelists.
            out_dir = audiopath[:11]
            file_name = audiopath[12:-4]
            file_path = os.path.join(out_dir, file_name + '.npy')
            wr.writerow([file_path, text, speaker_id])
def __init__(self, audiopaths_and_text, hparams, shuffle=True):
    """Load the filelist, build the STFT, optionally shuffle, then eagerly
    precompute every (text, mel) pair."""
    self.audiopaths_and_text = load_filepaths_and_text(
        audiopaths_and_text, hparams.sort_by_length)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    if shuffle:
        random.shuffle(self.audiopaths_and_text)
    # Materialize all pairs up front (progress bar via tqdm).
    self.all_pairs = []
    for idx in tqdm.trange(self.__len__()):
        self.all_pairs.append(
            self.get_mel_text_pair(self.audiopaths_and_text[idx]))
def __init__(self, audiopaths_and_text, hparams):
    """Load the filelist, keep hparams, build the STFT, and shuffle."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.hparams = hparams
    # Short-time Fourier transform used to turn waveforms into
    # mel-spectrograms.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Load the filelist, report steps-per-epoch, build the STFT, shuffle."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    print(
        f'samples {len(self.audiopaths_and_text)} will go over {len(self.audiopaths_and_text)/hparams.batch_size} step on batch size {hparams.batch_size}'
    )
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    # STFT for on-the-fly mel computation.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_texts, config, shuffle=True):
    """Load filelist entries, optionally shuffle, and restore a frozen
    ConvModule checkpoint on CPU."""
    self.audiopaths_and_texts = load_filepaths_and_text(
        audiopaths_and_texts, config['sort_by_length'])
    self.text_cleaners = config['text_cleaners']
    random.seed(1234)
    if shuffle:
        random.shuffle(self.audiopaths_and_texts)
    # Ground-truth module: weights mapped to CPU and set to eval mode.
    self.gt_module = ConvModule(config)
    state = torch.load('conv_module.pt',
                       map_location=lambda storage, loc: storage)
    self.gt_module.load_state_dict(state)
    _ = self.gt_module.cpu().eval()
def __init__(self, audiopaths_and_text, hparams):
    """Load the filelist, an optional CMU dictionary, the STFT, and shuffle."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.add_noise = hparams.add_noise
    self.add_space = hparams.add_space
    # CMU pronouncing dictionary is optional (attribute only set if present).
    if getattr(hparams, "cmudict_path", None) is not None:
        self.cmudict = cmudict.CMUDict(hparams.cmudict_path)
    self.stft = commons.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, conf): self.conf = conf # load Tacotron2 self.ttm_current = self.conf['TTM']['default_model'] assert self.ttm_current in self.conf['TTM']['models'].keys( ), "Tacotron default model not found in config models" tacotron_path = self.conf['TTM']['models'][self.ttm_current][ 'modelpath'] # get first available Tacotron self.tacotron, self.ttm_hparams, self.ttm_sp_name_lookup, self.ttm_sp_id_lookup = self.load_tacotron2( tacotron_path) # load WaveGlow self.MTW_current = self.conf['MTW']['default_model'] assert self.MTW_current in self.conf['MTW']['models'].keys( ), "WaveGlow default model not found in config models" vocoder_path = self.conf['MTW']['models'][self.MTW_current][ 'modelpath'] # get first available waveglow vocoder_confpath = self.conf['MTW']['models'][ self.MTW_current]['configpath'] self.waveglow, self.MTW_denoiser, self.MTW_train_sigma, self.MTW_sp_id_lookup = self.load_waveglow( vocoder_path, vocoder_confpath) # load torchMoji if self.ttm_hparams.torchMoji_linear: # if Tacotron includes a torchMoji layer self.tm_sentence_tokenizer, self.tm_torchmoji = self.load_torchmoji( ) # override since my checkpoints are still missing speaker names if self.conf['TTM']['use_speaker_ids_file_override']: speaker_ids_fpath = self.conf['TTM']['speaker_ids_file'] self.ttm_sp_name_lookup = { name: self.ttm_sp_id_lookup[int(ext_id)] for _, name, ext_id in load_filepaths_and_text( speaker_ids_fpath) } # load arpabet/pronounciation dictionary dict_path = self.conf['dict_path'] self.load_arpabet_dict(dict_path) # download nltk package for splitting text into sentences nltk.download('punkt') print("T2S Initialized and Ready!")