def __init__(self, audiopaths_and_text, hparams, speaker_ids=None):
    """Load the (audiopath, text) filelist and set up mel/F0/text front-ends.

    Args:
        audiopaths_and_text: path to the filelist of audio/text entries.
        hparams: hyper-parameter namespace with audio and text settings.
        speaker_ids: optional precomputed speaker lookup table; built from
            the filelist when None.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    # FIX: original assigned self.sampling_rate twice with the same value.
    self.sampling_rate = hparams.sampling_rate
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    # F0 (pitch) extraction settings.
    self.f0_min = hparams.f0_min
    self.f0_max = hparams.f0_max
    self.harm_thresh = hparams.harm_thresh
    # Probability of converting a word to ARPAbet phonemes.
    self.p_arpabet = hparams.p_arpabet
    self.cmudict = None
    if hparams.cmudict_path is not None:
        self.cmudict = cmudict.CMUDict(hparams.cmudict_path)
    self.speaker_ids = speaker_ids
    if speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)
    # Fixed seed so the shuffled order is reproducible across runs.
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams, speaker_ids=None, mode='train'):
    """Build the dataset in either train or eval mode.

    Train mode uses the train-specific filelist loader; ``self.mode`` is
    True for train, False otherwise.
    """
    if mode == 'train':
        self.audiopaths_and_text = load_filepaths_and_text_train(
            audiopaths_and_text, split='\t')
    else:
        self.audiopaths_and_text = load_filepaths_and_text(
            audiopaths_and_text, split='\t')
    self.mode = (mode == 'train')

    # Text / audio normalization settings.
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length

    # Mel-spectrogram extractor.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    # F0 extraction settings.
    self.f0_min = hparams.f0_min
    self.f0_max = hparams.f0_max
    self.harm_thresh = hparams.harm_thresh

    # Optional ARPAbet conversion via CMUDict.
    self.p_arpabet = hparams.p_arpabet
    self.cmudict = None
    if hparams.cmudict_path is not None:
        self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

    self.speaker_ids = speaker_ids
    if self.speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Load filelist, build STFT, shuffle, and optionally load CMUDict.

    BUG FIX: the CMUDict branch previously referenced a bare name
    ``cmudict_path`` (NameError at runtime); it now consistently uses
    ``hparams.cmudict_path``.
    """
    self.hparams = hparams
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
    if hparams.use_cmudict:
        if not os.path.isfile(hparams.cmudict_path):
            raise Exception(
                'If use_cmudict=True, you must download ' +
                'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s'
                % hparams.cmudict_path)
        # Ambiguous entries are only kept when every word is looked up
        # (p_cmudict == 1.0), matching the original branch semantics.
        keep_ambiguous = hparams.p_cmudict == 1.0
        self._cmudict = cmudict.CMUDict(str(hparams.cmudict_path),
                                        keep_ambiguous=keep_ambiguous)
        print('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
        self._cmudict = None
def __init__(self, dataset, experiment, hparams, load_durations):
    """Dataset init: loads filelist, sets up mel extraction (on-the-fly or
    from disk), a text analyzer, and pre-fills an in-memory phoneme cache.

    Args:
        dataset: dataset spec passed through to load_filepaths_and_text.
        experiment: experiment object providing output paths.
        hparams: hyper-parameter namespace.
        load_durations: whether phone durations should be loaded alongside mels.
    """
    self.experiment = experiment
    self.audiopaths_and_text = load_filepaths_and_text(dataset, experiment, hparams)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.hparams = hparams
    self.load_durations = load_durations
    # Durations live next to the acoustic features.
    self.durations_dir = os.path.join(experiment.paths["acoustic_features"], "dur")
    if hparams.preprocessing_type == "vocalid":
        # vocalid preprocessing is never on the fly
        self.load_mel_from_disk = True
    else:
        # On-the-fly mel extraction.
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
    #TODO: will go to preprocessing
    self.textanalyzer = TextAnalyzer(use_phones=hparams.use_phonemes,
                                     g2p_backend=hparams.g2p_backend,
                                     language=hparams.language)
    self._phone_cache_dir = os.path.join(experiment.paths["acoustic_features"], "utt")
    self._hparams = hparams
    print(f"Creating new in-memory phone cache")
    self._phoneme_cache = {}
    os.makedirs(self._phone_cache_dir, exist_ok=True)
    # fill phoneme cache first time before multiprocessing clones this data
    # (dummy_mel=True skips the expensive mel computation for the warm-up pass)
    for paths in self.audiopaths_and_text:
        self.get_mel_text_pair(paths, dummy_mel=True)
    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Dataset with two mel front-ends: the configured resolution
    (``stft_80``) and a fixed 512-bin variant (``stft_512``)."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    # The two extractors share everything except the mel-channel count.
    window_args = (hparams.filter_length, hparams.hop_length, hparams.win_length)
    band_args = (hparams.sampling_rate, hparams.mel_fmin, hparams.mel_fmax)
    self.stft_80 = layers.TacotronSTFT(*window_args, hparams.n_mel_channels, *band_args)
    self.stft_512 = layers.TacotronSTFT(*window_args, 512, *band_args)

    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, hparams, train_set=True):
    """VoxCeleb1 dataset: gathers wav/text pairs from a hard-coded root and
    splits them 95/5 into train/validation after a seeded shuffle."""
    voxceleb1_root = '/hdd/klab/cmtts/data/VoxCeleb1'
    self.audiopaths_and_text = self.get_wav_txt_pairs(voxceleb1_root)
    print('load {} pairs from voxceleb1'.format(
        len(self.audiopaths_and_text)))

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    # Shuffle with a fixed seed so the train/val split is stable.
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
    split_index = int(0.95 * len(self.audiopaths_and_text))
    if train_set:
        self.audiopaths_and_text = self.audiopaths_and_text[:split_index]
        print('train set using {} pairs'.format(
            len(self.audiopaths_and_text)))
    else:
        self.audiopaths_and_text = self.audiopaths_and_text[split_index:]
        print('val set using {} pairs'.format(
            len(self.audiopaths_and_text)))
def multiprocess_gen_mels(audiopaths_internal):
    # Worker entry point for multiprocessing: computes a mel spectrogram for
    # each path and saves it as .npy next to the source wav.
    # NOTE(review): reads `hparams` from module scope — assumes it is defined
    # in the worker process; confirm how the pool is spawned.
    import layers
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)
    return_string = ""
    total = len(audiopaths_internal)
    for index, path in enumerate(audiopaths_internal):
        # Dead guard (index is never < 0); presumably a leftover resume hook.
        if index < 0:
            continue
        #try:
        # Filelist entries are .npy targets; map back to the source wav.
        file = path.replace(".npy", ".wav")
        audio, sampling_rate = load_wav_to_torch(file)
        if sampling_rate != stft.sampling_rate:
            raise ValueError("{} {} SR doesn't match target {} SR".format(
                file, sampling_rate, stft.sampling_rate))
        # NOTE(review): audio is NOT divided by max_wav_value here, unlike the
        # other mel-generation paths in this file — confirm this is intended.
        melspec = stft.mel_spectrogram(
            audio.unsqueeze(0)).squeeze(0).cpu().numpy()
        np.save(file.replace('.wav', ''), melspec)
        # Progress: print remaining count every 1000 files.
        if not index % 1000:
            print(total - index)
        #except Exception as ex:
        #    return_string+=(path+" failed to process\nException: "+str(ex)+"\n")
    # With the except block commented out, errors abort the worker instead of
    # being accumulated, so this is always the success message.
    if not return_string:
        return_string = "No Errors on this process."
    return return_string
def __init__(self, audiopaths_and_text, hparams, max_len=40):
    """Dataset init with hard-coded per-channel mel normalization statistics.

    Args:
        audiopaths_and_text: path to the filelist of audio/text entries.
        hparams: hyper-parameter namespace.
        max_len: maximum length bound used elsewhere in the dataset
            (stored as ``self._max_len``).
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
    self._max_len = max_len
    self._epoch = 0
    # self._normalize()
    # Precomputed per-mel-channel mean/std (80 channels, one value each),
    # presumably computed once offline via the commented _normalize() above
    # — TODO confirm they match the current dataset.
    self._mean = torch.tensor([[-7.0222], [-6.1906], [-5.1736], [-4.2412], [-3.7652], [-3.6533], [-3.6642], [-3.7249], [-3.7714], [-3.7709], [-3.6496], [-3.5707], [-3.5742], [-3.6369], [-3.7370], [-3.9888], [-4.1180], [-4.1938], [-4.3030], [-4.4620], [-4.6258], [-4.7973], [-5.0267], [-5.0906], [-5.1643], [-5.1518], [-5.2571], [-5.2868], [-5.3991], [-5.4988], [-5.5740], [-5.7033], [-5.7849], [-5.8197], [-5.9224], [-5.8171], [-5.7680], [-5.6486], [-5.5940], [-5.5730], [-5.5224], [-5.4793], [-5.5243], [-5.6329], [-5.7697], [-5.8886], [-5.9992], [-6.0405], [-6.0295], [-5.9937], [-5.9651], [-5.8888], [-5.8137], [-5.7405], [-5.7429], [-5.8212], [-5.8967], [-5.9552], [-5.9658], [-5.9283], [-5.9219], [-5.9360], [-5.9943], [-6.0838], [-6.1482], [-6.2169], [-6.2732], [-6.3252], [-6.4438], [-6.6830], [-6.9697], [-7.1962], [-7.3519], [-7.3759], [-7.3302], [-7.1762], [-6.9551], [-6.7458], [-6.6292], [-6.5967]]).float()
    self._std = torch.tensor([[0.9304], [0.7729], [1.0068], [1.5478], [1.8270], [1.7940], [1.6933], [1.7043], [1.8344], [1.8844], [1.8506], [1.7672], [1.7807], [1.7977], [1.7882], [1.7599], [1.7680], [1.7909], [1.7831], [1.7588], [1.7445], [1.7822], [1.7940], [1.7761], [1.7961], [1.7989], [1.7818], [1.7519], [1.7466], [1.7335], [1.7068], [1.7336], [1.7537], [1.7538], [1.7427], [1.7253], [1.7055], [1.7193], [1.7359], [1.7460], [1.7527], [1.7514], [1.7380], [1.7031], [1.6757], [1.6612], [1.6603], [1.6675], [1.7022], [1.7513], [1.7748], [1.7932], [1.7957], [1.8250], [1.8481], [1.8137], [1.7564], [1.7130], [1.7024], [1.7243], [1.7348], [1.7485], [1.7810], [1.8169], [1.8318], [1.8312], [1.8427], [1.8756], [1.9143], [1.9503], [2.0072], [2.0761], [2.1519], [2.1848], [2.1574], [2.1386], [2.1442], [2.1601], [2.1547], [2.1208]]).float()
def __init__(self, audiopaths_and_text, polyphone_dict_file, mask_dict_file, hparams):
    """Chinese polyphone-aware dataset init.

    Loads the class-index map and merged CEDict from hparams paths, and a
    BERT tokenizer for text encoding.

    Args:
        audiopaths_and_text: path to the filelist of audio/text entries.
        polyphone_dict_file: currently unused (kept for interface
            compatibility; previously loaded by now-removed code).
        mask_dict_file: currently unused (same as above).
        hparams: hyper-parameter namespace.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    # Polyphone class -> index mapping.
    with codecs.open(hparams.class2idx, 'r', 'utf-8') as usernames:
        self.class2idx = json.load(usernames)
    # FIX: removed the unused local `num_classes` (value was never read).
    print("num classes: {}".format(len(self.class2idx)))
    # Merged CEDict for pronunciation lookup.
    with codecs.open(hparams.merge_cedict, 'r', 'utf-8') as usernames:
        self.merge_cedict = json.load(usernames)
    self.tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def create_mels():
    """Precompute mel spectrograms for every wav under the dataset root,
    saving each as a .npy file next to its source wav.

    Reads ``hparams`` from module scope. Aborts on the first file that
    fails to process.

    BUG FIX: the except handler called ``tqdm.write(i, " failed...", ex,
    "\\n")`` — ``tqdm.write`` takes a single string, so this raised a
    TypeError instead of reporting the failure. Also replaced ``assert 0``
    (stripped under ``python -O``) with ``raise`` to preserve the traceback.
    """
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)

    def save_mel(file):
        # Load, validate sample rate, normalize, and save the mel as .npy.
        audio, sampling_rate = load_wav_to_torch(file)
        if sampling_rate != stft.sampling_rate:
            raise ValueError("{} {} SR doesn't match target {} SR".format(
                file, sampling_rate, stft.sampling_rate))
        audio_norm = audio / hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0).cpu().numpy()
        np.save(file.replace('.wav', ''), melspec)

    import glob
    wavs = glob.glob('/media/cookie/Samsung 860 QVO/ClipperDatasetV2/**/*.wav',
                     recursive=True)
    print(str(len(wavs)) + " files being converted to mels")
    for index, i in tqdm(enumerate(wavs), smoothing=0, total=len(wavs)):
        # Dead guard kept from original (resume hook placeholder).
        if index < 0:
            continue
        try:
            save_mel(i)
        except Exception as ex:
            tqdm.write("{} failed to process\n{}\n".format(i, ex))
            raise
    
def _process_utterance(out_dir, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel feature to disk and returns a tuple to write
    to the mels.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (mel_path, n_frames, text) tuple to write to mels.txt

    Raises:
        ValueError: if the wav sample rate does not match hparams, or if
            hparams.mel_data_type is not 'numpy' or 'torch'.
    '''
    fid = os.path.splitext(os.path.basename(wav_path))[0]
    # Fast path: reuse a previously generated mel if it exists on disk.
    if hparams.mel_data_type == 'numpy':
        mel_path = os.path.join(out_dir, '{}.npy'.format(fid))
        if os.path.isfile(mel_path):
            melspec = torch.from_numpy(np.load(mel_path))
            return (mel_path, melspec.shape[1], text)
    elif hparams.mel_data_type == 'torch':
        mel_path = os.path.join(out_dir, '{}.pt'.format(fid))
        if os.path.isfile(mel_path):
            #melspec = torch.load(mel_path)
            # pkl is faster than torch here
            with open(mel_path, 'rb') as f:
                melspec = pkl.load(f)
            return (mel_path, melspec.shape[1], text)
    else:
        # BUG FIX: previously an unknown mel_data_type fell through and
        # crashed later with NameError on the undefined `mel_path`.
        raise ValueError(
            "Unsupported hparams.mel_data_type: {}".format(hparams.mel_data_type))

    # Slow path: compute the mel from the wav.
    audio, sampling_rate = load_wav_to_torch(wav_path)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{}: {} SR doesn't match target {} SR".format(
            wav_path, sampling_rate, hparams.sampling_rate))
    audio_norm = audio / hparams.max_wav_value  # dim: #samples
    audio_norm = audio_norm.unsqueeze(0)  # dim: 1 X #samples
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    # over-riding win_length/hop_length if win_length_ms/hop_length_ms are
    # specified. NOTE(review): this mutates the shared hparams object.
    if hasattr(hparams, 'win_length_ms'):
        hparams.win_length = int(hparams.win_length_ms / 1000 * hparams.sampling_rate)
    if hasattr(hparams, 'hop_length_ms'):
        hparams.hop_length = int(hparams.hop_length_ms / 1000 * hparams.sampling_rate)
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    if hparams.mel_data_type == 'numpy':
        np.save(mel_path, melspec.numpy(), allow_pickle=False)
    elif hparams.mel_data_type == 'torch':
        #torch.save(melspec, mel_path)
        # pkl is faster than torch here
        with open(mel_path, 'wb') as f:
            pkl.dump(melspec, f, protocol=pkl.HIGHEST_PROTOCOL)

    # Return a tuple describing this training example:
    return (mel_path, melspec.shape[1], text)
def _process_utterance(in_dir, out_dir, wav_path, txt_path):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel feature to disk and returns a tuple to write
    to the mels.txt file.

    Args:
        in_dir: Root of the input wav tree (replaced by out_dir in mel paths)
        out_dir: The directory to write the spectrograms into
        wav_path: Path to the audio file containing the speech input
        txt_path: Path to the text file containing the text of speech input

    Returns:
        A (mel_path, n_frames, text) tuple to write to mels.txt

    Raises:
        ValueError: if the wav sample rate does not match hparams, or if
            hparams.mel_data_type is not 'numpy' or 'torch'.
    '''
    # get text (FIX: close the file handle instead of leaking it)
    with open(txt_path, 'r') as txt_file:
        text = txt_file.readline().rstrip()

    # Fast path: reuse a previously generated mel if it exists on disk.
    if hparams.mel_data_type == 'numpy':
        mel_path = wav_path.replace('.wav', '.npy').replace(in_dir, out_dir)
        if os.path.isfile(mel_path):
            melspec = torch.from_numpy(np.load(mel_path))
            return (mel_path, melspec.shape[1], text)
    elif hparams.mel_data_type == 'torch':
        mel_path = wav_path.replace('.wav', '.pt').replace(in_dir, out_dir)
        if os.path.isfile(mel_path):
            #melspec = torch.load(mel_path)
            # pkl is faster than torch here
            with open(mel_path, 'rb') as f:
                melspec = pkl.load(f)
            return (mel_path, melspec.shape[1], text)
    else:
        # BUG FIX: previously an unknown mel_data_type fell through and
        # crashed later with NameError on the undefined `mel_path`.
        raise ValueError(
            "Unsupported hparams.mel_data_type: {}".format(hparams.mel_data_type))

    # Slow path: compute the mel from the wav.
    audio, sampling_rate = load_wav_to_torch(wav_path)
    if sampling_rate != hparams.sampling_rate:
        raise ValueError("{}: {} SR doesn't match target {} SR".format(
            wav_path, sampling_rate, hparams.sampling_rate))
    audio_norm = audio / hparams.max_wav_value  # dim: #samples
    audio_norm = audio_norm.unsqueeze(0)  # dim: 1 X #samples
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    if hparams.mel_data_type == 'numpy':
        np.save(mel_path, melspec.numpy(), allow_pickle=False)
    elif hparams.mel_data_type == 'torch':
        #torch.save(melspec, mel_path)
        # pkl is faster than torch here
        with open(mel_path, 'wb') as f:
            pkl.dump(melspec, f, protocol=pkl.HIGHEST_PROTOCOL)

    # Return a tuple describing this training example:
    return (mel_path, melspec.shape[1], text)
def __init__(self, audiopaths_and_text, hparams, check_files=True, TBPTT=True, shuffle=False, speaker_ids=None, audio_offset=0, verbose=False):
    """Truncated-BPTT dataset init: loads/validates the filelist, duplicates
    "SlicedDialogue" entries 3x as a weighting scheme, shuffles, and
    precomputes per-file mel lengths for TBPTT batching.

    NOTE(review): the `TBPTT` parameter is never read here — the
    hparams.use_TBPTT flag is used instead; confirm which is authoritative.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.truncated_length = hparams.truncated_length
    self.batch_size = hparams.batch_size
    self.speaker_ids = speaker_ids
    self.audio_offset = audio_offset
    self.shuffle = shuffle
    if speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(self.audiopaths_and_text)
    self.load_torchmoji = hparams.torchMoji_training and hparams.torchMoji_linear
    # ---------- CHECK FILES --------------
    self.start_token = hparams.start_token
    self.stop_token = hparams.stop_token
    if check_files:
        self.checkdataset(verbose)
    # -------------- CHECK FILES --------------
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    self.sampling_rate = hparams.sampling_rate
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length
    # Apply weighting to MLP Datasets: entries under "SlicedDialogue" are
    # appended 3 extra times (4x total weight).
    duplicated_audiopaths = [x for x in self.audiopaths_and_text if "SlicedDialogue" in x[0]]
    for i in range(3):
        self.audiopaths_and_text.extend(duplicated_audiopaths)
    # SHUFFLE audiopaths
    random.seed(hparams.seed)
    self.random_seed = hparams.seed
    random.shuffle(self.audiopaths_and_text)
    # Validation runs (speaker_ids supplied) use the validation batch size.
    self.batch_size = hparams.batch_size if speaker_ids is None else hparams.val_batch_size
    n_gpus = hparams.n_gpus
    self.rank = hparams.rank
    self.total_batch_size = self.batch_size * n_gpus  # number of audio files being processed together
    self.truncated_length = hparams.truncated_length  # frames
    # -------------- PREDICT LENGTH (TBPTT) --------------
    if hparams.use_TBPTT:
        # get the length of every file (the long way): computes each mel once.
        self.audio_lengths = torch.tensor([self.get_mel(x[0]).shape[1] for x in self.audiopaths_and_text])
    else:
        # use dummy lengths: every file is treated as one truncated segment.
        self.audio_lengths = torch.tensor([self.truncated_length-1 for x in self.audiopaths_and_text])
    self.update_dataloader_indexes()
def get_mel(hparams, filename):
    """Load a wav file and return its mel spectrogram and frame length.

    Args:
        hparams: hyper-parameter namespace (passed whole to TacotronSTFT —
            note this differs from the positional-argument call sites
            elsewhere in this file).
        filename: path to the wav file.

    Returns:
        (melspec, length): the (n_mels, n_frames) tensor and its frame count.

    NOTE(review): unlike other loaders here, this does not verify the wav
    sample rate against hparams.sampling_rate — confirm callers guarantee it.
    """
    audio, sampling_rate = load_wav_to_torch(filename)
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = layers.TacotronSTFT(hparams).mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    # FIX: removed dead `length = torch.LongTensor(1)` — it was immediately
    # overwritten by the int below and never used as a tensor.
    length = melspec.size(1)
    return melspec, length
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init that validates the filelist in place: removes entries
    with the wrong file type, missing files, empty text, or zero-length mels,
    printing a warning for each removal.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    # Perform Checks on Dataset.
    # The list is mutated while scanning: i_offset compensates for removed
    # entries so `i` keeps tracking the current position in the shrinking list.
    i = 0
    i_offset = 0
    for i_ in range(len(self.audiopaths_and_text)):
        i = i_ + i_offset
        if i == len(self.audiopaths_and_text):
            break
        file = self.audiopaths_and_text[i]
        # File-type check: .wav entries are invalid when mels are preloaded
        # from disk, and vice versa.
        if self.load_mel_from_disk and '.wav' in file[0]:
            print(".wav file", file[0], "\n[warning] in filelist while expecting '.npy' . Being Ignored.")
            self.audiopaths_and_text.remove(file)
            i_offset -= 1
            continue
        elif not self.load_mel_from_disk and '.npy' in file[0]:
            print(".npy file", file[0], "\n[warning] in filelist while expecting '.wav' . Being Ignored.")
            self.audiopaths_and_text.remove(file)
            i_offset -= 1
            continue
        # Existence check.
        if (not os.path.exists(file[0])):
            print("|".join(file), "\n[warning] does not exist and has been ignored")
            self.audiopaths_and_text.remove(file)
            i_offset -= 1
            continue
        # Text sanity checks: empty text is removed; short text and missing
        # end punctuation are only reported.
        if not len(file[1]):
            print("|".join(file), "\n[warning] has no text and has been ignored.")
            self.audiopaths_and_text.remove(file)
            i_offset -= 1
            continue
        if len(file[1]) < 3:
            print("|".join(file), "\n[info] has no/very little text.")
        if not ((file[1].strip())[-1] in r"!?,.;:"):
            print("|".join(file), "\n[info] has no ending punctuation.")
        # Zero-duration mel check (only possible when mels are on disk).
        if self.load_mel_from_disk:
            melspec = torch.from_numpy(np.load(file[0], allow_pickle=True))
            mel_length = melspec.shape[1]
            if mel_length == 0:
                print("|".join(file), "\n[warning] has 0 duration and has been ignored")
                self.audiopaths_and_text.remove(file)
                i_offset -= 1
                continue
    # init STFT (not used for load_mel_from_disk)
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init that additionally loads a precomputed embedding map
    (via ``self.load_embedding``) and reports its keys."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)

    # Audio/text configuration.
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    # Mel-spectrogram extractor.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    # Deterministic shuffle of the filelist.
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)

    self.embedding_map = self.load_embedding()
    print('Load embedding for:', list(self.embedding_map.keys()))
def __init__(self, audiopaths_and_text, hparams, shuffle=True):
    """Dataset init; the filelist may be kept in sorted-by-length order by
    passing ``shuffle=False``."""
    self.audiopaths_and_text = load_filepaths_and_text(
        audiopaths_and_text, hparams.sort_by_length)

    # Audio/text configuration.
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate

    # Mel-spectrogram extractor.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    # Seed unconditionally so downstream randomness is reproducible even
    # when the shuffle itself is skipped.
    random.seed(1234)
    if shuffle:
        random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init with optional in-place IPA conversion of the filelist."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    if hparams.ipa_preprocessing:
        # Rewrites the text entries to IPA in place.
        convert_to_ipa(self.audiopaths_and_text)

    # Audio/text configuration.
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    # Mel-spectrogram extractor.
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Basic dataset init: filelist, audio/text settings, and an STFT
    front-end for converting waveforms to mel spectrograms."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.hparams = hparams

    # Short-time Fourier transform used to turn waveforms into
    # mel-spectrograms (translated from the original Chinese comment).
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def get_mel(hparams, path):
    """Load a wav, validate its sample rate, and return its mel spectrogram.

    Args:
        hparams: hyper-parameter namespace with STFT and audio settings.
        path: path to the wav file.

    Returns:
        A (n_mels, n_frames) mel-spectrogram tensor.

    Raises:
        ValueError: if the file's sample rate does not match the STFT's.

    BUG FIX: the original error format string had three ``{}`` placeholders
    but only two arguments, so a sample-rate mismatch raised IndexError
    instead of the intended ValueError; the file path is now included.
    """
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)
    audio, sampling_rate = load_wav_to_torch(path)
    if sampling_rate != stft.sampling_rate:
        raise ValueError("{} {} SR doesn't match target {} SR".format(
            path, sampling_rate, stft.sampling_rate))
    audio_norm = audio / hparams.max_wav_value
    audio_norm = audio_norm.unsqueeze(0)
    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)
    return melspec
def __init__(self, audiopaths_and_text, hparams, shuffle=True):
    """Eager dataset init: precomputes every (mel, text) pair up front so
    __getitem__ can serve from memory."""
    self.audiopaths_and_text = load_filepaths_and_text(
        audiopaths_and_text, hparams.sort_by_length)

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(1234)
    if shuffle:
        random.shuffle(self.audiopaths_and_text)

    # Materialize every pair now (with a progress bar); memory-hungry but
    # makes training iterations fast.
    self.all_pairs = []
    for idx in tqdm.trange(self.__len__()):
        self.all_pairs.append(
            self.get_mel_text_pair(self.audiopaths_and_text[idx]))
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init that also reports how many steps one epoch will take
    at the configured batch size."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    print(
        f'samples {len(self.audiopaths_and_text)} will go over {len(self.audiopaths_and_text)/hparams.batch_size} step on batch size {hparams.batch_size}'
    )

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, lstfile, hparams):
    """Dataset init driven by a .lst file of file-basenames with a parallel
    text dictionary (loaded via load_fbs_and_fb_text_dict)."""
    self.fbs, self.fb_text_dict = load_fbs_and_fb_text_dict(
        lstfile, hparams.lab_path)

    # Audio configuration and on-disk feature locations.
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.audio_path = hparams.audio_path
    self.mel_path = hparams.mel_path
    self.MelStd_mel = hparams.MelStd_mel

    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(1234)
    random.shuffle(self.fbs)
def __init__(self, audiopaths_and_text, hparams, warp_set="og"):
    """Dataset init for a multi-speaker setup with a selectable warp set.

    A number of mel time/frequency augmentation flags existed here once and
    are intentionally disabled.
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
    self.n_speakers = hparams.speaker_num
    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    # NOTE(review): "adjustmet" looks like a typo for "adjustment", but the
    # attribute name must match the hparams field, so it is kept as-is.
    self.value_adjustmet = hparams.value_adjustmet
    # This variant's TacotronSTFT takes the whole hparams object.
    self.stft = layers.TacotronSTFT(hparams)
    self.warp_set = warp_set
    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init that reads a delimited dataset file and fits a speaker
    encoder over its speaker column."""
    self.audiopaths_and_text = load_dataset(
        audiopaths_and_text, separator=hparams.data_separator)

    # Column names in the dataset file. These must be set before
    # fit_speaker_encoder() is called, since it reads them.
    self.speaker_field = hparams.speaker_field
    self.audio_field = hparams.audio_field
    self.text_field = hparams.text_field
    self.speaker_encoder = self.fit_speaker_encoder()

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams, speaker_ids=None, output_directory=None):
    """Multi-speaker dataset init; optionally dumps the speaker lookup
    table to ``output_directory/speaker_ids.txt`` for inspection."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.filter_length = hparams.filter_length
    self.hop_length = hparams.hop_length

    # F0 and phoneme settings.
    self.f0_min = hparams.f0_min
    self.f0_max = hparams.f0_max
    self.harm_thresh = hparams.harm_thresh
    self.p_arpabet = hparams.p_arpabet

    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    self.cmudict = None
    if hparams.cmudict_path is not None:
        self.cmudict = cmudict.CMUDict(hparams.cmudict_path)

    self.speaker_ids = speaker_ids
    if speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)

    # Persist the speaker lookup table when an output directory was given.
    if not (output_directory is None) and not (self.speaker_ids is None):
        speaker_id_path = os.path.join(output_directory, 'speaker_ids.txt')
        with open(speaker_id_path, 'w', encoding='utf-8') as f:
            for key, value in self.speaker_ids.items():
                f.write('{}: {}\n'.format(key, value))

    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams, speaker_ids=None):
    """Multi-speaker dataset init; builds the speaker lookup table from the
    filelist when one is not supplied."""
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    self.speaker_ids = speaker_ids
    if speaker_ids is None:
        self.speaker_ids = self.create_speaker_lookup_table(
            self.audiopaths_and_text)

    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams, shuffle=True):
    """Dataset init that also loads a frozen pretrained speaker encoder
    from ``hparams.se_checkpoint``."""
    self.audiopaths_and_text = load_filepaths_and_text(
        audiopaths_and_text, hparams.sort_by_length)

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    # Pretrained speaker encoder, loaded and switched to eval mode.
    self.speaker_encoder = layers.SpeakerEncoder(hparams.num_mel, )
    self.speaker_encoder.load_model(hparams.se_checkpoint)
    self.speaker_encoder.eval()

    # NOTE(review): "hparms" looks like a typo for "hparams", but other
    # methods may read this attribute name, so it is preserved.
    self.hparms = hparams

    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(1234)
    if shuffle:
        random.shuffle(self.audiopaths_and_text)
def __init__(self, audiopaths_and_text, hparams):
    """Dataset init supporting millisecond-based window/hop overrides.

    When ``win_length_ms``/``hop_length_ms`` are present on hparams they
    take precedence and are converted to samples (note: this mutates the
    shared hparams object, matching the original behavior).
    """
    self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk
    self.mel_data_type = hparams.mel_data_type

    if hasattr(hparams, 'win_length_ms'):
        hparams.win_length = int(hparams.win_length_ms / 1000 * hparams.sampling_rate)
    if hasattr(hparams, 'hop_length_ms'):
        hparams.hop_length = int(hparams.hop_length_ms / 1000 * hparams.sampling_rate)

    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(hparams.seed)
    random.shuffle(self.audiopaths_and_text)
def __init__(self, hparams, is_train=True, combine_ratio=0.5):
    """Combine two filelists so dataset 1 makes up ``combine_ratio`` of the
    result; dataset 2 is subsampled (after shuffling) to fill the rest.

    Filelist format: ``file_path|text`` per line.
    """
    if is_train:
        file_name_1 = hparams.training_files_1
        file_name_2 = hparams.training_files_2
    else:
        file_name_1 = hparams.validate_files_1
        file_name_2 = hparams.validate_files_2
    data_1 = load_filepaths_and_text(file_name_1)
    data_2 = load_filepaths_and_text(file_name_2)

    # Shuffle dataset 2 before truncating it to the size implied by the
    # requested mixing ratio.
    shuffle(data_2)
    len_data_1 = len(data_1)
    len_data_2 = min(
        len(data_2),
        int((1 - combine_ratio) * len_data_1 / combine_ratio))
    self.audiopaths_and_text = data_1
    self.audiopaths_and_text.extend(data_2[:len_data_2])

    self.text_cleaners = hparams.text_cleaners
    self.max_wav_value = hparams.max_wav_value
    self.sampling_rate = hparams.sampling_rate
    self.load_mel_from_disk = hparams.load_mel_from_disk

    self.stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    random.seed(1234)
    random.shuffle(self.audiopaths_and_text)
    # print(*self.audiopaths_and_text[:10], sep="\n")
    self.hparams = hparams